RED-8666 - implement reading order resolver
- added double-column detection logic: left/right zones that do not intersect in Y are removed from their columns and added to the middle zones for manual sorting
This commit is contained in:
parent
fc43bccf60
commit
4644803fa8
@ -256,15 +256,14 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = tableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||
case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
||||
case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||
};
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
|
||||
@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOder) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
||||
|
||||
// Underlines and strikethroughs also show up as rulings, but we don't want them to split blocks, so we use cells instead.
|
||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||
@ -52,7 +52,7 @@ public class DocstrumBlockificationService {
|
||||
});
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOder);
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions);
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@ -29,7 +29,7 @@ public class DocstrumSegmentationService {
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOder) {
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||
@ -37,7 +37,7 @@ public class DocstrumSegmentationService {
|
||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
|
||||
return readingOrderService.resolve(zones, xyOder);
|
||||
return readingOrderService.resolve(zones);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -42,7 +42,16 @@ public abstract class BoundingBox {
|
||||
|
||||
public boolean contains(Rectangle2D contained, double tolerance) {
|
||||
|
||||
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||
return bBox.getX() <= contained.getX() + tolerance
|
||||
&& bBox.getY() <= contained.getY() + tolerance
|
||||
&& bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance
|
||||
&& bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other) {
|
||||
|
||||
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,8 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.s
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -15,22 +17,42 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.ut
|
||||
public class ReadingOrderService {
|
||||
|
||||
private static final double THRESHOLD = 5;
|
||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones, boolean xyOrder) {
|
||||
public List<Zone> resolve(List<Zone> zones) {
|
||||
|
||||
if (zones.isEmpty() || zones.size() == 1) {
|
||||
return zones;
|
||||
}
|
||||
|
||||
if (xyOrder) {
|
||||
// QuickSort.sort(zones, new ZoneComparator());
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, 0)));
|
||||
return zones;
|
||||
Map<Long, Integer> histogram = new HashMap<>();
|
||||
for (Zone zone : zones) {
|
||||
long minY = Math.round(zone.getBBox().getMinY());
|
||||
long maxY = Math.round(zone.getBBox().getMaxY());
|
||||
for (long i = minY; i <= maxY; i++) {
|
||||
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
return resolveMultiColumnReadingOder(zones);
|
||||
if (histogram.values()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue).average()
|
||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
} else {
|
||||
|
||||
return resolveMultiColumnReadingOder(zones);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
@ -67,13 +89,55 @@ public class ReadingOrderService {
|
||||
}
|
||||
|
||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||
for (Zone leftZone : leftOf) {
|
||||
boolean intersects = false;
|
||||
for (Zone rightZone : rightOf) {
|
||||
if (leftZone.intersectsY(rightZone)) {
|
||||
intersects = true;
|
||||
break;
|
||||
}
|
||||
// early stopping
|
||||
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!intersects) {
|
||||
leftNotIntersecting.add(leftZone);
|
||||
}
|
||||
}
|
||||
|
||||
List<Zone> rightNotIntersecting = new ArrayList<>();
|
||||
for (Zone rightZone : rightOf) {
|
||||
boolean intersects = false;
|
||||
for (Zone leftZone : leftOf) {
|
||||
if (rightZone.intersectsY(leftZone)) {
|
||||
intersects = true;
|
||||
break;
|
||||
}
|
||||
// early stopping
|
||||
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!intersects) {
|
||||
rightNotIntersecting.add(rightZone);
|
||||
}
|
||||
}
|
||||
|
||||
leftOf.removeAll(leftNotIntersecting);
|
||||
rightOf.removeAll(rightNotIntersecting);
|
||||
|
||||
middle.addAll(leftNotIntersecting);
|
||||
middle.addAll(rightNotIntersecting);
|
||||
|
||||
List<Zone> sortedZones = new ArrayList<>();
|
||||
sortedZones.addAll(leftOf);
|
||||
|
||||
@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
|
||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user