RED-8666 - implement reading order resolver

- added double column detection logic
- removed left/right zones that do not intersect on the y-axis and added them to the middle zones for manual sorting
This commit is contained in:
maverickstuder 2024-03-04 16:50:02 +01:00
parent fc43bccf60
commit 4644803fa8
6 changed files with 91 additions and 19 deletions

View File

@ -256,15 +256,14 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = tableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);

View File

@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOder) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
// Underlines and strikethroughs also show up as rulings, but we don't want to split blocks on them, so we use cells instead.
List<Ruling> usedHorizonalRulings = new ArrayList<>();
@ -52,7 +52,7 @@ public class DocstrumBlockificationService {
});
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOder);
var zones = docstrumSegmentationService.segmentPage(textPositions);
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();

View File

@ -29,7 +29,7 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOder) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
@ -37,7 +37,7 @@ public class DocstrumSegmentationService {
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones, xyOder);
return readingOrderService.resolve(zones);
}

View File

@ -42,7 +42,16 @@ public abstract class BoundingBox {
public boolean contains(Rectangle2D contained, double tolerance) {
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
return bBox.getX() <= contained.getX() + tolerance
&& bBox.getY() <= contained.getY() + tolerance
&& bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance
&& bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
}
/**
 * Checks whether the vertical extent of this bounding box overlaps the vertical extent of another one.
 * Boxes whose edges merely touch are treated as overlapping, because both comparisons are inclusive.
 *
 * @param other the bounding box to compare against
 * @return {@code true} if the two y-ranges share at least one point, {@code false} otherwise
 */
public boolean intersectsY(BoundingBox other) {
    var ownBox = this.getBBox();
    var otherBox = other.getBBox();
    // This box must start no later than the other one ends ...
    if (!(ownBox.getMinY() <= otherBox.getMaxY())) {
        return false;
    }
    // ... and must end no earlier than the other one starts.
    return ownBox.getMaxY() >= otherBox.getMinY();
}
}

View File

@ -2,8 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.s
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import org.springframework.stereotype.Service;
@ -15,22 +17,42 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.ut
public class ReadingOrderService {
private static final double THRESHOLD = 5;
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
public List<Zone> resolve(List<Zone> zones, boolean xyOrder) {
public List<Zone> resolve(List<Zone> zones) {
if (zones.isEmpty() || zones.size() == 1) {
return zones;
}
if (xyOrder) {
// QuickSort.sort(zones, new ZoneComparator());
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, 0)));
return zones;
Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY());
long maxY = Math.round(zone.getBBox().getMaxY());
for (long i = minY; i <= maxY; i++) {
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
}
}
return resolveMultiColumnReadingOder(zones);
if (histogram.values()
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones);
} else {
return resolveMultiColumnReadingOder(zones);
}
}
/**
 * Orders zones for a single-column layout: primarily by their Y value, secondarily by their X value.
 * Both keys are compared through {@code DoubleUtils.compareDouble} with {@link #THRESHOLD} as third argument.
 * The given list is sorted in place and the same list instance is returned.
 *
 * @param zones the zones to order; this list is modified
 * @return the same {@code zones} list after sorting
 */
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
// NOTE(review): presumably compareDouble treats values closer than THRESHOLD as equal, so zones on
// roughly the same line fall through to the X comparison — confirm against DoubleUtils.
// NOTE(review): if so, the comparison is not guaranteed to be transitive; verify that List.sort
// cannot reject it for violating the Comparator contract.
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
return zones;
}
@ -67,13 +89,55 @@ public class ReadingOrderService {
}
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {
boolean intersects = false;
for (Zone rightZone : rightOf) {
if (leftZone.intersectsY(rightZone)) {
intersects = true;
break;
}
// early stopping
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
break;
}
}
if (!intersects) {
leftNotIntersecting.add(leftZone);
}
}
List<Zone> rightNotIntersecting = new ArrayList<>();
for (Zone rightZone : rightOf) {
boolean intersects = false;
for (Zone leftZone : leftOf) {
if (rightZone.intersectsY(leftZone)) {
intersects = true;
break;
}
// early stopping
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
break;
}
}
if (!intersects) {
rightNotIntersecting.add(rightZone);
}
}
leftOf.removeAll(leftNotIntersecting);
rightOf.removeAll(rightNotIntersecting);
middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);

View File

@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();