From 4644803fa83d83e9316957e5f9d30a4826c7385c Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Mon, 4 Mar 2024 16:50:02 +0100 Subject: [PATCH] RED-8666 - implement reading order resolver - added double column detection logic - removing non y-intersecting left/right zones and adding them to middle for manual sorting --- .../processor/LayoutParsingPipeline.java | 5 +- .../DocstrumBlockificationService.java | 4 +- .../docstrum/DocstrumSegmentationService.java | 4 +- .../services/docstrum/model/BoundingBox.java | 11 ++- .../docstrum/service/ReadingOrderService.java | 84 ++++++++++++++++--- .../server/graph/ViewerDocumentTest.java | 2 +- 6 files changed, 91 insertions(+), 19 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 895f932..bf1ee0c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -256,15 +256,14 @@ public class LayoutParsingPipeline { PDRectangle cropbox = pdPage.getCropBox(); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); - List emptyTableCells = tableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); - case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); + case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); }; classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index ccd3f94..8236e2b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -38,7 +38,7 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage blockify(List textPositions, List cells, boolean xyOder) { + public ClassificationPage blockify(List textPositions, List cells) { // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. List usedHorizonalRulings = new ArrayList<>(); @@ -52,7 +52,7 @@ public class DocstrumBlockificationService { }); List abstractPageBlocks = new ArrayList<>(); - var zones = docstrumSegmentationService.segmentPage(textPositions, xyOder); + var zones = docstrumSegmentationService.segmentPage(textPositions); zones.forEach(zone -> { List textPositionSequences = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java index 5fa3e01..9fd0ef3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java @@ -29,7 +29,7 @@ public class DocstrumSegmentationService { private final ReadingOrderService readingOrderService; - public List segmentPage(List textPositions, boolean xyOder) { + public List segmentPage(List textPositions) { List zones = new ArrayList<>(); zones.addAll(computeZones(textPositions, TextDirection.ZERO)); @@ -37,7 +37,7 @@ public class DocstrumSegmentationService { zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE)); zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE)); - return readingOrderService.resolve(zones, xyOder); + return readingOrderService.resolve(zones); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java index 5215d6f..79647ed 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java @@ -42,7 +42,16 @@ public abstract class BoundingBox { public boolean contains(Rectangle2D contained, double tolerance) { - return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance; + return bBox.getX() <= contained.getX() + tolerance + && bBox.getY() <= contained.getY() + tolerance + && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance + && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance; + } + + + public boolean intersectsY(BoundingBox other) { + + return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java index 3e1ab25..3503bea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java @@ -2,8 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.s import java.util.ArrayList; import java.util.Comparator; +import java.util.HashMap; import java.util.List; import java.util.ListIterator; +import java.util.Map; import org.springframework.stereotype.Service; @@ -15,22 +17,42 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.ut public class ReadingOrderService { private static final double THRESHOLD = 5; + public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; - public List resolve(List zones, boolean xyOrder) { + public List resolve(List zones) { if (zones.isEmpty() || zones.size() == 1) { return zones; } - if (xyOrder) { -// QuickSort.sort(zones, new ZoneComparator()); - zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, 0))); - return zones; + Map histogram = new HashMap<>(); + for (Zone zone : zones) { + long minY = Math.round(zone.getBBox().getMinY()); + long maxY = Math.round(zone.getBBox().getMaxY()); + for (long i = minY; i <= maxY; i++) { + histogram.put(i, histogram.getOrDefault(i, 0) + 1); + } } - return resolveMultiColumnReadingOder(zones); + if (histogram.values() + .stream() + .mapToInt(Integer::intValue).average() + .orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { + return resolveSingleColumnReadingOrder(zones); + } else { + + return resolveMultiColumnReadingOder(zones); + } + + } + + + private static List resolveSingleColumnReadingOrder(List zones) { + + zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + return zones; } @@ -67,13 +89,55 @@ public class ReadingOrderService { } leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + List leftNotIntersecting = new ArrayList<>(); + for (Zone leftZone : leftOf) { + boolean intersects = false; + for (Zone rightZone : rightOf) { + if (leftZone.intersectsY(rightZone)) { + intersects = true; + break; + } + // early stopping + if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) { + break; + } + } + if (!intersects) { + leftNotIntersecting.add(leftZone); + } + } + + List rightNotIntersecting = new ArrayList<>(); + for (Zone rightZone : rightOf) { + boolean intersects = false; + for (Zone leftZone : leftOf) { + if (rightZone.intersectsY(leftZone)) { + intersects = true; + break; + } + // early stopping + if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) { + break; + } + } + if (!intersects) { + rightNotIntersecting.add(rightZone); + } + } + + leftOf.removeAll(leftNotIntersecting); + rightOf.removeAll(rightNotIntersecting); + + middle.addAll(leftNotIntersecting); + middle.addAll(rightNotIntersecting); List sortedZones = new ArrayList<>(); sortedZones.addAll(leftOf); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 7f85f01..c7291e6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/new/270 rotated text on non rotated pages.pdf"; + String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile();