From d32182ece8e51092a69e484884260d2acef73da0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Tue, 5 Mar 2024 10:32:21 +0100 Subject: [PATCH] RED-7141: Fixed missing spaces in ExpansionTest.pdf --- .../processor/LayoutParsingPipeline.java | 3 ++- .../DocstrumBlockificationService.java | 4 ++-- .../docstrum/DocstrumSegmentationService.java | 4 ++-- .../services/docstrum/model/Line.java | 2 +- .../docstrum/service/ReadingOrderService.java | 19 ++++++++++--------- .../server/graph/ViewerDocumentTest.java | 2 +- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index bf1ee0c..e2a9fa1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -263,7 +263,8 @@ public class LayoutParsingPipeline { redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); + case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); + case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); }; classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 8236e2b..c71c1e7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -38,7 +38,7 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage blockify(List textPositions, List cells) { + public ClassificationPage blockify(List textPositions, List cells, boolean xyOrder) { // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. List usedHorizonalRulings = new ArrayList<>(); @@ -52,7 +52,7 @@ public class DocstrumBlockificationService { }); List abstractPageBlocks = new ArrayList<>(); - var zones = docstrumSegmentationService.segmentPage(textPositions); + var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); zones.forEach(zone -> { List textPositionSequences = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java index 9fd0ef3..019b4a8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java @@ -29,7 +29,7 @@ public class DocstrumSegmentationService { private final ReadingOrderService readingOrderService; - public List segmentPage(List textPositions) { + public List segmentPage(List textPositions, boolean xyOrder) { List zones = new ArrayList<>(); zones.addAll(computeZones(textPositions, TextDirection.ZERO)); @@ -37,7 +37,7 @@ public class DocstrumSegmentationService { zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE)); zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE)); - return readingOrderService.resolve(zones); + return readingOrderService.resolve(zones, xyOrder); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java index e9e0201..9cbdd18 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java @@ -12,7 +12,7 @@ import lombok.Data; @Data public class Line extends BoundingBox { - private static final double WORD_DISTANCE_MULTIPLIER = 0.2; + private static final double WORD_DISTANCE_MULTIPLIER = 0.18; private final double x0; private final double y0; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java index 3503bea..e084d88 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java @@ -20,12 +20,16 @@ public class ReadingOrderService { public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; - public List resolve(List zones) { + public List resolve(List zones, boolean xyReadingOrder) { if (zones.isEmpty() || zones.size() == 1) { return zones; } + if (xyReadingOrder) { + return resolveSingleColumnReadingOrder(zones); + } + Map histogram = new HashMap<>(); for (Zone zone : zones) { long minY = Math.round(zone.getBBox().getMinY()); @@ -35,10 +39,7 @@ public class ReadingOrderService { } } - if (histogram.values() - .stream() - .mapToInt(Integer::intValue).average() - .orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { + if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { return resolveSingleColumnReadingOrder(zones); } else { @@ -51,7 +52,7 @@ public class ReadingOrderService { private static List resolveSingleColumnReadingOrder(List zones) { zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); return zones; } @@ -89,13 +90,13 @@ public class ReadingOrderService { } leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); List leftNotIntersecting = new ArrayList<>(); for (Zone leftZone : leftOf) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index c7291e6..97fd9ec 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf"; + String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile();