From 1d765a6baa3541ddb60193e345a92954e890a339 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Thu, 14 Mar 2024 16:30:52 +0100 Subject: [PATCH] RED-7141: Fixed more overlap problems --- .../DocstrumBlockificationService.java | 21 +++++++++++-------- .../services/factory/SectionNodeFactory.java | 2 +- .../server/graph/ViewerDocumentTest.java | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 535baf6..92f3a73 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -54,7 +54,11 @@ public class DocstrumBlockificationService { var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder); - return new ClassificationPage(pageBlocks); + var classificationPage = new ClassificationPage(pageBlocks); + + mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0); + + return classificationPage; } @@ -91,8 +95,6 @@ public class DocstrumBlockificationService { public void combineBlocks(ClassificationPage page) { - mergeIntersectingBlocks(page.getTextBlocks(), 0, 0); - TextPageBlock previous = new TextPageBlock(); ListIterator itty = page.getTextBlocks().listIterator(); while (itty.hasNext()) { @@ -112,22 +114,22 @@ public class DocstrumBlockificationService { } if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { - combineBlocksAndResetIterator(previous, current, itty, true); + previous = combineBlocksAndResetIterator(previous, current, itty, true); continue; } if (previous.almostIntersects(current, 0, 0)) { - combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); + previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) { - combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); + previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) { - combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); + previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } @@ -135,7 +137,7 @@ public class DocstrumBlockificationService { previous = current; } - mergeIntersectingBlocks(page.getTextBlocks(), 0, 0); + mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f); } @@ -165,7 +167,7 @@ public class DocstrumBlockificationService { } - private void combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator itty, boolean toDuplicate) { + private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator itty, boolean toDuplicate) { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); @@ -174,6 +176,7 @@ public class DocstrumBlockificationService { itty.previous(); itty.set(previous); itty.next(); + return previous; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 490f83c..31c723d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -91,7 +91,7 @@ public class SectionNodeFactory { if (abstractPageBlock instanceof TextPageBlock) { switch (layoutParsingType) { - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { + case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { alreadyMerged.add(abstractPageBlock); remainingBlocks.remove(abstractPageBlock); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index bd187ed..197780e 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/new/270 rotated text on non rotated pages.pdf"; + String fileName = "files/100 Trinexapac-ethyl_RAR_20_Volume_3CP_B-9_ 2018-01-10.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile();