diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java index b6567a9..f914d29 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -4,6 +4,7 @@ public enum LayoutParsingType { REDACT_MANAGER, REDACT_MANAGER_OLD, REDACT_MANAGER_PARAGRAPH_DEBUG, + REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, DOCUMINE, DOCUMINE_OLD, CLARIFYND, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 135c389..92b03b9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -51,8 +51,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; -import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService; import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; @@ -319,10 +319,16 @@ public class LayoutParsingPipeline { case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); - case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> - docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> - docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); + case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words, + cleanRulings, + true, + classificationDocument.getVisualizations(), + layoutParsingType); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, + cleanRulings, + false, + classificationDocument.getVisualizations(), + layoutParsingType); }; classificationPage.setCleanRulings(cleanRulings); @@ -381,8 +387,8 @@ public class LayoutParsingPipeline { } log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { - case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG -> - redactManagerClassificationService.classifyDocument(classificationDocument); + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument( + classificationDocument); case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 2aefa76..e0a046b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -59,8 +59,10 @@ public class DocstrumBlockificationService { mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0); - if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) { - combineBlocks(classificationPage); + if (layoutParsingType == LayoutParsingType.DOCUMINE + || layoutParsingType == LayoutParsingType.REDACT_MANAGER + || layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) { + combineBlocks(classificationPage, layoutParsingType); } if (layoutParsingType == LayoutParsingType.CLARIFYND) { @@ -106,7 +108,7 @@ public class DocstrumBlockificationService { } - public void combineBlocks(ClassificationPage page) { + public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) { TextPageBlock previous = new TextPageBlock(); ListIterator itty = page.getTextBlocks().listIterator(); @@ -138,7 +140,8 @@ public class DocstrumBlockificationService { } if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { - previous = combineBlocksAndResetIterator(previous, current, itty, true); +// previous = combineBlocksAndResetIterator(previous, current, itty, true); + previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH); continue; } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 8763e37..d73fc14 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { file = new File(filePath); } - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true); prepareStorage(layoutParsingRequest, file); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java index 320e8ac..b96b411 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -56,7 +56,7 @@ public class OutlineDetectionTest extends AbstractTest { var documentFile = new ClassPathResource(fileName).getFile(); long start = System.currentTimeMillis(); - ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER); + ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH); OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); assertEquals(outlineObjectTree.getRootNodes().size(), 8); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java index d2dfe41..ab7e4ce 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java @@ -64,8 +64,8 @@ public class SimplifiedTextServiceTest @SneakyThrows protected Document buildGraph(File file) { - return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, file, new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index d523d2e..8d8597a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -56,8 +56,8 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index a3d6ec9..3351eb0 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -38,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { var documentFile = new ClassPathResource(fileName).getFile(); long start = System.currentTimeMillis(); - Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 52ef0b9..abd9788 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -54,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { - return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, originDocument, new ImageServiceResponse(), tableServiceResponse, @@ -122,7 +122,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .get(0).getTextBlocks() .get(0).toString()).contains(textToSearch); - Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); + Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument); TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock(); assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index cb91962..919aac5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -103,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), new VisualLayoutParsingResponse(), Map.of("file",filename.toFile().toString()))); - Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index 32b0e6f..6857f68 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -50,7 +50,7 @@ public abstract class BuildDocumentTest extends AbstractTest { if (!filename.startsWith("files") && filename.startsWith("/")) { - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true); prepareStorage(layoutParsingRequest, new File(filename)); return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, layoutParsingPipeline.parseLayout(layoutParsingType,