diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 02b3187..f8c89d2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -254,7 +254,9 @@ public class LayoutParsingPipeline { OutlineObject lastProcessedOutlineObject = null; // parsing the structure elements could be useful as well - classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); + if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); + } long pageCount = originDocument.getNumberOfPages(); @@ -330,16 +332,18 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); - List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>()); + if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>()); - OutlineObject notFoundOutlineObject = null; - if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { - lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight())); - notFoundOutlineObject = lastProcessedOutlineObject; - } - if (!outlineObjects.isEmpty()) { - classificationPage.setOutlineObjects(outlineObjects); - lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); + OutlineObject notFoundOutlineObject = null; + if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { + lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight())); + notFoundOutlineObject = lastProcessedOutlineObject; + } + if (!outlineObjects.isEmpty()) { + classificationPage.setOutlineObjects(outlineObjects); + lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); + } } classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java index 58fea4e..cc29901 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java @@ -12,6 +12,7 @@ import lombok.NoArgsConstructor; @Data @NoArgsConstructor +@Deprecated public class ClassificationSection { private List pageBlocks = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java index e3fe66c..d839c06 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import lombok.Data; import lombok.EqualsAndHashCode; +import lombok.ToString; import lombok.experimental.SuperBuilder; @Data @@ -9,4 +10,10 @@ import lombok.experimental.SuperBuilder; @EqualsAndHashCode(callSuper = true) public class SuperSection extends Section { + @Override + public String toString() { + + return super.toString(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index adb008b..50a0251 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -59,11 +59,6 @@ public class DocumentGraphFactory { document.getPages() .forEach(context::buildAndAddPageWithCounter); - document.getSections() - .stream() - .flatMap(section -> section.getImages() - .stream()) - .forEach(image -> context.getImages().add(image)); addSections(layoutParsingType, document, context, documentGraph); addHeaderAndFooterToEachPage(document, context); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index b78e53b..fc43dfd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 05637b7..a3d6ec9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf"; + String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 1981530..52ef0b9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; -import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; @@ -62,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { tableServiceResponse, new VisualLayoutParsingResponse(), Map.of("file", "document")); + } @@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Test @SneakyThrows public void testTableAndCellRotations() { + String fileName = "files/Minimal Examples/simpleTablesRotated.pdf"; ClassPathResource pdfFileResource = new ClassPathResource(fileName); @@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest { } - @Disabled @Test public void testScanRotationBorderIsIgnored() throws IOException { @@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - var tables = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + var tables = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList(); // Quality of the table parsing is not good, because the file is rotated at scanning. @@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock table = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(table.getColCount()).isEqualTo(6); @@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { "files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() + TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(1); assertThat(secondTable.getColCount()).isEqualTo(8); @@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { "files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(firstTable.getColCount()).isEqualTo(9); assertThat(firstTable.getRowCount()).isEqualTo(5); - TablePageBlock secondTable = document.getSections() + TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(1); assertThat(secondTable.getColCount()).isEqualTo(9); @@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { "files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() + TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(1); assertThat(secondTable.getColCount()).isEqualTo(8); @@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { - var tables = document.getSections() + var tables = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList(); StringBuilder sb = new StringBuilder(); @@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { - TablePageBlock table = document.getSections() + TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(tableIndex); + List> rows = table.getRows(); int emptyCellsFoundFound = rows.stream() .flatMap(List::stream) @@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, List> values) { - TablePageBlock table = document.getSections() + TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(tableIndex); List> rows = table.getRows(); @@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTableSize(ClassificationDocument document, int tableSize) { - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .toList().size()).isEqualTo(tableSize); }