diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index 109daa4..f82d3fa 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -43,6 +43,11 @@ public class Document extends AbstractSemanticNode { } + /** + * Gets the sections of the document as a list. + * + * @return A list of all sections within the document. + */ public List
getAllSections() { return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node) @@ -50,6 +55,34 @@ public class Document extends AbstractSemanticNode { } + /** + * Gets the main sections of the document as a list. + * + * @return A list of main sections within the document + * @deprecated This method is marked for removal. + * Use {@link #streamChildrenOfType(NodeType)} instead, + * or {@link #getChildrenOfTypeSectionOrSuperSection()} which returns children of type SECTION as well as SUPER_SECTION. + */ + @Deprecated(forRemoval = true) + public List
getMainSections() { + + return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node) + .collect(Collectors.toList()); + } + + + /** + * Gets the direct children of type SECTION or SUPER_SECTION of the document as a list of SemanticNode objects. + * + * @return A list of all children of type SECTION or SUPER_SECTION. + */ + public List getChildrenOfTypeSectionOrSuperSection() { + + return streamChildren().filter(semanticNode -> semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) + .toList(); + } + + public List
getHeaders() { return streamChildrenOfType(NodeType.HEADER).map(node -> (Header) node) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java index b66ebaf..71da252 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java @@ -29,7 +29,7 @@ public class TOCEnrichmentService { TableOfContents toc = document.getTableOfContents(); Iterator iterator = toc.iterator(); TableOfContentItem currentTOCItem = null; - if(iterator.hasNext()) { + if (iterator.hasNext()) { currentTOCItem = iterator.next(); } List startBlocks = new ArrayList<>(); @@ -79,7 +79,7 @@ public class TOCEnrichmentService { currentTOCItem.getSectionBlocks().add(current); currentPageTOCItems.add(currentTOCItem); - if(iterator.hasNext()) { + if (iterator.hasNext()) { currentTOCItem = iterator.next(); } } else if (!foundFirstHeadline) { @@ -149,9 +149,9 @@ public class TOCEnrichmentService { } } if (!image.isAppendedToSection()) { - log.debug("Image uses first paragraph"); + log.debug("Image uses last found section"); if (!lastFoundTOCItems.isEmpty()) { - lastFoundTOCItems.get(0).getImages().add(image); + lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image); } else { startImages.add(image); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java new file mode 100644 index 0000000..320e8ac --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -0,0 +1,231 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.File; +import java.nio.file.Path; +import java.util.Collection; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.function.Predicate; + +import org.apache.commons.lang3.StringUtils; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; +import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; + +import lombok.SneakyThrows; + +public class OutlineDetectionTest extends AbstractTest { + + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); + LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); + + @Autowired + protected LayoutParsingPipeline layoutParsingPipeline; + + + @Test + @SneakyThrows + public void testOutlinesToSections() { + + String fileName = "files/new/crafted_outline_test_doc.pdf"; + + String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; + + var documentFile = new ClassPathResource(fileName).getFile(); + + long start = System.currentTimeMillis(); + ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER); + + OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); + assertEquals(outlineObjectTree.getRootNodes().size(), 8); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(1).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(3).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(4).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(5).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(6).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(7).size(), 3); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(8).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(10).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(11).size(), 4); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(12).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage() + .get(13).size(), 2); + assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values() + .stream() + .flatMap(Collection::stream) + .allMatch(OutlineObject::isFound)); + + TableOfContents tableOfContents = classificationDocument.getTableOfContents(); + + assertEquals(tableOfContents.getMainSections().size(), 9); + assertEquals(tableOfContents.getMainSections().subList(1, 9) + .stream() + .map(tableOfContentItem -> sanitizeString(tableOfContentItem.getHeadline().toString())) + .toList(), + outlineObjectTree.getRootNodes() + .stream() + .map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle())) + .toList()); + assertEquals(tableOfContents.getMainSections() + .get(5).getChildren().size(), 6); + assertEquals(tableOfContents.getMainSections() + .get(7).getChildren().size(), 3); + assertEquals(tableOfContents.getMainSections() + .get(8).getChildren().size(), 3); + assertEquals(tableOfContents.getMainSections() + .get(8).getChildren() + .get(2).getChildren().size(), 1); + assertEquals(tableOfContents.getMainSections() + .get(8).getChildren() + .get(2).getChildren() + .get(0).getChildren().size(), 3); + + assertEquals(tableOfContents.getMainSections() + .get(0).getImages().size(), 1); + assertEquals(tableOfContents.getMainSections() + .get(6).getImages().size(), 1); + assertEquals(tableOfContents.getMainSections() + .get(8).getChildren() + .get(2).getChildren() + .get(0).getChildren() + .get(2).getImages().size(), 1); + + Document document = buildGraph(fileName, classificationDocument); + + assertTrue(tableOfContents.getAllTableOfContentItems() + .stream() + .allMatch(tableOfContentItem -> tableOfContentItem.getSection() != null)); + assertTrue(tableOfContents.getAllTableOfContentItems() + .stream() + .filter(tableOfContentItem -> tableOfContentItem.getChildren().isEmpty()) + .allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof Section)); + assertTrue(tableOfContents.getAllTableOfContentItems() + .stream() + .filter(tableOfContentItem -> !tableOfContentItem.getChildren().isEmpty()) + .allMatch(tableOfContentItem -> tableOfContentItem.getSection() instanceof SuperSection)); + + List childrenOfTypeSectionOrSuperSection = document.getChildrenOfTypeSectionOrSuperSection(); + + assertEquals(childrenOfTypeSectionOrSuperSection.size(), 9); + assertEquals(childrenOfTypeSectionOrSuperSection.subList(1, 9) + .stream() + .map(section -> sanitizeString(section.getHeadline().getLeafTextBlock().toString())) + .toList(), + outlineObjectTree.getRootNodes() + .stream() + .map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle())) + .toList()); + Predicate isSectionOrSuperSection = semanticNode -> semanticNode instanceof Section || semanticNode instanceof SuperSection; + assertEquals(childrenOfTypeSectionOrSuperSection.get(5).streamChildren() + .filter(isSectionOrSuperSection) + .count(), 6 + 1); // 1 additional for main text of parent section + assertEquals(childrenOfTypeSectionOrSuperSection.get(7).streamChildren() + .filter(isSectionOrSuperSection) + .count(), 3 + 1); + assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren() + .filter(isSectionOrSuperSection) + .count(), 3 + 1); + assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren() + .filter(isSectionOrSuperSection) + .toList() + .get(3).streamChildren() + .filter(isSectionOrSuperSection) + .count(), 1 + 1); + assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren() + .filter(isSectionOrSuperSection) + .toList() + .get(3).streamChildren() + .filter(isSectionOrSuperSection) + .toList() + .get(1).streamChildren() + .filter(isSectionOrSuperSection) + .count(), 3 + 1); + + List> imageTreeIdList = document.streamAllImages() + .map(image -> image.getParent().getTreeId()) + .toList(); + + assertEquals(imageTreeIdList.get(0), List.of(0)); + assertEquals(imageTreeIdList.get(1), List.of(6)); + assertEquals(imageTreeIdList.get(2), List.of(8, 4, 2, 4)); + + layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false); + System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); + } + + + private static String sanitizeString(String text) { + + return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT); + } + + + @SneakyThrows + protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) { + + File fileResource = new ClassPathResource(filename).getFile(); + prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/crafted_outline_test_doc.IMAGE_INFO.json"); + return layoutParsingPipeline.parseLayout(layoutParsingType, + fileResource, + layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + Map.of("file", filename, "debug", "true")); + } + + + @SneakyThrows + protected Document buildGraph(String filename, ClassificationDocument classificationDocument) { + + if (!filename.startsWith("files") && filename.startsWith("/")) { + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true); + prepareStorage(layoutParsingRequest, new File(filename)); + return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + new File(filename), + layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() + .get()), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + layoutParsingRequest.identifier())); + } else { + prepareStorage(filename); + return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); + } + + } + +} + diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index fa10afc..0a398bf 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/syngenta/CustomerFiles/SinglePages/S4_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/new/crafted_outline_test_doc.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index cbd6201..32b0e6f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -48,6 +48,7 @@ public abstract class BuildDocumentTest extends AbstractTest { @SneakyThrows protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) { + if (!filename.startsWith("files") && filename.startsWith("/")) { LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true); prepareStorage(layoutParsingRequest, new File(filename)); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/crafted_outline_test_doc.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/crafted_outline_test_doc.pdf new file mode 100644 index 0000000..1a246fd --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/crafted_outline_test_doc.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8afb731a307e1a3f827c59e902164b10bdabef96e14193b949fe081cd3aa859f +size 168878 diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/crafted_outline_test_doc.IMAGE_INFO.json b/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/crafted_outline_test_doc.IMAGE_INFO.json new file mode 100644 index 0000000..a1839b7 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/image_service_response/crafted_outline_test_doc.IMAGE_INFO.json @@ -0,0 +1,135 @@ +{ + "dossierId": "a91f19ff-11ba-4735-9f60-c650243f64a9", + "fileId": "6e8c5f114e2b71e103a32a20c5273188", + "targetFileExtension": "ORIGIN.pdf.gz", + "responseFileExtension": "IMAGE_INFO.json.gz", + "X-TENANT-ID": "redaction", + "data": [ + { + "classification": { + "label": "other", + "probabilities": { + "other": 0.9126, + "formula": 0.0588, + "signature": 0.0261, + "logo": 0.0024 + } + }, + "representation": "70E1070C1030E081B7EF7FFFF", + "position": { + "x1": 61, + "x2": 394, + "y1": 155, + "y2": 470, + "pageNumber": 1 + }, + "geometry": { + "width": 333, + "height": 315 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.5976, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.0571, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "other", + "probabilities": { + "other": 0.9126, + "formula": 0.0588, + "signature": 0.0261, + "logo": 0.0024 + } + }, + "representation": "70E1070C1030E081B7EF7FFFF", + "position": { + "x1": 61, + "x2": 394, + "y1": 202, + "y2": 517, + "pageNumber": 11 + }, + "geometry": { + "width": 333, + "height": 315 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.5976, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.0571, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + }, + { + "classification": { + "label": "other", + "probabilities": { + "other": 0.9126, + "formula": 0.0588, + "signature": 0.0261, + "logo": 0.0024 + } + }, + "representation": "70E1070C1030E081B7EF7FFFF", + "position": { + "x1": 47, + "x2": 379, + "y1": 289, + "y2": 604, + "pageNumber": 16 + }, + "geometry": { + "width": 332, + "height": 315 + }, + "alpha": false, + "filters": { + "geometry": { + "imageSize": { + "quotient": 0.5967, + "tooLarge": false, + "tooSmall": false + }, + "imageFormat": { + "quotient": 1.054, + "tooTall": false, + "tooWide": false + } + }, + "probability": { + "unconfident": false + }, + "allPassed": true + } + } + ] +} \ No newline at end of file