From efb1a748af21826215d59111a6f4ee4a0d5db2c8 Mon Sep 17 00:00:00 2001 From: Maverick Studer Date: Tue, 28 May 2024 14:48:21 +0200 Subject: [PATCH] RED-7074: Design Subsection section tree structure algorithm --- .../processor/model/graph/nodes/Document.java | 4 ++-- .../processor/model/outline/TOCEnrichmentService.java | 4 +--- .../processor/model/text/RedTextPosition.java | 2 +- .../processor/services/SimplifiedSectionTextService.java | 3 +-- .../processor/services/factory/SectionNodeFactory.java | 9 --------- .../layoutparser/server/graph/ViewerDocumentTest.java | 2 +- .../server/segmentation/PdfSegmentationServiceTest.java | 2 +- 7 files changed, 7 insertions(+), 19 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index 77a1b8a..109daa4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -43,9 +43,9 @@ public class Document extends AbstractSemanticNode { } - public List
<Section> getMainSections() { + public List<Section>
getAllSections() { - return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node) + return streamAllSubNodesOfType(NodeType.SECTION).map(node -> (Section) node) .collect(Collectors.toList()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java index d5526f6..b66ebaf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java @@ -82,9 +82,7 @@ public class TOCEnrichmentService { if(iterator.hasNext()) { currentTOCItem = iterator.next(); } - } - - if (!foundFirstHeadline) { + } else if (!foundFirstHeadline) { startBlocks.add(current); } else { currentSection.getSectionBlocks().add(current); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index da7b099..8a0bbc5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -68,7 +68,7 @@ public class RedTextPosition extends BoundingBox { // I guess if we start with the initial user space positions and transform them the same way we do 
the rulings it would work. pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight())); - float textHeight = textPosition.getHeight() + HEIGHT_PADDING; + float textHeight = textPosition.getHeight() + 2 * HEIGHT_PADDING; Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(), textPosition.getYDirAdj() - textHeight, textPosition.getWidthDirAdj(), diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java index 37071fb..ecde09b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SimplifiedSectionTextService.java @@ -9,7 +9,6 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import lombok.extern.slf4j.Slf4j; @@ -20,7 +19,7 @@ public class SimplifiedSectionTextService { public SimplifiedText toSimplifiedText(Document document) { - List<SimplifiedSectionText> simplifiedMainSectionsList = document.getMainSections() + List<SimplifiedSectionText> simplifiedMainSectionsList = document.getAllSections() .stream() 
.map(this::toSimplifiedSectionText) .toList(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 0cce454..2ac8735 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -69,11 +69,6 @@ public class SectionNodeFactory { addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document); if (containsTablesAndTextBlocks(pageBlocks)) { - - if (pageBlocks.get(0).isHeadline()) { - pageBlocks.remove(0); - } - splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, true, @@ -82,10 +77,6 @@ public class SectionNodeFactory { context, document)); } else if (!isLeaf) { - - if (pageBlocks.get(0).isHeadline()) { - pageBlocks.remove(0); - } addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document); } else { addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index a3d6ec9..fa10afc 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -31,7 +31,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/syngenta/CustomerFiles/SinglePages/S4_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 52ef0b9..6e600b0 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -114,7 +114,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile()); assertThat(classificationDocument.getHeaders() - .get(0).getTextBlocks().size()).isEqualTo(3); + .get(0).getTextBlocks().size()).isEqualTo(2); assertThat(classificationDocument.getHeaders() .get(0).getTextBlocks() .get(0).getSequences().size()).isEqualTo(8);