From d2dc369df3b6bcd5b56f754a9ac5d81ea3b5a64d Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Tue, 7 May 2024 14:25:54 +0200 Subject: [PATCH] RED-7074: Design Subsection section tree structure algorithm * temp --- .../processor/model/graph/nodes/Section.java | 11 ++++++++++ .../textblock/ConcatenatedTextBlock.java | 13 ++++++----- .../outline/OutlineExtractorService.java | 22 ++++++++++++------- .../services/factory/SectionNodeFactory.java | 3 +-- .../server/graph/ViewerDocumentTest.java | 5 ++++- 5 files changed, 38 insertions(+), 16 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index 3a59884..532115a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -1,8 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import java.awt.geom.Rectangle2D; +import java.util.Arrays; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -42,6 +44,10 @@ public class Section implements GenericSemanticNode { @EqualsAndHashCode.Exclude Map bBoxCache; + @EqualsAndHashCode.Exclude + boolean isMainSection; + + @Override public NodeType getType() { @@ -56,6 +62,11 @@ public class Section implements GenericSemanticNode { .isPresent(); } + public boolean isLeafSection() { + + return streamAllSubNodesOfType(NodeType.SECTION).findAny() + .isEmpty(); + } @Override public TextBlock getTextBlock() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java index 10ce939..7038dbf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java @@ -50,14 +50,17 @@ public class ConcatenatedTextBlock implements TextBlock { public ConcatenatedTextBlock concat(TextBlock textBlock) { + int start = textBlock.getBoundary().start(); + int end = textBlock.getBoundary().end(); if (this.atomicTextBlocks.isEmpty()) { - boundary.setStart(textBlock.getBoundary().start()); - boundary.setEnd(textBlock.getBoundary().end()); - } else if (boundary.end() != textBlock.getBoundary().start()) { - throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary())); + boundary.setStart(start); + boundary.setEnd(end); + } else if (boundary.end() != start) { + //throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary())); + return this; } this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks()); - boundary.setEnd(textBlock.getBoundary().end()); + boundary.setEnd(end); this.searchText = null; return this; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java index eb3f31b..3cc94ce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java @@ -51,7 +51,8 @@ public class OutlineExtractorService { List rootNodes = new ArrayList<>(); if (documentOutline != null) { for (PDOutlineItem child : documentOutline.children()) { - rootNodes.add(createOutlineObjectWithChildren(child, document, 1)); + Optional outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1); + outlineObjectWithChildren.ifPresent(rootNodes::add); } } @@ -60,12 +61,14 @@ public class OutlineExtractorService { @SneakyThrows - private OutlineObjectTreeNode createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) { + private Optional createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) { - OutlineObjectTreeNode outlineObject = createOutlineObject(item, document, depth); - for (var child : item.children()) { - OutlineObjectTreeNode outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1); - outlineObject.addChild(outlineObjectWithChildren); + Optional outlineObject = createOutlineObject(item, document, depth); + if (outlineObject.isPresent()) { + for (var child : item.children()) { + Optional outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1); + outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode)); + } } return outlineObject; @@ -75,11 +78,14 @@ public class OutlineExtractorService { // if the structure elements are processed beforehand, another case can be handled here as well: // outline objects can reference structure elements (see pdf documentation) @SneakyThrows - private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) { + private Optional createOutlineObject(PDOutlineItem item, PDDocument document, int depth) { String title = item.getTitle(); PDPage page = item.findDestinationPage(document); + if (page == null) { + return Optional.empty(); + } int pageNumber = document.getPages().indexOf(page); Optional outlinePosition = Optional.empty(); @@ -109,7 +115,7 @@ public class OutlineExtractorService { log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title)); } - return new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)); + return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth))); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 90b2e8a..459932c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -50,8 +50,7 @@ public class SectionNodeFactory { Map> blocksPerPage = pageBlocks.stream() .collect(groupingBy(AbstractPageBlock::getPage)); - Section section = Section.builder().documentTree(context.getDocumentTree()) - .build(); + Section section = Section.builder().isMainSection(parentNode == null).documentTree(context.getDocumentTree()).build(); context.getSections().add(section); blocksPerPage.keySet() diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index a0246cb..5e5028a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -29,6 +29,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @Test @SneakyThrows + @Disabled public void testViewerDocuments() { String directory = "files/syngenta_190_deduplicated/"; @@ -81,7 +82,9 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + + String fileName = "files/documine/20_TiltPlus_SensibilizacaoCutanea.pdf"; + //String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; //String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf"; //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; //String fileName = "files/new/kaust-official-thesis-template.pdf";