diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java index e0d08fd..e446fb0 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java @@ -6,6 +6,7 @@ import java.util.Locale; public enum NodeType implements Serializable { DOCUMENT, SECTION, + SUPER_SECTION, HEADLINE, PARAGRAPH, TABLE, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 0c8f15d..135c389 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor; import static java.lang.String.format; +import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.io.File; import java.io.IOException; @@ -29,6 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -45,6 +51,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService; import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; @@ -90,12 +97,16 @@ public class LayoutParsingPipeline { TableExtractionService tableExtractionService; DocuMineBlockificationService docuMineBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService; + BlockificationPostprocessingService blockificationPostprocessingService; DocstrumBlockificationService docstrumBlockificationService; LayoutGridService layoutGridService; ObservationRegistry observationRegistry; VisualLayoutParsingAdapter visualLayoutParsingAdapter; ClarifyndClassificationService clarifyndClassificationService; GraphicExtractorService graphicExtractorService; + OutlineExtractorService outlineExtractorService; + OutlineValidationService outlineValidationService; + TOCEnrichmentService tocEnrichmentService; LayoutparserSettings settings; @@ -123,8 +134,10 @@ public class LayoutParsingPipeline { } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId().isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); + if (layoutParsingRequest.tablesFileStorageId() + .isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() + .get()); } ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // @@ -204,15 +217,15 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { - return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -227,6 +240,7 @@ public class LayoutParsingPipeline { PDDocument originDocument = openDocument(originFile); addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath())); + Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); @@ -237,6 +251,12 @@ public class LayoutParsingPipeline { } List classificationPages = new ArrayList<>(); + OutlineObject lastProcessedOutlineObject = null; + + // parsing the structure elements could be useful as well + if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); + } long pageCount = originDocument.getNumberOfPages(); @@ -282,7 +302,13 @@ public class LayoutParsingPipeline { TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); + List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, + pdPage, + pageNumber, + cleanRulings, + stripper.getTextPositionSequences(), + + false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() @@ -306,6 +332,20 @@ public class LayoutParsingPipeline { classificationPage.setPageWidth(cropbox.getWidth()); classificationPage.setPageHeight(cropbox.getHeight()); + if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) { + List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>()); + + OutlineObject notFoundOutlineObject = null; + if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { + lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight())); + notFoundOutlineObject = lastProcessedOutlineObject; + } + if (!outlineObjects.isEmpty()) { + classificationPage.setOutlineObjects(outlineObjects); + lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); + } + } + classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox. classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents())); @@ -347,14 +387,21 @@ public class LayoutParsingPipeline { case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); } + List headlines = classificationDocument.getPages() + .stream() + .flatMap(classificationPage -> classificationPage.getTextBlocks() + .stream() + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) + .map(tb -> (TextPageBlock) tb)) + .toList(); + TableOfContents tableOfContents = outlineValidationService.createToC(headlines); + classificationDocument.setTableOfContents(tableOfContents); + log.info("Building Sections for {}", identifier); switch (layoutParsingType) { case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument); - default -> { - sectionsBuilderService.buildSections(classificationDocument); - sectionsBuilderService.addImagesToSections(classificationDocument); - } + default -> tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument); } return classificationDocument; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index 4f3f339..68e2d95 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; @@ -28,4 +30,7 @@ public class ClassificationDocument { private long rulesVersion; + private OutlineObjectTree outlineObjectTree; + private TableOfContents tableOfContents; + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index a654636..ef97651 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -8,13 +8,13 @@ import java.util.Map; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; @Data @RequiredArgsConstructor @@ -23,6 +23,10 @@ public class ClassificationPage { @NonNull private List textBlocks; + private List outlineObjects = new ArrayList<>(); + + private List headlines = new ArrayList<>(); + private List images = new ArrayList<>(); private Rectangle bodyTextFrame; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java index 58fea4e..cc29901 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java @@ -12,6 +12,7 @@ import lombok.NoArgsConstructor; @Data @NoArgsConstructor +@Deprecated public class ClassificationSection { private List pageBlocks = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java index 1292138..f67127a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java @@ -31,6 +31,19 @@ public enum PageBlockType { } + public static int getHeadlineNumber(PageBlockType pageBlockType) { + + return switch (pageBlockType) { + case H1 -> 1; + case H2 -> 2; + case H3 -> 3; + case H4 -> 4; + case H5 -> 5; + default -> 6; + }; + } + + public boolean isHeadline() { return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java index 7b6f8c4..dbcb2ce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -8,6 +8,7 @@ import java.util.regex.Pattern; import lombok.AccessLevel; import lombok.AllArgsConstructor; +import lombok.Getter; import lombok.experimental.FieldDefaults; @AllArgsConstructor @@ -16,13 +17,15 @@ public class SectionIdentifier { static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); - private enum Format { + public enum Format { EMPTY, NUMERICAL, DOCUMENT } + @Getter Format format; + @Getter String identifierString; List identifiers; boolean asChild; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java index dff1f4b..a95ee58 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/DocumentTree.java @@ -140,8 +140,8 @@ public class DocumentTree { if (treeId.isEmpty()) { return root; } - Entry entry = root.children.get(treeId.get(0)); - for (int id : treeId.subList(1, treeId.size())) { + Entry entry = root; + for (int id : treeId) { entry = entry.children.get(id); } return entry; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/AbstractSemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/AbstractSemanticNode.java new file mode 100644 index 0000000..4afdb9a --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/AbstractSemanticNode.java @@ -0,0 +1,74 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; + +import java.awt.geom.Rectangle2D; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Data +@SuperBuilder +@AllArgsConstructor +@NoArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public abstract class AbstractSemanticNode implements GenericSemanticNode { + + @Builder.Default + Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); + List treeId; + + TextBlock textBlock; + @EqualsAndHashCode.Exclude + DocumentTree documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + @EqualsAndHashCode.Exclude + Map bBoxCache; + + + @Override + public TextBlock getTextBlock() { + + if (textBlock == null) { + textBlock = GenericSemanticNode.super.getTextBlock(); + } + return textBlock; + } + + + @Override + public String toString() { + + return treeId.toString() + ": " + getType() + ": " + this.getTextBlock().buildSummary(); + } + + + @Override + public Map getBBox() { + + if (bBoxCache == null) { + bBoxCache = GenericSemanticNode.super.getBBox(); + } + return bBoxCache; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index 9a9d9cc..77a1b8a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -3,43 +3,35 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import java.awt.geom.Rectangle2D; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; +import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor @NoArgsConstructor +@EqualsAndHashCode(callSuper = true) @FieldDefaults(level = AccessLevel.PRIVATE) -public class Document implements GenericSemanticNode { - - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); +public class Document extends AbstractSemanticNode { Set pages; - DocumentTree documentTree; Integer numberOfPages; - TextBlock textBlock; - @Builder.Default - Set entities = new HashSet<>(); LayoutparsingVisualizations visualizations; @@ -51,15 +43,6 @@ public class Document implements GenericSemanticNode { } - public TextBlock getTextBlock() { - - if (textBlock == null) { - textBlock = GenericSemanticNode.super.getTextBlock(); - } - return textBlock; - } - - public List
getMainSections() { return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node) @@ -81,6 +64,15 @@ public class Document implements GenericSemanticNode { } + @Override + public Headline getHeadline() { + + return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node) + .findFirst() + .orElse(Headline.builder().build()); + } + + public Stream streamTerminalTextBlocksInOrder() { return streamAllNodes().filter(SemanticNode::isLeaf) @@ -102,18 +94,9 @@ public class Document implements GenericSemanticNode { } - @Override - public Headline getHeadline() { - - return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node) - .findFirst() - .orElse(Headline.builder().build()); - } - - private Stream streamAllNodes() { - return documentTree.allEntriesInOrder() + return getDocumentTree().allEntriesInOrder() .map(DocumentTree.Entry::getNode); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java index 93c2427..7cf126a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java @@ -20,7 +20,8 @@ public class DuplicatedParagraph extends Paragraph { @Override public TextBlock getTextBlock() { - return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector()); + return Stream.of(super.getLeafTextBlock(), unsortedLeafTextBlock) + .collect(new TextBlockCollector()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java index 14485df..ed299d3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java @@ -1,48 +1,24 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; -import java.awt.geom.Rectangle2D; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor -@NoArgsConstructor +@EqualsAndHashCode(callSuper = true) @FieldDefaults(level = AccessLevel.PRIVATE) -public class Footer implements GenericSemanticNode { +public class Footer extends AbstractSemanticNode { - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); - - List treeId; TextBlock leafTextBlock; - @EqualsAndHashCode.Exclude - DocumentTree documentTree; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - @EqualsAndHashCode.Exclude - Map bBoxCache; - @Override public NodeType getType() { @@ -68,17 +44,7 @@ public class Footer implements GenericSemanticNode { @Override public String toString() { - return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary(); - } - - - @Override - public Map getBBox() { - - if (bBoxCache == null) { - bBoxCache = GenericSemanticNode.super.getBBox(); - } - return bBoxCache; + return getTreeId() + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java index 9285490..b648a8d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java @@ -1,47 +1,24 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; -import java.awt.geom.Rectangle2D; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor -@NoArgsConstructor +@EqualsAndHashCode(callSuper = true) @FieldDefaults(level = AccessLevel.PRIVATE) -public class Header implements GenericSemanticNode { +public class Header extends AbstractSemanticNode { - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); - List treeId; TextBlock leafTextBlock; - @EqualsAndHashCode.Exclude - DocumentTree documentTree; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - @EqualsAndHashCode.Exclude - Map bBoxCache; - @Override public boolean isLeaf() { @@ -67,17 +44,7 @@ public class Header implements GenericSemanticNode { @Override public String toString() { - return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary(); - } - - - @Override - public Map getBBox() { - - if (bBoxCache == null) { - bBoxCache = GenericSemanticNode.super.getBBox(); - } - return bBoxCache; + return getTreeId() + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java index b708c2b..00592f3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java @@ -1,47 +1,24 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; -import java.awt.geom.Rectangle2D; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor -@NoArgsConstructor +@EqualsAndHashCode(callSuper = true) @FieldDefaults(level = AccessLevel.PRIVATE) -public class Headline implements GenericSemanticNode { +public class Headline extends AbstractSemanticNode { - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); - List treeId; TextBlock leafTextBlock; - @EqualsAndHashCode.Exclude - DocumentTree documentTree; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - @EqualsAndHashCode.Exclude - Map bBoxCache; - @Override public NodeType getType() { @@ -67,7 +44,7 @@ public class Headline implements GenericSemanticNode { @Override public String toString() { - return treeId + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary(); + return getTreeId() + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary(); } @@ -77,14 +54,4 @@ public class Headline implements GenericSemanticNode { return this; } - - @Override - public Map getBBox() { - - if (bBoxCache == null) { - bBoxCache = GenericSemanticNode.super.getBBox(); - } - return bBoxCache; - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java index 0abb37d..0be8b09 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Image.java @@ -3,15 +3,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import java.awt.geom.Rectangle2D; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; @@ -21,18 +16,16 @@ import lombok.Data; import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor @NoArgsConstructor +@EqualsAndHashCode(callSuper = true) @FieldDefaults(level = AccessLevel.PRIVATE) -public class Image implements GenericSemanticNode { +public class Image extends AbstractSemanticNode { - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); - - List treeId; String id; ImageType imageType; @@ -53,13 +46,6 @@ public class Image implements GenericSemanticNode { @EqualsAndHashCode.Exclude Page page; - @EqualsAndHashCode.Exclude - DocumentTree documentTree; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - @Override public NodeType getType() { @@ -85,7 +71,7 @@ public class Image implements GenericSemanticNode { @Override public String toString() { - return treeId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position; + return getTreeId() + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java index dfcb4f9..70a5ce1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java @@ -1,20 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; -import java.awt.geom.Rectangle2D; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.FieldDefaults; @@ -23,25 +13,12 @@ import lombok.experimental.SuperBuilder; @Data @SuperBuilder @AllArgsConstructor +@EqualsAndHashCode(callSuper = true) @FieldDefaults(level = AccessLevel.PROTECTED) -public class Paragraph implements GenericSemanticNode { +public class Paragraph extends AbstractSemanticNode { - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); - - List treeId; TextBlock leafTextBlock; - @EqualsAndHashCode.Exclude - DocumentTree documentTree; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - @EqualsAndHashCode.Exclude - Map bBoxCache; - @Override public NodeType getType() { @@ -63,21 +40,4 @@ public class Paragraph implements GenericSemanticNode { return leafTextBlock; } - - @Override - public String toString() { - - return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary(); - } - - - @Override - public Map getBBox() { - - if (bBoxCache == null) { - bBoxCache = GenericSemanticNode.super.getBBox(); - } - return bBoxCache; - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index 3a59884..d86d682 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -1,47 +1,20 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; -import java.awt.geom.Rectangle2D; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.FieldDefaults; -import lombok.extern.slf4j.Slf4j; +import lombok.experimental.SuperBuilder; -@Slf4j @Data -@Builder +@SuperBuilder @AllArgsConstructor @FieldDefaults(level = AccessLevel.PRIVATE) -public class Section implements GenericSemanticNode { - - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); - List treeId; - - TextBlock textBlock; - @EqualsAndHashCode.Exclude - DocumentTree documentTree; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - @EqualsAndHashCode.Exclude - Map bBoxCache; - +@EqualsAndHashCode(callSuper = true) +public class Section extends AbstractSemanticNode { @Override public NodeType getType() { @@ -50,6 +23,14 @@ public class Section implements GenericSemanticNode { } + public Headline getHeadline() { + + return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node) + .findFirst() + .orElseGet(() -> getParent().getHeadline()); + } + + public boolean hasTables() { return streamAllSubNodesOfType(NodeType.TABLE).findAny() @@ -57,39 +38,10 @@ public class Section implements GenericSemanticNode { } - @Override - public TextBlock getTextBlock() { - - if (textBlock == null) { - textBlock = GenericSemanticNode.super.getTextBlock(); - } - return textBlock; - } - - @Override public String toString() { - return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary(); - } - - - public Headline getHeadline() { - - return streamChildrenOfType(NodeType.HEADLINE)// - .map(node -> (Headline) node)// - .findFirst()// - .orElseGet(() -> getParent().getHeadline()); - } - - - @Override - public Map getBBox() { - - if (bBoxCache == null) { - bBoxCache = GenericSemanticNode.super.getBBox(); - } - return bBoxCache; + return getTreeId() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java new file mode 100644 index 0000000..75bb270 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SuperSection.java @@ -0,0 +1,40 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; + +@Data +@SuperBuilder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +@EqualsAndHashCode(callSuper = true) +public class SuperSection extends AbstractSemanticNode { + + @Override + public NodeType getType() { + + return NodeType.SUPER_SECTION; + } + + + public Headline getHeadline() { + + return streamChildrenOfType(NodeType.HEADLINE).map(node -> (Headline) node) + .findFirst() + .orElseGet(() -> getParent().getHeadline()); + } + + + @Override + public String toString() { + + return getTreeId() + ": " + NodeType.SUPER_SECTION + ": " + this.getTextBlock().buildSummary(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java index e94da17..3049460 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java @@ -2,34 +2,26 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import java.awt.geom.Rectangle2D; import java.util.HashMap; -import java.util.HashSet; -import java.util.List; import java.util.Map; -import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import lombok.AccessLevel; import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor +@EqualsAndHashCode(callSuper = true) @FieldDefaults(level = AccessLevel.PRIVATE) -public class TableCell implements GenericSemanticNode { +public class TableCell extends AbstractSemanticNode { - @Builder.Default - Set engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM)); - List treeId; int row; int col; boolean header; @@ -40,13 +32,6 @@ public class TableCell implements GenericSemanticNode { TextBlock textBlock; - @EqualsAndHashCode.Exclude - DocumentTree documentTree; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - @Override public Map getBBox() { @@ -96,7 +81,7 @@ public class TableCell implements GenericSemanticNode { @Override public String toString() { - return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary(); + return getTreeId() + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java index 10ce939..d48170b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/textblock/ConcatenatedTextBlock.java @@ -50,14 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock { public ConcatenatedTextBlock concat(TextBlock textBlock) { + int start = textBlock.getBoundary().start(); + int end = textBlock.getBoundary().end(); if (this.atomicTextBlocks.isEmpty()) { - boundary.setStart(textBlock.getBoundary().start()); - boundary.setEnd(textBlock.getBoundary().end()); - } else if (boundary.end() != textBlock.getBoundary().start()) { + boundary.setStart(start); + boundary.setEnd(end); + } else if (boundary.end() != start) { throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary())); } this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks()); - boundary.setEnd(textBlock.getBoundary().end()); + boundary.setEnd(end); this.searchText = null; return this; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java new file mode 100644 index 0000000..3cc94ce --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java @@ -0,0 +1,209 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.awt.geom.Point2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.Optional; + +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.interactive.action.PDAction; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; +import org.springframework.stereotype.Service; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Service +@Slf4j +public class OutlineExtractorService { + + private static final String PDDESTINATION_TYPE_FIT = "Fit"; + private static final String PDDESTINATION_TYPE_FIT_B = "FitB"; + private static final String PDDESTINATION_TYPE_FIT_H = "FitH"; + private static final String PDDESTINATION_TYPE_FIT_V = "FitV"; + private static final String PDDESTINATION_TYPE_FIT_R = "FitR"; + private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH"; + private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV"; + private static final String PDDESTINATION_TYPE_XYZ = "XYZ"; + + + @SneakyThrows + public OutlineObjectTree getOutlineObjectTree(PDDocument document) { + + PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline(); + + List rootNodes = new ArrayList<>(); + if (documentOutline != null) { + for (PDOutlineItem child : documentOutline.children()) { + Optional outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1); + outlineObjectWithChildren.ifPresent(rootNodes::add); + } + } + + return new OutlineObjectTree(rootNodes); + } + + + @SneakyThrows + private Optional createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) { + + Optional outlineObject = createOutlineObject(item, document, depth); + if (outlineObject.isPresent()) { + for (var child : item.children()) { + Optional outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1); + outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode)); + } + } + + return outlineObject; + } + + + // if the structure elements are processed beforehand, another case can be handled here as well: + // outline objects can reference structure elements (see pdf documentation) + @SneakyThrows + private Optional createOutlineObject(PDOutlineItem item, PDDocument document, int depth) { + + String title = item.getTitle(); + + PDPage page = item.findDestinationPage(document); + if (page == null) { + return Optional.empty(); + } + int pageNumber = document.getPages().indexOf(page); + + Optional outlinePosition = Optional.empty(); + + try { + PDDocumentNameDictionary names = document.getDocumentCatalog().getNames(); + PDDestinationNameTreeNode destinations = null; + if (names != null) { + destinations = names.getDests(); + } + + PDDestination destination = item.getDestination(); + if (destination != null) { + outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject()); + } + + if (outlinePosition.isEmpty()) { + + PDAction action = item.getAction(); + if (action != null) { + outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject()); + } + + } + + } catch (Exception e) { + log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title)); + } + + return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth))); + + } + + + @SneakyThrows + private static Optional extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) { + + if (isGoToAction(cosDictionary)) { + COSBase cosBase = cosDictionary.getItem(COSName.D); + return getLocationFromCOSBase(destinations, cosBase); + } + + return Optional.empty(); + } + + + private static Optional getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException { + + if (cosBase != null) { + if (cosBase instanceof COSArray cosArray) { + return getLocationFromCosArray(cosArray); + } + + if (cosBase instanceof COSString cosString) { + String destinationName = cosString.getString(); + COSArray cosArray = destinations.getValue(destinationName).getCOSObject(); + return getLocationFromCosArray(cosArray); + } + + } + return Optional.empty(); + } + + + private static Optional getLocationFromCosArray(COSArray cosArray) { + + boolean located = false; + float x = 0; + float y = 0; + + try { + + PDDestination destination = PDDestination.create(cosArray); + COSName type = (COSName) cosArray.getObject(1); + String typeString = type.getName(); + + switch (typeString) { + case PDDESTINATION_TYPE_FIT_V: + case PDDESTINATION_TYPE_FIT_BV: + PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination; + x = fitHeightDestination.getLeft(); + located = true; + break; + case PDDESTINATION_TYPE_FIT_R: + PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination; + x = fitRectangleDestination.getLeft(); + y = fitRectangleDestination.getTop(); + located = true; + break; + case PDDESTINATION_TYPE_FIT_H: + case PDDESTINATION_TYPE_FIT_BH: + PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination; + y = fitWidthDestination.getTop(); + located = true; + break; + case PDDESTINATION_TYPE_XYZ: + PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination; + x = xyzDestination.getLeft(); + y = xyzDestination.getTop(); + located = true; + break; + case PDDESTINATION_TYPE_FIT: + case PDDESTINATION_TYPE_FIT_B: + default: + } + + } catch (IOException e) { + throw new RuntimeException(e); + } + + return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty(); + + } + + + private static boolean isGoToAction(COSDictionary cosDictionary) { + + return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto"); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java new file mode 100644 index 0000000..6f8af6b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java @@ -0,0 +1,35 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.awt.geom.Point2D; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.RequiredArgsConstructor; + +@Data +@RequiredArgsConstructor +@AllArgsConstructor +public class OutlineObject { + + private final String title; + private final int pageNumber; + private Point2D point; + private final int treeDepth; + + private boolean found; + + + public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) { + + this(title, pageNumber, depth); + this.point = point2D; + } + + + @Override + public String toString() { + + return "OutlineObject{" + "title='" + title + '\'' + '}'; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java new file mode 100644 index 0000000..61b0dd8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java @@ -0,0 +1,42 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import lombok.Data; +import lombok.RequiredArgsConstructor; + +@Data +@RequiredArgsConstructor +public class OutlineObjectTree { + + private List rootNodes = new ArrayList<>(); + + private Map> outlineObjectsPerPage = new HashMap<>(); + + + public OutlineObjectTree(List rootNodes) { + + this.rootNodes = rootNodes; + flattenNodesAndGroupByPage(rootNodes); + } + + + private void flattenNodesAndGroupByPage(List outlineObjectTreeNodes) { + + for (OutlineObjectTreeNode node : outlineObjectTreeNodes) { + int pageNumber = node.getOutlineObject().getPageNumber(); + if (!this.outlineObjectsPerPage.containsKey(pageNumber)) { + outlineObjectsPerPage.put(pageNumber, new ArrayList<>()); + } + outlineObjectsPerPage.get(pageNumber).add(node.getOutlineObject()); + + if (!node.getChildren().isEmpty()) { + flattenNodesAndGroupByPage(node.getChildren()); + } + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTreeNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTreeNode.java new file mode 100644 index 0000000..f5cfd49 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTreeNode.java @@ -0,0 +1,34 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.List; + +import lombok.Data; + +@Data +public class OutlineObjectTreeNode { + + private OutlineObject outlineObject; + + private List children = new ArrayList<>(); + + + public OutlineObjectTreeNode(OutlineObject outlineObject) { + + this.outlineObject = outlineObject; + } + + + public void addChild(OutlineObjectTreeNode outlineObject) { + + children.add(outlineObject); + } + + + @Override + public String toString() { + + return "OutlineObjectTreeNode{" + "outlineObject=" + outlineObject + '}'; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java new file mode 100644 index 0000000..d1c8e74 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -0,0 +1,61 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.extern.slf4j.Slf4j; + +@Service +@Slf4j +public class OutlineValidationService { + + public TableOfContents createToC(List headlines) { + + List mainSections = new ArrayList<>(); + Map lastItemsPerDepth = new HashMap<>(); + TableOfContentItem last = null; + TreeSet depths = new TreeSet<>(); + + for (TextPageBlock current : headlines) { + int currentDepth = getHeadlineNumber(current.getClassification()); + Integer parentDepth = depths.floor(currentDepth - 1); + + var tocItem = new TableOfContentItem(current); + + if (parentDepth == null) { + mainSections.add(tocItem); + lastItemsPerDepth = new HashMap<>(); + depths = new TreeSet<>(); + + } else { + assert last != null; + int lastDepth = getHeadlineNumber(last.getHeadline().getClassification()); + + if (lastDepth < parentDepth) { + parentDepth = lastDepth; + } else if (lastDepth == currentDepth && last.getParent() != null) { + parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification()); + } + + TableOfContentItem parent = lastItemsPerDepth.get(parentDepth); + parent.addChild(tocItem); + } + + last = tocItem; + lastItemsPerDepth.put(currentDepth, tocItem); + depths.add(currentDepth); + } + + return new TableOfContents(mainSections); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java new file mode 100644 index 0000000..d5526f6 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TOCEnrichmentService.java @@ -0,0 +1,261 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +public class TOCEnrichmentService { + + public void assignSectionBlocksAndImages(ClassificationDocument document) { + + TableOfContents toc = document.getTableOfContents(); + Iterator iterator = toc.iterator(); + TableOfContentItem currentTOCItem = null; + if(iterator.hasNext()) { + currentTOCItem = iterator.next(); + } + List startBlocks = new ArrayList<>(); + List startImages = new ArrayList<>(); + TableOfContentItem currentSection = null; + boolean foundFirstHeadline = false; + + List headers = new ArrayList<>(); + List footers = new ArrayList<>(); + TablePageBlock previousTable = null; + List lastFoundTOCItems = new ArrayList<>(); + + for (ClassificationPage page : document.getPages()) { + List currentPageTOCItems = new ArrayList<>(); + List header = new ArrayList<>(); + List footer = new ArrayList<>(); + for (AbstractPageBlock current : page.getTextBlocks()) { + + if (current.getClassification() == null) { + continue; + } + + current.setPage(page.getPageNumber()); + + if (current.getClassification().equals(PageBlockType.HEADER)) { + header.add((TextPageBlock) current); + continue; + } + + if (current.getClassification().equals(PageBlockType.FOOTER)) { + footer.add((TextPageBlock) current); + continue; + } + + if (current instanceof TablePageBlock table) { + if (previousTable != null) { + mergeTableMetadata(table, previousTable); + } + previousTable = table; + } + + if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) { + if (!foundFirstHeadline) { + foundFirstHeadline = true; + } + currentSection = currentTOCItem; + currentTOCItem.getSectionBlocks().add(current); + currentPageTOCItems.add(currentTOCItem); + + if(iterator.hasNext()) { + currentTOCItem = iterator.next(); + } + } + + if (!foundFirstHeadline) { + startBlocks.add(current); + } else { + currentSection.getSectionBlocks().add(current); + } + } + + if (!currentPageTOCItems.isEmpty()) { + lastFoundTOCItems = currentPageTOCItems; + } + + for (ClassifiedImage image : page.getImages()) { + + Double xMin = null; + Double yMin = null; + Double xMax = null; + Double yMax = null; + + for (TableOfContentItem tocItem : lastFoundTOCItems) { + var headline = tocItem.getHeadline(); + + if (headline.getPage() != page.getPageNumber()) { + continue; + } + + if (headline.getMinX() < headline.getMaxX()) { + if (xMin == null || headline.getMinX() < xMin) { + xMin = headline.getMinX(); + } + if (xMax == null || headline.getMaxX() > xMax) { + xMax = headline.getMaxX(); + } + } else { + if (xMin == null || headline.getMaxX() < xMin) { + xMin = headline.getMaxX(); + } + if (xMax == null || headline.getMinX() > xMax) { + xMax = headline.getMinX(); + } + } + + if (headline.getMinY() < headline.getMaxY()) { + if (yMin == null || headline.getMinY() < yMin) { + yMin = headline.getMinY(); + } + if (yMax == null || headline.getMaxY() > yMax) { + yMax = headline.getMaxY(); + } + } else { + if (yMin == null || headline.getMaxY() < yMin) { + yMin = headline.getMaxY(); + } + if (yMax == null || headline.getMinY() > yMax) { + yMax = headline.getMinY(); + } + } + + log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY()); + log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax); + + if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) { + tocItem.getImages().add(image); + image.setAppendedToSection(true); + break; + } + } + if (!image.isAppendedToSection()) { + log.debug("Image uses first paragraph"); + if (!lastFoundTOCItems.isEmpty()) { + lastFoundTOCItems.get(0).getImages().add(image); + } else { + startImages.add(image); + } + image.setAppendedToSection(true); + } + } + + if (!header.isEmpty()) { + headers.add(new ClassificationHeader(header)); + } + if (!footer.isEmpty()) { + footers.add(new ClassificationFooter(footer)); + } + } + + if (!startBlocks.isEmpty()) { + TableOfContentItem unassigned = new TableOfContentItem(null); + unassigned.setSectionBlocks(startBlocks); + unassigned.setImages(startImages); + document.getTableOfContents().getMainSections().add(0, unassigned); + } + document.setHeaders(headers); + document.setFooters(footers); + } + + + private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) { + + // Distribute header information for subsequent tables + if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) { + List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); + List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); + // Allow merging of tables if header row is separated from first logical non-header row + if (previousTableNonHeaderRow.isEmpty() + && previousTable.getRowCount() == 1 + && previousTable.getRows() + .get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows() + .get(0) + .stream() + .map(cell -> { + Cell fakeCell = Cell.copy(cell); + fakeCell.setHeaderCells(Collections.singletonList(cell)); + return fakeCell; + }) + .toList(); + } + if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { + for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = currentTable.getRows() + .get(i); + if (row.size() == tableNonHeaderRow.size() && row.stream() + .allMatch(cell -> cell.getHeaderCells().isEmpty())) { + for (int j = 0; j < row.size(); j++) { + row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); + } + } + } + } + } + } + + + private boolean hasValidHeaderInformation(TablePageBlock table) { + + return !hasInvalidHeaderInformation(table); + } + + + private boolean hasInvalidHeaderInformation(TablePageBlock table) { + + return table.getRows() + .stream() + .flatMap(row -> row.stream() + .filter(cell -> !cell.getHeaderCells().isEmpty())) + .findAny().isEmpty(); + + } + + + private List getRowWithNonHeaderCells(TablePageBlock table) { + + for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table + List row = table.getRows() + .get(i); + if (row.size() == 1) { + continue; + } + boolean allNonHeader = true; + for (Cell cell : row) { + if (cell.isHeaderCell()) { + allNonHeader = false; + break; + } + } + if (allNonHeader) { + return row; + } + } + + return Collections.emptyList(); + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java new file mode 100644 index 0000000..0231c16 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java @@ -0,0 +1,109 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.Data; +import lombok.EqualsAndHashCode; + +@Data +@EqualsAndHashCode(onlyExplicitlyIncluded = true) +public class TableOfContentItem { + + @EqualsAndHashCode.Include + private TextPageBlock headline; + private List children = new ArrayList<>(); + private TableOfContentItem parent; + + private List sectionBlocks = new ArrayList<>(); + private List images = new ArrayList<>(); + + private AbstractSemanticNode section; + + + public TableOfContentItem(TextPageBlock headline) { + + this.headline = headline; + } + + + public void addChild(TableOfContentItem tableOfContentItem) { + + children.add(tableOfContentItem); + tableOfContentItem.setParent(this); + } + + + public TableOfContentItem getSiblingBefore() { + + if (parent != null) { + int index = parent.getChildren().indexOf(this); + if (index > 0) { + return parent.getChildren() + .get(index - 1); + } + } + return null; + } + + + public TableOfContentItem getSiblingAfter() { + + if (parent != null) { + int index = parent.getChildren().indexOf(this); + if (index >= 0 && index < parent.getChildren().size() - 1) { + return parent.getChildren() + .get(index + 1); + } + } + return null; + } + + + public boolean contains(TextPageBlock block) { + + if (headline.equals(block)) { + return true; + } + for (TableOfContentItem child : children) { + if (child.contains(block)) { + return true; + } + } + return false; + } + + + public boolean contains(TableOfContentItem tocItem) { + + if (this.equals(tocItem)) { + return true; + } + for (TableOfContentItem child : children) { + if (child.contains(tocItem)) { + return true; + } + } + return false; + } + + public List getNonEmptySectionBlocks() { + + return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList()); + } + + @Override + public String toString() { + + return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}'; + } + + + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java new file mode 100644 index 0000000..8d80cd3 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java @@ -0,0 +1,136 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Stack; + +import org.springframework.lang.NonNull; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.Data; +import lombok.RequiredArgsConstructor; + +@Data +@RequiredArgsConstructor +public class TableOfContents implements Iterable { + + private List mainSections = new ArrayList<>(); + + + public TableOfContents(List mainSections) { + + this.mainSections = mainSections; + } + + + public List getAllTextPageBlocks() { + + List allTextPageBlocks = new ArrayList<>(); + for (TableOfContentItem item : mainSections) { + collectTextPageBlocks(item, allTextPageBlocks); + } + return allTextPageBlocks; + } + + + private void collectTextPageBlocks(TableOfContentItem item, List textPageBlocks) { + + textPageBlocks.add(item.getHeadline()); + for (TableOfContentItem child : item.getChildren()) { + collectTextPageBlocks(child, textPageBlocks); + } + } + + + public List getAllTableOfContentItems() { + + List allItems = new ArrayList<>(); + for (TableOfContentItem item : mainSections) { + collectTableOfContentItems(item, allItems); + } + return allItems; + } + + + private void collectTableOfContentItems(TableOfContentItem item, List allItems) { + + allItems.add(item); + for (TableOfContentItem child : item.getChildren()) { + collectTableOfContentItems(child, allItems); + } + } + + + private boolean containsBlock(TextPageBlock block) { + + for (TableOfContentItem existingItem : this.getMainSections()) { + if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) { + return true; + } + } + return false; + } + + + private boolean containsItem(TableOfContentItem tocItem) { + + for (TableOfContentItem existingItem : this.getMainSections()) { + if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) { + return true; + } + } + return false; + } + + + @Override + public @NonNull Iterator iterator() { + + return new TableOfContentItemIterator(mainSections); + } + + + private static class TableOfContentItemIterator implements Iterator { + + private final Stack> stack = new Stack<>(); + + + TableOfContentItemIterator(List mainSections) { + + stack.push(mainSections.iterator()); + } + + + @Override + public boolean hasNext() { + + ensureStackTopIsCurrent(); + return !stack.isEmpty() && stack.peek().hasNext(); + } + + + @Override + public TableOfContentItem next() { + + ensureStackTopIsCurrent(); + TableOfContentItem currentItem = stack.peek().next(); + if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) { + stack.push(currentItem.getChildren() + .iterator()); + } + return currentItem; + } + + + private void ensureStackTopIsCurrent() { + + while (!stack.isEmpty() && !stack.peek().hasNext()) { + stack.pop(); + } + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index b7be4e1..c7c2ae6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -58,6 +58,20 @@ public class TextPageBlock extends AbstractPageBlock { } + @JsonIgnore + public float getPageHeight() { + + return sequences.get(0).getPageHeight(); + } + + + @JsonIgnore + public float getPageWidth() { + + return sequences.get(0).getPageWidth(); + } + + private void calculateBBox() { if (sequences == null) { @@ -69,6 +83,12 @@ public class TextPageBlock extends AbstractPageBlock { } + public void recalculateBBox() { + + calculateBBox(); + } + + public static TextPageBlock merge(List textBlocksToMerge) { if (textBlocksToMerge.isEmpty()) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index 8f22568..7b2ada4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -27,8 +27,10 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @Service +@Deprecated public class SectionsBuilderService { + public void buildSections(ClassificationDocument document) { List chunkWords = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java new file mode 100644 index 0000000..a295711 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -0,0 +1,525 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock; + +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.List; +import java.util.ListIterator; +import java.util.Locale; +import java.util.function.Function; + +import org.apache.commons.lang3.StringUtils; +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.Data; + +@Service +public class BlockificationPostprocessingService { + + private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f; + + private static final Function blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences() + .stream() + .map(textPositionSequence -> textPositionSequence.getTextPositions() + .stream() + .map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence)) + .collect(RectangleTransformations.collectBBox())) + .collect(RectangleTransformations.collectBBox()); + + + public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { + + List outlineObjects = classificationPage.getOutlineObjects(); + + if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) { + return null; + } + + float pageHeight = classificationPage.getPageHeight(); + + ListIterator outlineObjectListIterator = outlineObjects.listIterator(); + + if (notFoundOutlineObject != null) { + OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext); + + OutlineObject firstOutlineObject = null; + OutlineProcessionContext firstOutlineObjectProcessionContext = null; + if (outlineObjectListIterator.hasNext()) { + firstOutlineObject = outlineObjectListIterator.next(); + firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); + } + + if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) { + notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext)); + } + if (firstOutlineObject != null) { + // re-create the context for the updated blocks + firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); + firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext)); + } + + } + + outlineObjectListIterator.forEachRemaining(outlineObject -> { + OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject); + processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext); + outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext)); + }); + + if (!outlineObjects.isEmpty()) { + return outlineObjects.get(outlineObjects.size() - 1); + } else { + return notFoundOutlineObject; + } + } + + + private static List getTextPageBlocks(ClassificationPage classificationPage) { + + return classificationPage.getTextBlocks() + .stream() + .filter(block -> block instanceof TextPageBlock) + .map(block -> (TextPageBlock) block) + .toList(); + } + + + private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) { + + if (firstOutlineObjectProcessionContext == null) { + return false; + } + + String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle(); + String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle(); + + if (!firstTitle.startsWith(notFoundTitle)) { + return false; + } + + var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext); + var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext); + + double maxYFirst = blocksOfFirstOutline.stream() + .mapToDouble(TextPageBlock::getPdfMaxY) + .max() + .orElse(Double.NEGATIVE_INFINITY); + + return blocksOfNotFoundOutline.stream() + .mapToDouble(TextPageBlock::getPdfMaxY) + .anyMatch(y -> y >= maxYFirst); + } + + + private List getAllMatchingBlocks(OutlineProcessionContext context) { + + List blocks = new ArrayList<>(); + if (context.getDirectMatch() != null) { + blocks.add(context.getDirectMatch()); + } + if (context.getSplitCandidate() != null) { + blocks.add(context.getSplitCandidate()); + } + blocks.addAll(context.getMergeCandidates()); + return blocks; + } + + + private void processTextBlocks(List textBlocks, float pageHeight, OutlineProcessionContext context) { + + OutlineObject outlineObject = context.getOutlineObject(); + ListIterator iterator = textBlocks.listIterator(); + while (iterator.hasNext()) { + TextPageBlock pageBlock = iterator.next(); + if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) { + break; + } + } + if (iterator.hasPrevious()) { + iterator.previous(); + } + boolean earlyStop = false; + while (iterator.hasNext() && !earlyStop) { + TextPageBlock pageBlock = iterator.next(); + earlyStop = processOutlineForTextBlock(pageBlock, context); + } + } + + + private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { + + OutlineObject outlineObject = context.outlineObject; + TextPageBlock directMatch = context.directMatch; + List mergeCandidates = context.mergeCandidates; + TextPageBlock splitCandidate = context.splitCandidate; + PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth()); + + double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE; + double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE; + + double distanceToBestMergeCandidates = Double.MAX_VALUE; + List bestMergeCandidateCombination = new ArrayList<>(); + if (!mergeCandidates.isEmpty()) { + + // with this code adjacent blocks to the first and last merge candidate get added, this could be useful for some edge cases: + //List allMergeCandidates = new ArrayList<>(mergeCandidates); + //addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates); + //if (mergeCandidates.size() > 1) { + // addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates); + //} + //allMergeCandidates = allMergeCandidates.stream() + // .distinct() + // .toList(); + + List> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates); + + for (List combination : combinations) { + double averageDistance = combination.stream() + .map(block -> calculateDistance(outlineObject, block)) + .mapToDouble(Double::doubleValue).average() + .orElse(Double.MAX_VALUE); + if (distanceToBestMergeCandidates > averageDistance) { + distanceToBestMergeCandidates = averageDistance; + bestMergeCandidateCombination = combination; + } + } + } + + double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates)); + + if (minDistance == Double.MAX_VALUE) { + return false; + } + if (minDistance == distanceToDirectMatch) { + directMatch.setClassification(headlineType); + } else if (minDistance == distanceToSplitCandidate) { + SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle()); + if (splitBlockResult.modifiedBlockToSplit) { + splitCandidate.setClassification(headlineType); + } + splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null)); + } else { + var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); + merged.setClassification(headlineType); + } + return true; + } + + + private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) { + + List otherBlocks = new ArrayList<>(); + int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit); + + String headline = title; + if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) { + headline = sectionIdentifier + headline; + } + + WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline); + if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) { + wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title); + } + + boolean modifiedBlockToSplit = false; + if (!wordSequenceResult.inSequence.isEmpty()) { + blockToSplit.setSequences(wordSequenceResult.inSequence); + blockToSplit.recalculateBBox(); + modifiedBlockToSplit = true; + } + + if (!wordSequenceResult.preSequence.isEmpty()) { + TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0); + classificationPage.getTextBlocks().add(blockToSplitIdx, block); + otherBlocks.add(block); + blockToSplitIdx++; + } + if (!wordSequenceResult.postSequence.isEmpty()) { + TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0); + classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block); + otherBlocks.add(block); + } + + return new SplitBlockResult(modifiedBlockToSplit, otherBlocks); + } + + + private static WordSequenceResult findWordSequence(List textPositionSequences, String text) { + + String target = sanitizeString(text); + List inSequence = new ArrayList<>(); + List preSequence = new ArrayList<>(); + List postSequence = new ArrayList<>(); + StringBuilder currentSequence = new StringBuilder(); + + for (TextPositionSequence sequence : textPositionSequences) { + + currentSequence.append(sanitizeString(sequence.toString())); + inSequence.add(sequence); + + if (currentSequence.length() >= target.length()) { + + if (currentSequence.toString().endsWith(target)) { + + int index = 0; + String toRemove = currentSequence.substring(0, currentSequence.length() - target.length()); + + TextPositionSequence next = inSequence.get(index); + while (currentSequence.length() - next.length() >= target.length()) { + + TextPositionSequence removed = inSequence.remove(index); + currentSequence.delete(0, removed.toString().length()); + preSequence.add(removed); + + next = inSequence.get(index); + toRemove = toRemove.substring(removed.length()); + } + + if (!toRemove.isEmpty()) { + SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove); + + currentSequence.delete(0, splitSequenceResult.out.length()); + preSequence.add(splitSequenceResult.out); + inSequence.add(index, splitSequenceResult.in); + } + + } else if (currentSequence.toString().startsWith(target)) { + + int index = inSequence.size() - 1; + String toRemove = currentSequence.substring(target.length()); + + SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove); + currentSequence.delete(currentSequence.length() - splitSequenceResult.out.length(), currentSequence.length()); + + inSequence.add(index, splitSequenceResult.in); + postSequence.add(splitSequenceResult.out); + } + + if (currentSequence.toString().equals(target)) { + postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size())); + return new WordSequenceResult(inSequence, preSequence, postSequence); + } + } + } + + return new WordSequenceResult(); + } + + + private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) { + + TextPositionSequence in = null; + TextPositionSequence out; + + String currentSequence = sequence.toString(); + int index = currentSequence.indexOf(toRemove); + int endIndex = index + toRemove.length(); + + out = createSubSequence(sequence, index, endIndex); + + if (index > 0) { + in = createSubSequence(sequence, 0, index); + } else if (endIndex < sequence.getTextPositions().size()) { + in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size()); + } + + return new SplitSequenceResult(in, out); + } + + + private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) { + + TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); + newSeq.setParagraphStart(sequence.isParagraphStart()); + return newSeq; + } + + + private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List blocksToMerge) { + + TextPageBlock firstBlock = blocksToMerge.get(0); + + if (blocksToMerge.size() > 1) { + + List mergedBlocks = new ArrayList<>(); + for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) { + + if (firstBlock != null && !firstBlock.getSequences().isEmpty()) { + + if (textPageBlock.getDir() == firstBlock.getDir()) { + firstBlock.getSequences().addAll(textPageBlock.getSequences()); + mergedBlocks.add(textPageBlock); + } + } + } + + assert firstBlock != null; + firstBlock.setToDuplicate(false); + firstBlock.recalculateBBox(); + classificationPage.getTextBlocks().removeAll(mergedBlocks); + } + + return firstBlock; + } + + + private static List> findCombinations(String title, List blocks) { + + List> combinations = new ArrayList<>(); + findCombinations(title, blocks, new ArrayList<>(), combinations); + return combinations; + } + + + private static void findCombinations(String title, List blocks, List current, List> combinations) { + + String target = sanitizeString(title); + if (target.isEmpty()) { + combinations.add(new ArrayList<>(current)); + return; + } + + List remaining = blocks.stream() + .filter(block -> !current.contains(block)) + .toList(); + for (TextPageBlock block : remaining) { + String prefix = sanitizeString(block.getText()); + if (target.startsWith(prefix)) { + current.add(block); + findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations); + current.remove(current.size() - 1); + } + } + } + + + private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) { + + double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX(); + double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY(); + return Math.sqrt(deltaX * deltaX + deltaY * deltaY); + } + + + // currently only three cases are handled here: + // 1. equality + // 2. outline title contains block text + // 3. block text contains outline title + // another possible case is an intersection, meaning a title is split up between two different blocks + // this should not happen with how docstrum creates the blocks + // if it is indeed necessary, a splitting has to be done with a follow-up merge + private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) { + + OutlineObject outlineObject = context.getOutlineObject(); + String blockText = sanitizeString(pageBlock.getText()); + String outlineTitle = sanitizeString(outlineObject.getTitle()); + + boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle); + boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText); + + if (!blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) { + return false; + } + + if (blockText.equals(outlineTitle) && context.directMatch == null) { + context.directMatch = pageBlock; + return true; + } + + if (outlineTitleContainsBlockText) { + context.mergeCandidates.add(pageBlock); + } + + if (blockTextContainsOutlineTitle) { + SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText); + + if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) { + + if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) { + context.directMatch = pageBlock; + return true; + } else if (context.splitCandidate == null) { + context.sectionIdentifier = sectionIdentifier; + } + } + if (context.splitCandidate == null) { + context.splitCandidate = pageBlock; + } + } + return false; + } + + + private static String sanitizeString(String text) { + + return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT); + } + + + @Data + private static class OutlineProcessionContext { + + private TextPageBlock directMatch; + private OutlineObject outlineObject; + private List mergeCandidates; + private TextPageBlock splitCandidate; + private SectionIdentifier sectionIdentifier; + + + OutlineProcessionContext(OutlineObject outlineObject) { + + this.outlineObject = outlineObject; + this.directMatch = null; + this.mergeCandidates = new ArrayList<>(); + this.splitCandidate = null; + this.sectionIdentifier = SectionIdentifier.empty(); + } + + } + + public static class WordSequenceResult { + + public List inSequence; + public List preSequence; + public List postSequence; + + + public WordSequenceResult(List inSequence, List preSequence, List postSequence) { + + this.inSequence = inSequence; + this.preSequence = preSequence; + this.postSequence = postSequence; + } + + + public WordSequenceResult() { + + this.inSequence = new ArrayList<>(); + this.preSequence = new ArrayList<>(); + this.postSequence = new ArrayList<>(); + } + + } + + public record SplitBlockResult(boolean modifiedBlockToSplit, List otherBlocks) { + + } + + public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) { + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 3b35f12..71ff68e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -126,6 +126,16 @@ public class DocstrumBlockificationService { continue; } + if (current.isHeadline() || previous.isHeadline()) { + if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) { + previous = combineBlocksAndResetIterator(previous, current, itty, false); + } else { + previous = current; + } + + continue; + } + if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { previous = combineBlocksAndResetIterator(previous, current, itty, true); continue; @@ -172,6 +182,12 @@ public class DocstrumBlockificationService { } + private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { + + return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1); + } + + private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // @@ -185,6 +201,9 @@ public class DocstrumBlockificationService { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); previous.setToDuplicate(toDuplicate); + if (current.getClassification() != null && previous.getClassification() == null) { + previous.setClassification(current.getClassification()); + } itty.remove(); itty.previous(); itty.set(previous); @@ -244,21 +263,30 @@ public class DocstrumBlockificationService { continue; } + if (block.getClassification() != null && block.getClassification().isHeadline()) { + continue; + } + TextPageBlock current = (TextPageBlock) block; for (int i = 0; i < blocks.size(); i++) { - if (blocks.get(i) == null) { + AbstractPageBlock abstractPageBlock = blocks.get(i); + if (abstractPageBlock == null) { continue; } - if (blocks.get(i) == current) { + if (abstractPageBlock == current) { continue; } - if (blocks.get(i) instanceof TablePageBlock) { + if (abstractPageBlock instanceof TablePageBlock) { continue; } - TextPageBlock inner = (TextPageBlock) blocks.get(i); + if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) { + continue; + } + + TextPageBlock inner = (TextPageBlock) abstractPageBlock; if (usedRulings.lineBetween(current, blocks.get(i))) { continue; @@ -285,7 +313,7 @@ public class DocstrumBlockificationService { } - private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + public static TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { return new TextPageBlock(wordBlockList); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java index e3520c7..b0622e3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClarifyndClassificationService.java @@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class ClarifyndClassificationService { + private final HeadlineClassificationService headlineClassificationService; + public void classifyDocument(ClassificationDocument document) { List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + headlineClassificationService.resetContext(); + for (ClassificationPage page : document.getPages()) { classifyPage(page, document, headlineFontSizes); } @@ -47,6 +51,10 @@ public class ClarifyndClassificationService { var bodyTextFrame = page.getBodyTextFrame(); + if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + headlineClassificationService.setLastHeadlineFromOutline(textBlock); + return; + } if (document.getFontSizeCounter().getMostPopular() == null) { textBlock.setClassification(PageBlockType.PARAGRAPH); return; @@ -79,7 +87,8 @@ public class ClarifyndClassificationService { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { - textBlock.setClassification(PageBlockType.getHeadlineType(i)); + PageBlockType headlineType = PageBlockType.getHeadlineType(i); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } } @@ -89,7 +98,8 @@ public class ClarifyndClassificationService { .getTextPositions() .get(0) .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { - textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); + PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 7f7d147..97a8c13 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -24,6 +24,7 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class DocuMineClassificationService { + private final HeadlineClassificationService headlineClassificationService; private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); @@ -35,6 +36,8 @@ public class DocuMineClassificationService { log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + headlineClassificationService.resetContext(); + for (ClassificationPage page : document.getPages()) { classifyPage(page, document, headlineFontSizes); } @@ -60,6 +63,10 @@ public class DocuMineClassificationService { Matcher matcher2 = pattern2.matcher(textBlock.toString()); Matcher matcher3 = pattern3.matcher(textBlock.toString()); + if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + headlineClassificationService.setLastHeadlineFromOutline(textBlock); + return; + } if (document.getFontSizeCounter().getMostPopular() == null) { textBlock.setClassification(PageBlockType.OTHER); return; @@ -94,6 +101,7 @@ public class DocuMineClassificationService { && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString() .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":") @@ -102,11 +110,13 @@ public class DocuMineClassificationService { || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.reset().find()) { - textBlock.setClassification(PageBlockType.getHeadlineType(1)); + PageBlockType headlineType = PageBlockType.getHeadlineType(1); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) { - textBlock.setClassification(PageBlockType.getHeadlineType(2)); + PageBlockType headlineType = PageBlockType.getHeadlineType(2); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java new file mode 100644 index 0000000..e302321 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/HeadlineClassificationService.java @@ -0,0 +1,62 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.Getter; +import lombok.Setter; + +@Service +@Getter +@Setter +public class HeadlineClassificationService { + + TextPageBlock lastHeadline; + PageBlockType originalClassifiedBlockType; + TextPageBlock lastHeadlineFromOutline; + + public void resetContext() { + setLastHeadline(null); + setOriginalClassifiedBlockType(null); + setLastHeadlineFromOutline(null); + } + + + public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) { + + this.lastHeadlineFromOutline = lastHeadlineFromOutline; + this.setLastHeadline(lastHeadlineFromOutline); + } + + + public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) { + + TextPageBlock lastHeadline = getLastHeadline(); + TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline(); + PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType(); + PageBlockType finalHeadlineType = initialHeadlineType; + + if (lastHeadline != null) { + + if (lastHeadline.equals(lastHeadlineFromOutline)) { + + finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1); + + } else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) { + + PageBlockType lastHeadlineType = lastHeadline.getClassification(); + int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType); + finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference); + } + } + + setOriginalClassifiedBlockType(initialHeadlineType); + textBlock.setClassification(finalHeadlineType); + setLastHeadline(textBlock); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 63b0e87..ff532b5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -22,12 +22,17 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RedactManagerClassificationService { + private final HeadlineClassificationService headlineClassificationService; + + public void classifyDocument(ClassificationDocument document) { List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + headlineClassificationService.resetContext(); + for (ClassificationPage page : document.getPages()) { classifyPage(page, document, headlineFontSizes); } @@ -48,6 +53,10 @@ public class RedactManagerClassificationService { var bodyTextFrame = page.getBodyTextFrame(); + if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + headlineClassificationService.setLastHeadlineFromOutline(textBlock); + return; + } if (document.getFontSizeCounter().getMostPopular() == null) { textBlock.setClassification(PageBlockType.OTHER); return; @@ -60,58 +69,64 @@ public class RedactManagerClassificationService { textBlock.setClassification(PageBlockType.PARAGRAPH); return; } - - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() .getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() .getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, - document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() - .size() == 1)) { + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } - } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter() - .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() - .getCountPerValue() - .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences() - .get(0) - .getTextPositions() - .get(0) - .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + && PositionUtils.getApproxLineCount(textBlock) < 4.9 + && (textBlock.getMostPopularWordStyle().equals("bold") + || !document.getFontStyleCounter().getCountPerValue().containsKey("bold") + && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) + && textBlock.getSequences() + .get(0).getTextPositions() + .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { - textBlock.setClassification(PageBlockType.getHeadlineType(i)); + PageBlockType headlineType = PageBlockType.getHeadlineType(i); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); } } - } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() - .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() - .get(0) - .getTextPositions() - .get(0) - .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { - textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); + } else if (!textBlock.getText().startsWith("Figure ") + && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordStyle().equals("bold") + && !document.getFontStyleCounter().getMostPopular().equals("bold") + && PositionUtils.getApproxLineCount(textBlock) < 2.9 + && textBlock.getSequences() + .get(0).getTextPositions() + .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1); + headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("bold") + && !document.getFontStyleCounter().getMostPopular().equals("bold")) { textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() - .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() - .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) + && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { textBlock.setClassification(PageBlockType.PARAGRAPH); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() - .getMostPopular() - .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("italic") + && !document.getFontStyleCounter().getMostPopular().equals("italic") + && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 36ee3eb..1420d1f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -11,6 +11,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -30,9 +31,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; @@ -57,11 +59,6 @@ public class DocumentGraphFactory { document.getPages() .forEach(context::buildAndAddPageWithCounter); - document.getSections() - .stream() - .flatMap(section -> section.getImages() - .stream()) - .forEach(image -> context.getImages().add(image)); addSections(layoutParsingType, document, context, documentGraph); addHeaderAndFooterToEachPage(document, context); @@ -75,8 +72,17 @@ public class DocumentGraphFactory { private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) { - classificationDocument.getSections() - .forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document)); + for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) { + var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection(); + Optional section = SectionNodeFactory.addSection(layoutParsingType, + parent, + tocItem.getChildren().isEmpty(), + tocItem.getNonEmptySectionBlocks(), + tocItem.getImages(), + context, + document); + tocItem.setSection(section.orElse(null)); + } } @@ -181,10 +187,7 @@ public class DocumentGraphFactory { Page page = context.getPage(textBlocks.get(0).getPage()); Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), - footer, - context, - page); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); footer.setLeafTextBlock(textBlock); @@ -236,7 +239,7 @@ public class DocumentGraphFactory { DocumentTree documentTree; Map pages; - List
sections; + List sections; List images; TextBlockFactory textBlockFactory; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index b6e09f5..0cce454 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -9,6 +9,7 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; @@ -17,6 +18,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -27,12 +30,13 @@ import lombok.experimental.UtilityClass; @UtilityClass public class SectionNodeFactory { - public void addSection(LayoutParsingType layoutParsingType, - GenericSemanticNode parentNode, - List pageBlocks, - List images, - DocumentGraphFactory.Context context, - Document document) { + public Optional addSection(LayoutParsingType layoutParsingType, + GenericSemanticNode parentNode, + boolean isLeaf, + List pageBlocks, + List images, + DocumentGraphFactory.Context context, + Document document) { // This is for the case where we have images on a page without any text/footer/header. // The pageBlocks list is empty, but we still need to add those images to the document. @@ -40,16 +44,22 @@ public class SectionNodeFactory { images.stream() .distinct() .forEach(image -> DocumentGraphFactory.addImage(document, image, context)); - return; + return Optional.empty(); } if (pageBlocks.isEmpty()) { - return; + return Optional.empty(); } Map> blocksPerPage = pageBlocks.stream() .collect(groupingBy(AbstractPageBlock::getPage)); - Section section = Section.builder().documentTree(context.getDocumentTree()).build(); + + AbstractSemanticNode section; + if (isLeaf) { + section = Section.builder().documentTree(context.getDocumentTree()).build(); + } else { + section = SuperSection.builder().documentTree(context.getDocumentTree()).build(); + } context.getSections().add(section); blocksPerPage.keySet() @@ -59,12 +69,24 @@ public class SectionNodeFactory { addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document); if (containsTablesAndTextBlocks(pageBlocks)) { + + if (pageBlocks.get(0).isHeadline()) { + pageBlocks.remove(0); + } + splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, + true, subSectionPageBlocks, emptyList(), context, document)); + } else if (!isLeaf) { + + if (pageBlocks.get(0).isHeadline()) { + pageBlocks.remove(0); + } + addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document); } else { addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document); } @@ -72,10 +94,12 @@ public class SectionNodeFactory { images.stream() .distinct() .forEach(image -> DocumentGraphFactory.addImage(section, image, context)); + + return Optional.of(section); } - private List getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) { + private List getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, AbstractSemanticNode section) { if (parentNode == null) { return context.getDocumentTree().createNewMainEntryAndReturnId(section); @@ -88,7 +112,7 @@ public class SectionNodeFactory { private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List pageBlocks, DocumentGraphFactory.Context context, - Section section, + AbstractSemanticNode section, Document document) { if (pageBlocks.get(0).isHeadline()) { @@ -101,7 +125,7 @@ public class SectionNodeFactory { private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType, List pageBlocks, DocumentGraphFactory.Context context, - Section section, + AbstractSemanticNode section, Document document) { Set alreadyMerged = new HashSet<>(); @@ -226,7 +250,7 @@ public class SectionNodeFactory { } - private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) { + private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) { Page page = context.getPage(pageNumber); page.getMainBody().add(section); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index 4b7303f..b78e53b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -154,10 +154,11 @@ public class TableNodeFactory { } else if (firstTextBlockIsHeadline(cell)) { SectionNodeFactory.addSection(layoutParsingType, tableCell, + true, cell.getTextBlocks() .stream() .map(tb -> (AbstractPageBlock) tb) - .toList(), + .collect(Collectors.toList()), emptyList(), context, document); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java index a53c6d8..2b7e087 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java @@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; @@ -62,6 +63,7 @@ public class DocumentGraphMapper { SemanticNode node = switch (entryData.getType()) { case SECTION -> buildSection(context); + case SUPER_SECTION -> buildSuperSection(context); case PARAGRAPH -> buildParagraph(context, entryData.getProperties()); case HEADLINE -> buildHeadline(context); case HEADER -> buildHeader(context); @@ -109,7 +111,7 @@ public class DocumentGraphMapper { private TableCell buildTableCell(Context context, Map properties) { - TableCell.TableCellBuilder builder = TableCell.builder(); + TableCell.TableCellBuilder builder = TableCell.builder(); PropertiesMapper.parseTableCellProperties(properties, builder); return builder.documentTree(context.documentTree).build(); } @@ -140,6 +142,11 @@ public class DocumentGraphMapper { return Section.builder().documentTree(context.documentTree).build(); } + private SuperSection buildSuperSection(Context context) { + + return SuperSection.builder().documentTree(context.documentTree).build(); + } + private Paragraph buildParagraph(Context context, Map properties) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 4cdc5bc..3a25058 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -105,7 +105,7 @@ public class LayoutGridService { Color color = switch (semanticNode.getType()) { case PARAGRAPH -> PARAGRAPH_COLOR; case TABLE -> TABLE_COLOR; - case SECTION -> SECTION_COLOR; + case SECTION, SUPER_SECTION -> SECTION_COLOR; case HEADLINE -> HEADLINE_COLOR; case HEADER, FOOTER -> HEADER_COLOR; case IMAGE -> IMAGE_COLOR; @@ -119,7 +119,7 @@ public class LayoutGridService { if (isNotSectionOrTableCellOrDocument(semanticNode)) { addAsRectangle(semanticNode, layoutGrid, color); } - if (semanticNode.getType().equals(NodeType.SECTION)) { + if (semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) { addSection(semanticNode, layoutGrid, color); } if (semanticNode.getType().equals(NodeType.TABLE)) { @@ -193,10 +193,11 @@ public class LayoutGridService { List subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION) .toList(); Page firstPage = semanticNode.getFirstPage(); + String treeIdString = buildTreeIdString(semanticNode); if (!subSections.isEmpty()) { - addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid); + addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid); } else { - bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid))); + bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid))); } if (bBoxMap.values().size() == 1) { Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java index bf64c12..bcba9e2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java @@ -112,8 +112,8 @@ public class PdfVisualisationUtility { case DOCUMENT -> Color.LIGHT_GRAY; case HEADER, FOOTER -> Color.GREEN; case PARAGRAPH -> Color.BLUE; + case SUPER_SECTION, SECTION -> Color.BLACK; case HEADLINE -> Color.RED; - case SECTION -> Color.BLACK; case TABLE -> Color.ORANGE; case TABLE_CELL -> Color.GRAY; case IMAGE -> Color.MAGENTA; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 18b10df..a3d6ec9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -32,6 +32,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { public void testViewerDocument() { String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 1981530..52ef0b9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; -import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; @@ -62,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { tableServiceResponse, new VisualLayoutParsingResponse(), Map.of("file", "document")); + } @@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Test @SneakyThrows public void testTableAndCellRotations() { + String fileName = "files/Minimal Examples/simpleTablesRotated.pdf"; ClassPathResource pdfFileResource = new ClassPathResource(fileName); @@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest { } - @Disabled @Test public void testScanRotationBorderIsIgnored() throws IOException { @@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - var tables = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + var tables = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList(); // Quality of the table parsing is not good, because the file is rotated at scanning. @@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock table = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(table.getColCount()).isEqualTo(6); @@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { "files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() + TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(1); assertThat(secondTable.getColCount()).isEqualTo(8); @@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { "files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(firstTable.getColCount()).isEqualTo(9); assertThat(firstTable.getRowCount()).isEqualTo(5); - TablePageBlock secondTable = document.getSections() + TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(1); assertThat(secondTable.getColCount()).isEqualTo(9); @@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { "files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) + .toList()).isNotEmpty(); + TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() + TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(1); assertThat(secondTable.getColCount()).isEqualTo(8); @@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { - var tables = document.getSections() + var tables = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList(); StringBuilder sb = new StringBuilder(); @@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { - TablePageBlock table = document.getSections() + TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(tableIndex); + List> rows = table.getRows(); int emptyCellsFoundFound = rows.stream() .flatMap(List::stream) @@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, List> values) { - TablePageBlock table = document.getSections() + TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) + .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) .toList() .get(tableIndex); List> rows = table.getRows(); @@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTableSize(ClassificationDocument document, int tableSize) { - assertThat(document.getSections() + assertThat(document.getTableOfContents().getAllTableOfContentItems() .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) + .flatMap(tocItem -> tocItem.getSectionBlocks() + .stream() + .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .toList().size()).isEqualTo(tableSize); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 631f643..cb91962 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -93,6 +93,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { .toList(); for (String pdfFileName : pdfFileNames) { + writeJsons(Path.of(pdfFileName)); } } @@ -102,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), new VisualLayoutParsingResponse(), Map.of("file",filename.toFile().toString()))); - Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, - layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index 4e3280f..cdd247d 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -229,7 +229,7 @@ public class PdfDraw { case HEADER, FOOTER -> Color.GREEN; case PARAGRAPH -> Color.BLUE; case HEADLINE -> Color.RED; - case SECTION -> Color.BLACK; + case SECTION, SUPER_SECTION -> Color.BLACK; case TABLE -> Color.ORANGE; case TABLE_CELL -> Color.GRAY; case IMAGE -> Color.MAGENTA; diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf index da05904..88846ba 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/simpleTablesRotated.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/simpleTablesRotated.pdf index f6571ef..6084303 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/simpleTablesRotated.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/simpleTablesRotated.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf index e6d9a07..f0e1f7e 100644 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/AbsolutelyEnormousTable.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf deleted file mode 100644 index 4e18c90..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf and /dev/null differ