From 17756f59772d90cb05773852f82632d15f3aa3fc Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Wed, 17 Apr 2024 14:31:48 +0200 Subject: [PATCH] RED-7074: Design Subsection section tree structure algorithm * first draft: further implementations --- .../processor/LayoutParsingPipeline.java | 17 +- .../model/ClassificationDocument.java | 3 + .../processor/model/ClassificationPage.java | 6 +- .../outline}/OutlineExtractorService.java | 4 +- .../model/outline/OutlineObjectTree.java | 7 - .../outline/OutlineValidationService.java | 210 ++++++++++++++++++ .../model/outline/TableOfContentItem.java | 93 ++++++++ .../model/outline/TableOfContents.java | 59 +++++ .../BlockificationPostprocessingService.java | 10 +- .../RedactManagerClassificationService.java | 106 ++++++--- .../server/graph/OutlineProcessingTest.java | 2 +- 11 files changed, 463 insertions(+), 54 deletions(-) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{services => model/outline}/OutlineExtractorService.java (96%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index a2f885e..b1696a8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -31,7 +31,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTreeNode; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -44,7 +44,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; -import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; @@ -231,14 +231,15 @@ public class LayoutParsingPipeline { PDDocument originDocument = openDocument(originFile); addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath())); - OutlineObjectTree outlineObjectTree = outlineExtractorService.getOutlineObjectTree(originDocument); - Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); ClassificationDocument classificationDocument = new ClassificationDocument(); List classificationPages = new ArrayList<>(); + // parsing the structure elements could be useful as well + classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); + long pageCount = originDocument.getNumberOfPages(); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { @@ -296,10 +297,11 @@ public class LayoutParsingPipeline { case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); }; - List outlineObjects = outlineObjectTree.getOutlineObjectsPerPage() + List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage() .get(pageNumber - 1); - if(outlineObjects != null) { - blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage, outlineObjects); + if (outlineObjects != null) { + classificationPage.setOutlineObjects(outlineObjects); + blockificationPostprocessingService.sanitizeOutlineBlocksWithKdTree(classificationPage); } classificationPage.setCleanRulings(cleanRulings); @@ -361,7 +363,6 @@ public class LayoutParsingPipeline { .toList(); // ??? - log.info("Building Sections for {}", identifier); switch (layoutParsingType) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index b3565ae..7369047 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; @@ -26,4 +27,6 @@ public class ClassificationDocument { private long rulesVersion; + private OutlineObjectTree outlineObjectTree; + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index a654636..ef97651 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -8,13 +8,13 @@ import java.util.Map; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import lombok.Data; import lombok.NonNull; import lombok.RequiredArgsConstructor; -import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; @Data @RequiredArgsConstructor @@ -23,6 +23,10 @@ public class ClassificationPage { @NonNull private List textBlocks; + private List outlineObjects = new ArrayList<>(); + + private List headlines = new ArrayList<>(); + private List images = new ArrayList<>(); private Rectangle bodyTextFrame; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/OutlineExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/OutlineExtractorService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java index 314e384..9be8c33 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/OutlineExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; +package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.awt.geom.Point2D; import java.io.IOException; @@ -74,6 +74,8 @@ public class OutlineExtractorService { } + // if the structure elements are processed beforehand, another case can be handled here as well: + // outline objects can reference structure elements (see pdf documentation) @SneakyThrows private OutlineObjectTreeNode createOutlineObject(PDOutlineItem item, PDDocument document, int depth) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java index 5723cdc..61b0dd8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java @@ -39,11 +39,4 @@ public class OutlineObjectTree { } } - - @Override - public String toString() { - - return super.toString(); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java new file mode 100644 index 0000000..b1f0ca4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -0,0 +1,210 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.extern.slf4j.Slf4j; + +@Service +@Slf4j +public class OutlineValidationService { + + + public TableOfContents validateWithToC(List allHeadlines, List headlinesFromOutlines, List newlyClassifiedHeadlines) { + TableOfContents validatedToC = createToC(headlinesFromOutlines); + TableOfContents currentToC = createToC(allHeadlines); + + TableOfContentItem lastHeadlineFromOutlines = null; + for (TableOfContentItem tocItem : currentToC.getAllTableOfContentItems()) { + if (!containsItem(validatedToC, tocItem)) { + addItemAtCorrectPosition(validatedToC, tocItem, lastHeadlineFromOutlines); + } else { + lastHeadlineFromOutlines = tocItem; + } + } + return validatedToC; + } + + private boolean containsBlock(TableOfContents toc, TextPageBlock block) { + for (TableOfContentItem existingItem : toc.getMainSections()) { + if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) { + return true; + } + } + return false; + } + + private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) { + for (TableOfContentItem existingItem : toc.getMainSections()) { + if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) { + return true; + } + } + return false; + } + + private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) { + + if(!tocItem.getChildren().isEmpty()) { + + } + } + + public TableOfContents createToC(List headlines) { + + List mainSections = new ArrayList<>(); + int parentDepth = 7; // more than 6 (h6) + TableOfContentItem parent = null; + for (TextPageBlock current : headlines) { + int currentDepth = getDepth(current.getClassification()); + if(parentDepth >= currentDepth) { + parentDepth = currentDepth; + parent = new TableOfContentItem(current); + mainSections.add(parent); + } else { + assert (parent!=null); + while(parentDepth < currentDepth && parent.getParent() != null) { + parent = parent.getParent(); + parentDepth = getDepth(parent.getTextPageBlock().getClassification()); + } + parent.addChild(new TableOfContentItem(current)); + } + } + return new TableOfContents(mainSections); + + } + + public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List allHeadlines, List newlyClassifiedHeadlines) { + + List newOutlineObjects = newlyClassifiedHeadlines.stream() + .map(textPageBlock -> new OutlineObject(textPageBlock.getText(), + textPageBlock.getPage(), + new Point2D.Double(textPageBlock.getMinX(), textPageBlock.getMinY()), + getDepth(textPageBlock.getClassification()))) + .toList(); + + } + + + private static int getDepth(PageBlockType pageBlockType) { + + return switch (pageBlockType) { + case H1 -> 1; + case H2 -> 2; + case H3 -> 3; + case H4 -> 4; + case H5 -> 5; + default -> 6; + }; + } + + + public void validate(List allHeadlines, List newlyClassifiedHeadlines) { + + if (allHeadlines.size() - newlyClassifiedHeadlines.size() > newlyClassifiedHeadlines.size()) { + + List headlines = allHeadlines.stream() + .map(textPageBlock -> new Headline(textPageBlock, newlyClassifiedHeadlines.contains(textPageBlock))) + .toList(); + for (TextPageBlock newHeadline : newlyClassifiedHeadlines) { + int newHeadlineIndex = headlines.indexOf(newHeadline); + List adjacentNewlyClassified = findAdjacentNewlyClassified(newHeadline, newlyClassifiedHeadlines); + // Find neighboring headlines from outlines + //TextPageBlock previousOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), -1); + //TextPageBlock nextOutline = findNeighboringOutline(allHeadlines.indexOf(newHeadline), headlinesFromOutlines, 1); + + // If we have neighboring outlines, perform comparison + //if (previousOutline != null && nextOutline != null) { + // // Compare headline orders + // int orderComparison = compareHeadlineOrder(previousOutline, nextOutline); + // if (orderComparison != 0) { + // // Set classification based on comparison + // setClassification(newHeadline, orderComparison, previousOutline, nextOutline); + // } + //} + } + } + + } + + + private List findAdjacentNewlyClassified(TextPageBlock headline, List newlyClassifiedHeadlines) { + // Find adjacent newly classified headlines + List adjacentNewlyClassified = new ArrayList<>(); + int index = newlyClassifiedHeadlines.indexOf(headline); + if (index != -1) { + adjacentNewlyClassified.add(headline); + for (int i = index - 1; i >= 0; i--) { + if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(0))) { + adjacentNewlyClassified.add(0, newlyClassifiedHeadlines.get(i)); + } else { + break; + } + } + for (int i = index + 1; i < newlyClassifiedHeadlines.size(); i++) { + if (newlyClassifiedHeadlines.get(i).equals(adjacentNewlyClassified.get(adjacentNewlyClassified.size() - 1))) { + adjacentNewlyClassified.add(newlyClassifiedHeadlines.get(i)); + } else { + break; + } + } + } + return adjacentNewlyClassified; + } + + + private TextPageBlock findNeighboringOutline(TextPageBlock headline, List headlinesFromOutlines, int direction) { + // Find neighboring headline from outlines in the specified direction + int index = headlinesFromOutlines.indexOf(headline); + if (index != -1 && index + direction >= 0 && index + direction < headlinesFromOutlines.size()) { + return headlinesFromOutlines.get(index + direction); + } + return null; + } + + + private int compareHeadlineOrder(TextPageBlock headline1, TextPageBlock headline2) { + // Compare headline orders + // Implement your comparison logic here + return 0; // Placeholder return, implement actual comparison logic + } + + + private void setClassification(TextPageBlock headline, int orderComparison, TextPageBlock previousOutline, TextPageBlock nextOutline) { + // Set classification based on comparison with neighboring outlines + // Implement your classification logic here + } + + + record Headline(TextPageBlock textPageBlock, boolean newlyClassified) { + + @Override + public boolean equals(Object obj) { + + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + Headline headline = (Headline) obj; + return Objects.equals(textPageBlock, headline.textPageBlock); + } + + + @Override + public int hashCode() { + + return Objects.hash(textPageBlock); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java new file mode 100644 index 0000000..2d57844 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContentItem.java @@ -0,0 +1,93 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.Data; +import lombok.EqualsAndHashCode; + +@Data +@EqualsAndHashCode(onlyExplicitlyIncluded = true) +public class TableOfContentItem { + + @EqualsAndHashCode.Include + private TextPageBlock textPageBlock; + private List children = new ArrayList<>(); + private TableOfContentItem parent; + + + public TableOfContentItem(TextPageBlock textPageBlock) { + + this.textPageBlock = textPageBlock; + } + + + public void addChild(TableOfContentItem tableOfContentItem) { + + children.add(tableOfContentItem); + tableOfContentItem.setParent(this); + } + + + public TableOfContentItem getSiblingBefore() { + + try { + return parent.getChildren() + .get(parent.getChildren().indexOf(this) - 1); + } catch (IndexOutOfBoundsException indexOutOfBoundsException) { + return null; + } + } + public TableOfContentItem getSiblingAfter() { + + try { + return parent.getChildren() + .get(parent.getChildren().indexOf(this) + 1); + } catch (IndexOutOfBoundsException indexOutOfBoundsException) { + return null; + } + } + + + public boolean contains(TextPageBlock block) { + + boolean anyChildContains = false; + if (!children.isEmpty()) { + for (TableOfContentItem child : children) { + if (child.getTextPageBlock().equals(block)) { + return true; + } else { + anyChildContains = anyChildContains || child.contains(block); + } + } + } + return anyChildContains; + } + + + public boolean contains(TableOfContentItem tocItem) { + + boolean anyChildContains = false; + if (!children.isEmpty()) { + for (TableOfContentItem child : children) { + if (child.equals(tocItem)) { + return true; + } else { + anyChildContains = anyChildContains || child.contains(tocItem); + } + } + } + return anyChildContains; + } + + + @Override + public String toString() { + + return "OutlineObjectTreeNode{" + "textPageBlock=" + textPageBlock + '}'; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java new file mode 100644 index 0000000..bcffa89 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/TableOfContents.java @@ -0,0 +1,59 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.outline; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.Data; +import lombok.RequiredArgsConstructor; + +@Data +@RequiredArgsConstructor +public class TableOfContents { + + private List mainSections = new ArrayList<>(); + + + public TableOfContents(List mainSections) { + + this.mainSections = mainSections; + } + + + public List getAllTextPageBlocks() { + + List allTextPageBlocks = new ArrayList<>(); + for (TableOfContentItem item : mainSections) { + collectTextPageBlocks(item, allTextPageBlocks); + } + return allTextPageBlocks; + } + + + private void collectTextPageBlocks(TableOfContentItem item, List textPageBlocks) { + + textPageBlocks.add(item.getTextPageBlock()); + for (TableOfContentItem child : item.getChildren()) { + collectTextPageBlocks(child, textPageBlocks); + } + } + + public List getAllTableOfContentItems() { + List allItems = new ArrayList<>(); + for (TableOfContentItem item : mainSections) { + collectTableOfContentItems(item, allItems); + } + return allItems; + } + + private void collectTableOfContentItems(TableOfContentItem item, List allItems) { + allItems.add(item); + for (TableOfContentItem child : item.getChildren()) { + collectTableOfContentItems(child, allItems); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 6ff690c..5ea023e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -36,8 +36,9 @@ public class BlockificationPostprocessingService { .collect(RectangleTransformations.collectBBox()); - public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List outlineObjects) { + public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage) { + List outlineObjects = classificationPage.getOutlineObjects(); if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) { return; } @@ -244,6 +245,13 @@ public class BlockificationPostprocessingService { } + // currently only three cases are handled here: + // 1. equality + // 2. outline title contains block text + // 3. block text contains outline title + // another possible case is an intersection, meaning a title is split up between two different blocks + // this should not happen with how docstrum creates the blocks + // if it is indeed necessary, a splitting has to be done with a follow-up merge private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) { OutlineObject outlineObject = context.getOutlineObject(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index f85cd3c..872dd85 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -1,8 +1,13 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classification; +import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; + import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -22,15 +27,43 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RedactManagerClassificationService { + private final OutlineValidationService outlineValidationService; + + public void classifyDocument(ClassificationDocument document) { List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + List headlinesFromOutlines = document.getPages() + .stream() + .flatMap(classificationPage -> classificationPage.getTextBlocks() + .stream() + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline()) + .map(tb -> (TextPageBlock) tb)) + .toList(); + for (ClassificationPage page : document.getPages()) { classifyPage(page, document, headlineFontSizes); } + + List allHeadlines = document.getPages() + .stream() + .flatMap(classificationPage -> classificationPage.getTextBlocks() + .stream() + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) + .map(tb -> (TextPageBlock) tb)) + .toList(); + + List newlyClassifiedHeadlines = new ArrayList<>(allHeadlines); + newlyClassifiedHeadlines.removeAll(headlinesFromOutlines); + + TableOfContents toC = outlineValidationService.createToC(allHeadlines); + System.out.println(toC); + + outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines); + } @@ -48,7 +81,7 @@ public class RedactManagerClassificationService { var bodyTextFrame = page.getBodyTextFrame(); - if(textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { + if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { return; } if (document.getFontSizeCounter().getMostPopular() == null) { @@ -62,33 +95,30 @@ public class RedactManagerClassificationService { .anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) { textBlock.setClassification(PageBlockType.PARAGRAPH); return; - } - - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() .getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() .getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, - document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() - .size() == 1)) { + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } - } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter() - .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() - .getCountPerValue() - .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences() - .get(0) - .getTextPositions() - .get(0) - .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + && PositionUtils.getApproxLineCount(textBlock) < 4.9 + && (textBlock.getMostPopularWordStyle().equals("bold") + || !document.getFontStyleCounter().getCountPerValue().containsKey("bold") + && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) + && textBlock.getSequences() + .get(0).getTextPositions() + .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { @@ -96,25 +126,31 @@ public class RedactManagerClassificationService { document.setHeadlines(true); } } - } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() - .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() - .get(0) - .getTextPositions() - .get(0) - .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + } else if (!textBlock.getText().startsWith("Figure ") + && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordStyle().equals("bold") + && !document.getFontStyleCounter().getMostPopular().equals("bold") + && PositionUtils.getApproxLineCount(textBlock) < 2.9 + && textBlock.getSequences() + .get(0).getTextPositions() + .get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); document.setHeadlines(true); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("bold") + && !document.getFontStyleCounter().getMostPopular().equals("bold")) { textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() - .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() - .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) + && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { textBlock.setClassification(PageBlockType.PARAGRAPH); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() - .getMostPopular() - .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("italic") + && !document.getFontStyleCounter().getMostPopular().equals("italic") + && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java index 83f86d8..33ceaba 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java @@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; -import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;