diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index b14faca..4bbe999 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -28,9 +28,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -43,7 +44,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; @@ -97,6 +97,7 @@ public class LayoutParsingPipeline { VisualLayoutParsingAdapter visualLayoutParsingAdapter; ClarifyndClassificationService clarifyndClassificationService; OutlineExtractorService outlineExtractorService; + OutlineValidationService outlineValidationService; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { @@ -339,14 +340,15 @@ public class LayoutParsingPipeline { case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); } - // compute ToC - List headlines = classificationDocument.getPages() + List headlines = classificationDocument.getPages() .stream() .flatMap(classificationPage -> classificationPage.getTextBlocks() .stream() - .filter(tb -> tb.getClassification().isHeadline())) + .filter(tb -> tb instanceof TextPageBlock && tb.getClassification().isHeadline()) + .map(tb -> (TextPageBlock) tb)) .toList(); - // ??? + TableOfContents tableOfContents = outlineValidationService.createToC(headlines); + classificationDocument.setTableOfContents(tableOfContents); log.info("Building Sections for {}", identifier); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index 7369047..e6ef1ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; @@ -28,5 +29,6 @@ public class ClassificationDocument { private long rulesVersion; private OutlineObjectTree outlineObjectTree; + private TableOfContents tableOfContents; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java index ea06aa6..d52ba5e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineValidationService.java @@ -2,8 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.awt.geom.Point2D; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; +import java.util.TreeSet; import org.springframework.stereotype.Service; @@ -16,8 +19,8 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class OutlineValidationService { - public TableOfContents validateWithToC(List allHeadlines, List headlinesFromOutlines, List newlyClassifiedHeadlines) { + TableOfContents validatedToC = createToC(headlinesFromOutlines); TableOfContents currentToC = createToC(allHeadlines); @@ -32,7 +35,9 @@ public class OutlineValidationService { return validatedToC; } + private boolean containsBlock(TableOfContents toc, TextPageBlock block) { + for (TableOfContentItem existingItem : toc.getMainSections()) { if (existingItem.getTextPageBlock().equals(block) || existingItem.contains(block)) { return true; @@ -41,7 +46,9 @@ public class OutlineValidationService { return false; } + private boolean containsItem(TableOfContents toc, TableOfContentItem tocItem) { + for (TableOfContentItem existingItem : toc.getMainSections()) { if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) { return true; @@ -50,6 +57,7 @@ public class OutlineValidationService { return false; } + private void addItemAtCorrectPosition(TableOfContents toc, TableOfContentItem tocItem, TableOfContentItem lastHeadlineFromOutlines) { //if (lastHeadlineFromOutlines == null || tocItem.g) @@ -58,20 +66,21 @@ public class OutlineValidationService { //} } - public TableOfContents createToC(List headlines) { + + public TableOfContents createToCOld(List headlines) { List mainSections = new ArrayList<>(); int parentDepth = 7; // more than 6 (h6) TableOfContentItem parent = null; for (TextPageBlock current : headlines) { int currentDepth = getDepth(current.getClassification()); - if(parentDepth >= currentDepth) { + if (parentDepth >= currentDepth) { parentDepth = currentDepth; parent = new TableOfContentItem(current); mainSections.add(parent); } else { - assert (parent!=null); - while(parentDepth < currentDepth && parent.getParent() != null) { + assert (parent != null); + while (parentDepth < currentDepth && parent.getParent() != null) { parent = parent.getParent(); parentDepth = getDepth(parent.getTextPageBlock().getClassification()); } @@ -82,6 +91,46 @@ public class OutlineValidationService { } + + public TableOfContents createToC(List headlines) { + + List mainSections = new ArrayList<>(); + Map lastItemsPerDepth = new HashMap<>(); + TableOfContentItem last = null; + TreeSet depths = new TreeSet<>(); + + for (TextPageBlock current : headlines) { + int currentDepth = getDepth(current.getClassification()); + Integer parentDepth = depths.floor(currentDepth - 1); + + var tocItem = new TableOfContentItem(current); + + if (parentDepth == null) { + mainSections.add(tocItem); + + } else { + assert last != null; + int lastDepth = getDepth(last.getTextPageBlock().getClassification()); + + if (lastDepth < parentDepth) { + parentDepth = lastDepth; + } else if (lastDepth == currentDepth && last.getParent() != null) { + parentDepth = getDepth(last.getParent().getTextPageBlock().getClassification()); + } + + TableOfContentItem parent = lastItemsPerDepth.get(parentDepth); + parent.addChild(tocItem); + } + + last = tocItem; + lastItemsPerDepth.put(currentDepth, tocItem); + depths.add(currentDepth); + } + + return new TableOfContents(mainSections); + } + + public void updateOutlineObjectTree(OutlineObjectTree outlineObjectTree, List allHeadlines, List newlyClassifiedHeadlines) { List newOutlineObjects = newlyClassifiedHeadlines.stream() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index b967c19..4755125 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -131,7 +131,7 @@ public class BlockificationPostprocessingService { } else if (minDistance == distanceToSplitCandidate) { List others = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle()); splitCandidate.setClassification(headlineType); - others.forEach(other -> other.setClassification(headlineType)); + others.forEach(other -> other.setClassification(null)); } else { var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); merged.setClassification(headlineType); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 453c772..0ffc72b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -5,10 +5,8 @@ import static java.util.stream.Collectors.toSet; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashSet; import java.util.List; import java.util.ListIterator; -import java.util.Set; import org.springframework.stereotype.Service; @@ -100,7 +98,7 @@ public class DocstrumBlockificationService { while (itty.hasNext()) { AbstractPageBlock block = itty.next(); - if (block instanceof TablePageBlock) { + if (block instanceof TablePageBlock || (block.getClassification() != null && block.getClassification().isHeadline())) { previous = new TextPageBlock(); continue; } @@ -230,21 +228,31 @@ public class DocstrumBlockificationService { continue; } + if(block.getClassification() != null && block.getClassification().isHeadline()) { + continue; + } + TextPageBlock current = (TextPageBlock) block; for (int i = 0; i < blocks.size(); i++) { - if(blocks.get(i) == null){ + AbstractPageBlock abstractPageBlock = blocks.get(i); + if(abstractPageBlock == null){ continue; } - if (blocks.get(i) == current) { + if (abstractPageBlock == current) { continue; } - if (blocks.get(i) instanceof TablePageBlock) { + if (abstractPageBlock instanceof TablePageBlock) { continue; } - TextPageBlock inner = (TextPageBlock) blocks.get(i); + if(abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) { + continue; + } + + TextPageBlock inner = (TextPageBlock) abstractPageBlock; + if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 2d2191d..8cfbf7d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -1,13 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classification; -import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService; -import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; - import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -15,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.Data; @@ -26,8 +22,6 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RedactManagerClassificationService { - private final OutlineValidationService outlineValidationService; - public void classifyDocument(ClassificationDocument document) { @@ -35,36 +29,10 @@ public class RedactManagerClassificationService { log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); - List headlinesFromOutlines = document.getPages() - .stream() - .flatMap(classificationPage -> classificationPage.getTextBlocks() - .stream() - .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.isHeadline()) - .map(tb -> (TextPageBlock) tb)) - .toList(); - - HeadLineClassificationContext headLineClassificationContext = new HeadLineClassificationContext(); for (ClassificationPage page : document.getPages()) { classifyPage(page, document, headlineFontSizes, headLineClassificationContext); } - - List allHeadlines = document.getPages() - .stream() - .flatMap(classificationPage -> classificationPage.getTextBlocks() - .stream() - .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline()) - .map(tb -> (TextPageBlock) tb)) - .toList(); - - List newlyClassifiedHeadlines = new ArrayList<>(allHeadlines); - newlyClassifiedHeadlines.removeAll(headlinesFromOutlines); - - TableOfContents toC = outlineValidationService.createToC(allHeadlines); - System.out.println(toC); - - outlineValidationService.validateWithToC(allHeadlines, headlinesFromOutlines, newlyClassifiedHeadlines); - }