From 59d9d6c3e6ecc2cf0020a77421b3b9a5a339338b Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Tue, 16 Apr 2024 12:35:26 +0200 Subject: [PATCH] RED-7074: Design Subsection section tree structure algorithm * first draft: further implementations --- .../processor/LayoutParsingPipeline.java | 8 + .../BlockificationPostprocessingService.java | 253 ++++++++---------- .../server/graph/OutlineProcessingTest.java | 25 ++ .../server/graph/ViewerDocumentTest.java | 1 + 4 files changed, 149 insertions(+), 138 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 4197509..a2f885e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -353,6 +353,14 @@ public class LayoutParsingPipeline { } // compute ToC + List headlines = classificationDocument.getPages() + .stream() + .flatMap(classificationPage -> classificationPage.getTextBlocks() + .stream() + .filter(tb -> tb.getClassification().isHeadline())) + .toList(); + // ??? + log.info("Building Sections for {}", identifier); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 5d1a1d8..6ff690c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -4,7 +4,6 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc import java.awt.geom.Rectangle2D; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import java.util.function.Function; @@ -13,19 +12,20 @@ import org.tinspin.index.Index; import org.tinspin.index.kdtree.KDIterator; import org.tinspin.index.kdtree.KDTree; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import lombok.Data; + @Service public class BlockificationPostprocessingService { private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f; - private static final double BLOCK_COMPARISON_PRECISION = 1.0; private static final Function blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences() .stream() @@ -36,59 +36,13 @@ public class BlockificationPostprocessingService { .collect(RectangleTransformations.collectBBox()); - public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List outlineObjects) { - - List textBlocks = classificationPage.getTextBlocks() - .stream() - .filter(block -> block instanceof TextPageBlock) - .toList() - .stream() - .map(block -> (TextPageBlock) block) - .toList(); - - textBlocks.sort(Comparator.comparing(TextPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION)) - .thenComparing(TextPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION))); - - for (OutlineObject outlineObject : outlineObjects) { - - boolean matchedExactly = false; - List splitCandidates = new ArrayList<>(); - List mergeCandidates = new ArrayList<>(); - - for (TextPageBlock textPageBlock : textBlocks) { - matchedExactly = processOutlineObjectForTextBlock(textPageBlock, outlineObject, splitCandidates, mergeCandidates); - - if (matchedExactly) { - break; - } - } - - if (!matchedExactly) { - //selectMatch(outlineObject, kdTree, splitCandidates, mergeCandidates); - } - } - - } - - public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List outlineObjects) { - List textBlocks = classificationPage.getTextBlocks() - .stream() - .filter(block -> block instanceof TextPageBlock) - .toList() - .stream() - .map(block -> (TextPageBlock) block) - .toList(); - if (textBlocks.isEmpty() || outlineObjects.isEmpty()) { + if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) { return; } - KDTree kdTree = KDTree.create(2); - textBlocks.forEach(block -> { - var boundingBox = blockToBoundingBox.apply(block); - kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block); - }); + KDTree kdTree = createKdTree(classificationPage); for (OutlineObject outlineObject : outlineObjects) { @@ -99,36 +53,53 @@ public class BlockificationPostprocessingService { new double[]{Double.MAX_VALUE, Double.MAX_VALUE}); boolean matchedExactly = false; - List splitCandidates = new ArrayList<>(); - List mergeCandidates = new ArrayList<>(); + OutlineProcessionContext context = new OutlineProcessionContext(outlineObject); while (successorIterator.hasNext() && !matchedExactly) { TextPageBlock pageBlock = successorIterator.next().value(); - matchedExactly = processOutlineObjectForTextBlock(pageBlock, outlineObject, splitCandidates, mergeCandidates); + matchedExactly = processOutlineForTextBlock(pageBlock, context); } if (!matchedExactly) { - selectMatch(classificationPage, outlineObject, kdTree, splitCandidates, mergeCandidates); + selectMatch(classificationPage, kdTree, context); } } } - private void selectMatch(ClassificationPage classificationPage, - OutlineObject outlineObject, - KDTree kdTree, - List splitCandidates, - List mergeCandidates) { + private static KDTree createKdTree(ClassificationPage classificationPage) { - for (TextPageBlock splitCandidate : splitCandidates) { - System.out.println(splitCandidate); - } + List textBlocks = classificationPage.getTextBlocks() + .stream() + .filter(block -> block instanceof TextPageBlock) + .toList() + .stream() + .map(block -> (TextPageBlock) block) + .toList(); + + KDTree kdTree = KDTree.create(2); + textBlocks.forEach(block -> { + var boundingBox = blockToBoundingBox.apply(block); + kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block); + }); + return kdTree; + } + + + private void selectMatch(ClassificationPage classificationPage, KDTree kdTree, OutlineProcessionContext context) { + + OutlineObject outlineObject = context.outlineObject; + List mergeCandidates = context.mergeCandidates; + TextPageBlock splitCandidate = context.splitCandidate; + PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth()); if (!mergeCandidates.isEmpty()) { List allMergeCandidates = new ArrayList<>(mergeCandidates); addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates); - addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates); + if (mergeCandidates.size() > 1) { + addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates); + } allMergeCandidates = allMergeCandidates.stream() .distinct() .toList(); @@ -146,36 +117,84 @@ public class BlockificationPostprocessingService { bestCombination = combination; } } - mergeBlocks(classificationPage, bestCombination); + var merged = mergeBlocks(classificationPage, bestCombination); + merged.setClassification(headlineType); + } + + if (splitCandidate != null) { + TextPageBlock other = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle()); + splitCandidate.setClassification(headlineType); + other.setClassification(headlineType); } } - private void mergeBlocks(ClassificationPage classificationPage, List blocksToMerge) { + private TextPageBlock splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) { - if (blocksToMerge.size() <= 1) { - return; - } + List wordSequence = findWordSequence(blockToSplit.getSequences(), text); + List remaining = blockToSplit.getSequences(); + remaining.removeAll(wordSequence); - TextPageBlock firstBlock = blocksToMerge.get(0); + blockToSplit.setSequences(wordSequence); - List mergedBlocks = new ArrayList<>(); - for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) { + TextPageBlock other = buildTextBlock(remaining, 0); + classificationPage.getTextBlocks().add(other); + return other; + } - if (firstBlock != null && !firstBlock.getSequences().isEmpty()) { - if (textPageBlock.getDir() == firstBlock.getDir()) { - firstBlock.getSequences().addAll(textPageBlock.getSequences()); - mergedBlocks.add(textPageBlock); + private static List findWordSequence(List textPositionSequences, String text) { + + String target = text.replaceAll("\\s", ""); + List inSequence = new ArrayList<>(); + StringBuilder currentSequence = new StringBuilder(); + + for (TextPositionSequence sequence : textPositionSequences) { + + if (currentSequence.toString().equals(target)) { + return inSequence; + } + currentSequence.append(sequence.toString()); + inSequence.add(sequence); + + if (currentSequence.length() > target.length()) { + TextPositionSequence removed = inSequence.remove(0); + currentSequence.delete(0, removed.toString().length()); + + while (currentSequence.length() > target.length()) { + removed = inSequence.remove(0); + currentSequence.delete(0, removed.toString().length()); } } } + return new ArrayList<>(); + } - assert firstBlock != null; - buildTextBlock(firstBlock.getSequences(), 0); - firstBlock.setToDuplicate(false); - classificationPage.getTextBlocks().removeAll(mergedBlocks); + private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List blocksToMerge) { + + TextPageBlock firstBlock = blocksToMerge.get(0); + + if (blocksToMerge.size() > 1) { + + List mergedBlocks = new ArrayList<>(); + for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) { + + if (firstBlock != null && !firstBlock.getSequences().isEmpty()) { + + if (textPageBlock.getDir() == firstBlock.getDir()) { + firstBlock.getSequences().addAll(textPageBlock.getSequences()); + mergedBlocks.add(textPageBlock); + } + } + } + + assert firstBlock != null; + firstBlock.setToDuplicate(false); + classificationPage.getTextBlocks().removeAll(mergedBlocks); + } + + return firstBlock; } @@ -225,11 +244,9 @@ public class BlockificationPostprocessingService { } - private boolean processOutlineObjectForTextBlock(TextPageBlock pageBlock, - OutlineObject outlineObject, - List splitCandidates, - List mergeCandidates) { + private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) { + OutlineObject outlineObject = context.getOutlineObject(); String blockText = pageBlock.getText(); String outlineTitle = outlineObject.getTitle(); @@ -245,73 +262,33 @@ public class BlockificationPostprocessingService { return true; } - if (blockTextContainsOutlineTitle) { - splitCandidates.add(pageBlock); + if (outlineTitleContainsBlockText) { + context.mergeCandidates.add(pageBlock); } - if (outlineTitleContainsBlockText) { - mergeCandidates.add(pageBlock); + if (blockTextContainsOutlineTitle && context.splitCandidate != null) { + context.splitCandidate = pageBlock; } + return false; } - private void processOutlineObjectForTextBlockOld(ClassificationPage classificationPage, TextPageBlock pageBlock, OutlineObject outlineObject) { + @Data + private static class OutlineProcessionContext { - String blockText = pageBlock.getText(); - String outlineTitle = outlineObject.getTitle(); + private OutlineObject outlineObject; + private List mergeCandidates; + private TextPageBlock splitCandidate; - boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle); - boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText); - Rectangle2D boundingBox = pageBlock.getSequences() - .stream() - .map(textPositionSequence -> textPositionSequence.getTextPositions() - .stream() - .map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence)) - .collect(RectangleTransformations.collectBBox())) - .collect(RectangleTransformations.collectBBox()); + public OutlineProcessionContext(OutlineObject outlineObject) { - if (!isCloseToOutline(boundingBox, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) { - return; + this.outlineObject = outlineObject; + this.mergeCandidates = new ArrayList<>(); + this.splitCandidate = null; } - if (blockText.equals(outlineTitle)) { - - pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth())); - return; - } - - if (blockTextContainsOutlineTitle) { - splitTextBlock(pageBlock, outlineTitle, classificationPage); - } - - if (outlineTitleContainsBlockText) { - // find other blocks, merge them into current, mark them for deletion after loop - } - } - - - private boolean isCloseToOutline(Rectangle2D boundingBox, OutlineObject outlineObject) { - - float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD; - //if (textBlock instanceof TextPageBlock) { - // List sequences = ((TextPageBlock) textBlock).getSequences(); - // if (sequences != null) { - // float textHeightSum = 0; - // for (TextPositionSequence word : sequences) { - // textHeightSum += word.getTextHeight(); - // } - // threshold = textHeightSum / sequences.size(); - // } - //} - - return boundingBox.getMinY() - outlineObject.getPoint().getY() < threshold && boundingBox.getMinX() - outlineObject.getPoint().getX() < threshold; - } - - - private void splitTextBlock(TextPageBlock pageBlock, String title, ClassificationPage classificationPage) { - } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java new file mode 100644 index 0000000..83f86d8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/OutlineProcessingTest.java @@ -0,0 +1,25 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService; +import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; + +import lombok.SneakyThrows; + +public class OutlineProcessingTest extends BuildDocumentTest { + + @Autowired + OutlineExtractorService outlineExtractorService; + @Autowired + BlockificationPostprocessingService blockificationPostprocessingService; + + @Test + @SneakyThrows + public void test() { + + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index bd5b7f2..8ccfe81 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -73,5 +73,6 @@ public class ViewerDocumentTest extends BuildDocumentTest { layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); } + }