RED-7074: Design Subsection section tree structure algorithm

* first draft: further implementations
This commit is contained in:
maverickstuder 2024-04-16 12:35:26 +02:00
parent ca35feeb63
commit 894355c7cd
4 changed files with 149 additions and 138 deletions

View File

@ -338,6 +338,14 @@ public class LayoutParsingPipeline {
} }
// compute ToC // compute ToC
List<AbstractPageBlock> headlines = classificationDocument.getPages()
.stream()
.flatMap(classificationPage -> classificationPage.getTextBlocks()
.stream()
.filter(tb -> tb.getClassification().isHeadline()))
.toList();
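// TODO(RED-7074): 'headlines' is not consumed by any line visible in this hunk; use it to build the ToC / section tree, or remove it.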
log.info("Building Sections for {}", identifier); log.info("Building Sections for {}", identifier);

View File

@ -4,7 +4,6 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.function.Function; import java.util.function.Function;
@ -13,19 +12,20 @@ import org.tinspin.index.Index;
import org.tinspin.index.kdtree.KDIterator; import org.tinspin.index.kdtree.KDIterator;
import org.tinspin.index.kdtree.KDTree; import org.tinspin.index.kdtree.KDTree;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
@Service @Service
public class BlockificationPostprocessingService { public class BlockificationPostprocessingService {
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f; private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
private static final double BLOCK_COMPARISON_PRECISION = 1.0;
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences() private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
.stream() .stream()
@ -36,59 +36,13 @@ public class BlockificationPostprocessingService {
.collect(RectangleTransformations.collectBBox()); .collect(RectangleTransformations.collectBBox());
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
.stream()
.filter(block -> block instanceof TextPageBlock)
.toList()
.stream()
.map(block -> (TextPageBlock) block)
.toList();
textBlocks.sort(Comparator.comparing(TextPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION))
.thenComparing(TextPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION)));
for (OutlineObject outlineObject : outlineObjects) {
boolean matchedExactly = false;
List<TextPageBlock> splitCandidates = new ArrayList<>();
List<TextPageBlock> mergeCandidates = new ArrayList<>();
for (TextPageBlock textPageBlock : textBlocks) {
matchedExactly = processOutlineObjectForTextBlock(textPageBlock, outlineObject, splitCandidates, mergeCandidates);
if (matchedExactly) {
break;
}
}
if (!matchedExactly) {
//selectMatch(outlineObject, kdTree, splitCandidates, mergeCandidates);
}
}
}
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) { public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks() if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
.stream()
.filter(block -> block instanceof TextPageBlock)
.toList()
.stream()
.map(block -> (TextPageBlock) block)
.toList();
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
return; return;
} }
KDTree<TextPageBlock> kdTree = KDTree.create(2); KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
textBlocks.forEach(block -> {
var boundingBox = blockToBoundingBox.apply(block);
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
});
for (OutlineObject outlineObject : outlineObjects) { for (OutlineObject outlineObject : outlineObjects) {
@ -99,36 +53,53 @@ public class BlockificationPostprocessingService {
new double[]{Double.MAX_VALUE, Double.MAX_VALUE}); new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
boolean matchedExactly = false; boolean matchedExactly = false;
List<TextPageBlock> splitCandidates = new ArrayList<>();
List<TextPageBlock> mergeCandidates = new ArrayList<>();
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
while (successorIterator.hasNext() && !matchedExactly) { while (successorIterator.hasNext() && !matchedExactly) {
TextPageBlock pageBlock = successorIterator.next().value(); TextPageBlock pageBlock = successorIterator.next().value();
matchedExactly = processOutlineObjectForTextBlock(pageBlock, outlineObject, splitCandidates, mergeCandidates); matchedExactly = processOutlineForTextBlock(pageBlock, context);
} }
if (!matchedExactly) { if (!matchedExactly) {
selectMatch(classificationPage, outlineObject, kdTree, splitCandidates, mergeCandidates); selectMatch(classificationPage, kdTree, context);
} }
} }
} }
private void selectMatch(ClassificationPage classificationPage, private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
OutlineObject outlineObject,
KDTree<TextPageBlock> kdTree,
List<TextPageBlock> splitCandidates,
List<TextPageBlock> mergeCandidates) {
for (TextPageBlock splitCandidate : splitCandidates) { List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
System.out.println(splitCandidate); .stream()
} .filter(block -> block instanceof TextPageBlock)
.toList()
.stream()
.map(block -> (TextPageBlock) block)
.toList();
KDTree<TextPageBlock> kdTree = KDTree.create(2);
textBlocks.forEach(block -> {
var boundingBox = blockToBoundingBox.apply(block);
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
});
return kdTree;
}
private void selectMatch(ClassificationPage classificationPage, KDTree<TextPageBlock> kdTree, OutlineProcessionContext context) {
OutlineObject outlineObject = context.outlineObject;
List<TextPageBlock> mergeCandidates = context.mergeCandidates;
TextPageBlock splitCandidate = context.splitCandidate;
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
if (!mergeCandidates.isEmpty()) { if (!mergeCandidates.isEmpty()) {
List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates); List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates); addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates); if (mergeCandidates.size() > 1) {
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
}
allMergeCandidates = allMergeCandidates.stream() allMergeCandidates = allMergeCandidates.stream()
.distinct() .distinct()
.toList(); .toList();
@ -146,36 +117,84 @@ public class BlockificationPostprocessingService {
bestCombination = combination; bestCombination = combination;
} }
} }
mergeBlocks(classificationPage, bestCombination); var merged = mergeBlocks(classificationPage, bestCombination);
merged.setClassification(headlineType);
}
if (splitCandidate != null) {
TextPageBlock other = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
splitCandidate.setClassification(headlineType);
other.setClassification(headlineType);
} }
} }
private void mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) { private TextPageBlock splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
if (blocksToMerge.size() <= 1) { List<TextPositionSequence> wordSequence = findWordSequence(blockToSplit.getSequences(), text);
return; List<TextPositionSequence> remaining = blockToSplit.getSequences();
} remaining.removeAll(wordSequence);
TextPageBlock firstBlock = blocksToMerge.get(0); blockToSplit.setSequences(wordSequence);
List<TextPageBlock> mergedBlocks = new ArrayList<>(); TextPageBlock other = buildTextBlock(remaining, 0);
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) { classificationPage.getTextBlocks().add(other);
return other;
}
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
if (textPageBlock.getDir() == firstBlock.getDir()) { private static List<TextPositionSequence> findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
firstBlock.getSequences().addAll(textPageBlock.getSequences());
mergedBlocks.add(textPageBlock); String target = text.replaceAll("\\s", "");
List<TextPositionSequence> inSequence = new ArrayList<>();
StringBuilder currentSequence = new StringBuilder();
for (TextPositionSequence sequence : textPositionSequences) {
if (currentSequence.toString().equals(target)) {
return inSequence;
}
currentSequence.append(sequence.toString());
inSequence.add(sequence);
if (currentSequence.length() > target.length()) {
TextPositionSequence removed = inSequence.remove(0);
currentSequence.delete(0, removed.toString().length());
while (currentSequence.length() > target.length()) {
removed = inSequence.remove(0);
currentSequence.delete(0, removed.toString().length());
} }
} }
} }
return new ArrayList<>();
}
assert firstBlock != null;
buildTextBlock(firstBlock.getSequences(), 0);
firstBlock.setToDuplicate(false);
classificationPage.getTextBlocks().removeAll(mergedBlocks);
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
TextPageBlock firstBlock = blocksToMerge.get(0);
if (blocksToMerge.size() > 1) {
List<TextPageBlock> mergedBlocks = new ArrayList<>();
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
if (textPageBlock.getDir() == firstBlock.getDir()) {
firstBlock.getSequences().addAll(textPageBlock.getSequences());
mergedBlocks.add(textPageBlock);
}
}
}
assert firstBlock != null;
firstBlock.setToDuplicate(false);
classificationPage.getTextBlocks().removeAll(mergedBlocks);
}
return firstBlock;
} }
@ -225,11 +244,9 @@ public class BlockificationPostprocessingService {
} }
private boolean processOutlineObjectForTextBlock(TextPageBlock pageBlock, private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
OutlineObject outlineObject,
List<TextPageBlock> splitCandidates,
List<TextPageBlock> mergeCandidates) {
OutlineObject outlineObject = context.getOutlineObject();
String blockText = pageBlock.getText(); String blockText = pageBlock.getText();
String outlineTitle = outlineObject.getTitle(); String outlineTitle = outlineObject.getTitle();
@ -245,73 +262,33 @@ public class BlockificationPostprocessingService {
return true; return true;
} }
if (blockTextContainsOutlineTitle) { if (outlineTitleContainsBlockText) {
splitCandidates.add(pageBlock); context.mergeCandidates.add(pageBlock);
} }
if (outlineTitleContainsBlockText) { if (blockTextContainsOutlineTitle && context.splitCandidate != null) {
mergeCandidates.add(pageBlock); context.splitCandidate = pageBlock;
} }
return false; return false;
} }
private void processOutlineObjectForTextBlockOld(ClassificationPage classificationPage, TextPageBlock pageBlock, OutlineObject outlineObject) { @Data
private static class OutlineProcessionContext {
String blockText = pageBlock.getText(); private OutlineObject outlineObject;
String outlineTitle = outlineObject.getTitle(); private List<TextPageBlock> mergeCandidates;
private TextPageBlock splitCandidate;
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
Rectangle2D boundingBox = pageBlock.getSequences() public OutlineProcessionContext(OutlineObject outlineObject) {
.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
.collect(RectangleTransformations.collectBBox()))
.collect(RectangleTransformations.collectBBox());
if (!isCloseToOutline(boundingBox, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) { this.outlineObject = outlineObject;
return; this.mergeCandidates = new ArrayList<>();
this.splitCandidate = null;
} }
if (blockText.equals(outlineTitle)) {
pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
return;
}
if (blockTextContainsOutlineTitle) {
splitTextBlock(pageBlock, outlineTitle, classificationPage);
}
if (outlineTitleContainsBlockText) {
// find other blocks, merge them into current, mark them for deletion after loop
}
}
private boolean isCloseToOutline(Rectangle2D boundingBox, OutlineObject outlineObject) {
float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;
//if (textBlock instanceof TextPageBlock) {
// List<TextPositionSequence> sequences = ((TextPageBlock) textBlock).getSequences();
// if (sequences != null) {
// float textHeightSum = 0;
// for (TextPositionSequence word : sequences) {
// textHeightSum += word.getTextHeight();
// }
// threshold = textHeightSum / sequences.size();
// }
//}
return boundingBox.getMinY() - outlineObject.getPoint().getY() < threshold && boundingBox.getMinX() - outlineObject.getPoint().getX() < threshold;
}
private void splitTextBlock(TextPageBlock pageBlock, String title, ClassificationPage classificationPage) {
} }
} }

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import lombok.SneakyThrows;
/**
 * Test class that has {@link OutlineExtractorService} and {@link BlockificationPostprocessingService}
 * injected. It currently contains a single test method with an empty body.
 */
public class OutlineProcessingTest extends BuildDocumentTest {
// Injected via @Autowired; not referenced by any test method in this class yet.
@Autowired
OutlineExtractorService outlineExtractorService;
// Injected via @Autowired; not referenced by any test method in this class yet.
@Autowired
BlockificationPostprocessingService blockificationPostprocessingService;
// TODO: placeholder - the body is empty, so this test passes without exercising either service.
@Test
@SneakyThrows
public void test() {
}
}

View File

@ -73,5 +73,6 @@ public class ViewerDocumentTest extends BuildDocumentTest {
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
} }
} }