RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
This commit is contained in:
parent
ca35feeb63
commit
894355c7cd
@ -338,6 +338,14 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
// compute ToC
|
||||
List<AbstractPageBlock> headlines = classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(tb -> tb.getClassification().isHeadline()))
|
||||
.toList();
|
||||
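// TODO(RED-7074): build the section tree (ToC) from the collected headlines; the list above is not used yet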
|
||||
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
|
||||
@ -4,7 +4,6 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
@ -13,19 +12,20 @@ import org.tinspin.index.Index;
|
||||
import org.tinspin.index.kdtree.KDIterator;
|
||||
import org.tinspin.index.kdtree.KDTree;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Service
|
||||
public class BlockificationPostprocessingService {
|
||||
|
||||
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
|
||||
private static final double BLOCK_COMPARISON_PRECISION = 1.0;
|
||||
|
||||
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
|
||||
.stream()
|
||||
@ -36,59 +36,13 @@ public class BlockificationPostprocessingService {
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
|
||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||
|
||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(block -> block instanceof TextPageBlock)
|
||||
.toList()
|
||||
.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
|
||||
textBlocks.sort(Comparator.comparing(TextPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION))
|
||||
.thenComparing(TextPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION)));
|
||||
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
|
||||
boolean matchedExactly = false;
|
||||
List<TextPageBlock> splitCandidates = new ArrayList<>();
|
||||
List<TextPageBlock> mergeCandidates = new ArrayList<>();
|
||||
|
||||
for (TextPageBlock textPageBlock : textBlocks) {
|
||||
matchedExactly = processOutlineObjectForTextBlock(textPageBlock, outlineObject, splitCandidates, mergeCandidates);
|
||||
|
||||
if (matchedExactly) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!matchedExactly) {
|
||||
//selectMatch(outlineObject, kdTree, splitCandidates, mergeCandidates);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||
|
||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(block -> block instanceof TextPageBlock)
|
||||
.toList()
|
||||
.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
|
||||
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
||||
textBlocks.forEach(block -> {
|
||||
var boundingBox = blockToBoundingBox.apply(block);
|
||||
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
|
||||
});
|
||||
KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
|
||||
|
||||
for (OutlineObject outlineObject : outlineObjects) {
|
||||
|
||||
@ -99,36 +53,53 @@ public class BlockificationPostprocessingService {
|
||||
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
|
||||
|
||||
boolean matchedExactly = false;
|
||||
List<TextPageBlock> splitCandidates = new ArrayList<>();
|
||||
List<TextPageBlock> mergeCandidates = new ArrayList<>();
|
||||
|
||||
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
||||
while (successorIterator.hasNext() && !matchedExactly) {
|
||||
TextPageBlock pageBlock = successorIterator.next().value();
|
||||
matchedExactly = processOutlineObjectForTextBlock(pageBlock, outlineObject, splitCandidates, mergeCandidates);
|
||||
matchedExactly = processOutlineForTextBlock(pageBlock, context);
|
||||
}
|
||||
|
||||
if (!matchedExactly) {
|
||||
selectMatch(classificationPage, outlineObject, kdTree, splitCandidates, mergeCandidates);
|
||||
selectMatch(classificationPage, kdTree, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void selectMatch(ClassificationPage classificationPage,
|
||||
OutlineObject outlineObject,
|
||||
KDTree<TextPageBlock> kdTree,
|
||||
List<TextPageBlock> splitCandidates,
|
||||
List<TextPageBlock> mergeCandidates) {
|
||||
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
|
||||
|
||||
for (TextPageBlock splitCandidate : splitCandidates) {
|
||||
System.out.println(splitCandidate);
|
||||
}
|
||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||
.stream()
|
||||
.filter(block -> block instanceof TextPageBlock)
|
||||
.toList()
|
||||
.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
|
||||
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
||||
textBlocks.forEach(block -> {
|
||||
var boundingBox = blockToBoundingBox.apply(block);
|
||||
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
|
||||
});
|
||||
return kdTree;
|
||||
}
|
||||
|
||||
|
||||
private void selectMatch(ClassificationPage classificationPage, KDTree<TextPageBlock> kdTree, OutlineProcessionContext context) {
|
||||
|
||||
OutlineObject outlineObject = context.outlineObject;
|
||||
List<TextPageBlock> mergeCandidates = context.mergeCandidates;
|
||||
TextPageBlock splitCandidate = context.splitCandidate;
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
|
||||
|
||||
if (!mergeCandidates.isEmpty()) {
|
||||
|
||||
List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
|
||||
addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
|
||||
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
||||
if (mergeCandidates.size() > 1) {
|
||||
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
||||
}
|
||||
allMergeCandidates = allMergeCandidates.stream()
|
||||
.distinct()
|
||||
.toList();
|
||||
@ -146,36 +117,84 @@ public class BlockificationPostprocessingService {
|
||||
bestCombination = combination;
|
||||
}
|
||||
}
|
||||
mergeBlocks(classificationPage, bestCombination);
|
||||
var merged = mergeBlocks(classificationPage, bestCombination);
|
||||
merged.setClassification(headlineType);
|
||||
}
|
||||
|
||||
if (splitCandidate != null) {
|
||||
TextPageBlock other = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
|
||||
splitCandidate.setClassification(headlineType);
|
||||
other.setClassification(headlineType);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
|
||||
private TextPageBlock splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
|
||||
|
||||
if (blocksToMerge.size() <= 1) {
|
||||
return;
|
||||
}
|
||||
List<TextPositionSequence> wordSequence = findWordSequence(blockToSplit.getSequences(), text);
|
||||
List<TextPositionSequence> remaining = blockToSplit.getSequences();
|
||||
remaining.removeAll(wordSequence);
|
||||
|
||||
TextPageBlock firstBlock = blocksToMerge.get(0);
|
||||
blockToSplit.setSequences(wordSequence);
|
||||
|
||||
List<TextPageBlock> mergedBlocks = new ArrayList<>();
|
||||
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
|
||||
TextPageBlock other = buildTextBlock(remaining, 0);
|
||||
classificationPage.getTextBlocks().add(other);
|
||||
return other;
|
||||
}
|
||||
|
||||
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
|
||||
|
||||
if (textPageBlock.getDir() == firstBlock.getDir()) {
|
||||
firstBlock.getSequences().addAll(textPageBlock.getSequences());
|
||||
mergedBlocks.add(textPageBlock);
|
||||
private static List<TextPositionSequence> findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
|
||||
|
||||
String target = text.replaceAll("\\s", "");
|
||||
List<TextPositionSequence> inSequence = new ArrayList<>();
|
||||
StringBuilder currentSequence = new StringBuilder();
|
||||
|
||||
for (TextPositionSequence sequence : textPositionSequences) {
|
||||
|
||||
if (currentSequence.toString().equals(target)) {
|
||||
return inSequence;
|
||||
}
|
||||
currentSequence.append(sequence.toString());
|
||||
inSequence.add(sequence);
|
||||
|
||||
if (currentSequence.length() > target.length()) {
|
||||
TextPositionSequence removed = inSequence.remove(0);
|
||||
currentSequence.delete(0, removed.toString().length());
|
||||
|
||||
while (currentSequence.length() > target.length()) {
|
||||
removed = inSequence.remove(0);
|
||||
currentSequence.delete(0, removed.toString().length());
|
||||
}
|
||||
}
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
assert firstBlock != null;
|
||||
buildTextBlock(firstBlock.getSequences(), 0);
|
||||
firstBlock.setToDuplicate(false);
|
||||
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||
|
||||
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
|
||||
|
||||
TextPageBlock firstBlock = blocksToMerge.get(0);
|
||||
|
||||
if (blocksToMerge.size() > 1) {
|
||||
|
||||
List<TextPageBlock> mergedBlocks = new ArrayList<>();
|
||||
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
|
||||
|
||||
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
|
||||
|
||||
if (textPageBlock.getDir() == firstBlock.getDir()) {
|
||||
firstBlock.getSequences().addAll(textPageBlock.getSequences());
|
||||
mergedBlocks.add(textPageBlock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert firstBlock != null;
|
||||
firstBlock.setToDuplicate(false);
|
||||
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||
}
|
||||
|
||||
return firstBlock;
|
||||
}
|
||||
|
||||
|
||||
@ -225,11 +244,9 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private boolean processOutlineObjectForTextBlock(TextPageBlock pageBlock,
|
||||
OutlineObject outlineObject,
|
||||
List<TextPageBlock> splitCandidates,
|
||||
List<TextPageBlock> mergeCandidates) {
|
||||
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
|
||||
|
||||
OutlineObject outlineObject = context.getOutlineObject();
|
||||
String blockText = pageBlock.getText();
|
||||
String outlineTitle = outlineObject.getTitle();
|
||||
|
||||
@ -245,73 +262,33 @@ public class BlockificationPostprocessingService {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (blockTextContainsOutlineTitle) {
|
||||
splitCandidates.add(pageBlock);
|
||||
if (outlineTitleContainsBlockText) {
|
||||
context.mergeCandidates.add(pageBlock);
|
||||
}
|
||||
|
||||
if (outlineTitleContainsBlockText) {
|
||||
mergeCandidates.add(pageBlock);
|
||||
if (blockTextContainsOutlineTitle && context.splitCandidate != null) {
|
||||
context.splitCandidate = pageBlock;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private void processOutlineObjectForTextBlockOld(ClassificationPage classificationPage, TextPageBlock pageBlock, OutlineObject outlineObject) {
|
||||
@Data
|
||||
private static class OutlineProcessionContext {
|
||||
|
||||
String blockText = pageBlock.getText();
|
||||
String outlineTitle = outlineObject.getTitle();
|
||||
private OutlineObject outlineObject;
|
||||
private List<TextPageBlock> mergeCandidates;
|
||||
private TextPageBlock splitCandidate;
|
||||
|
||||
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
||||
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
||||
|
||||
Rectangle2D boundingBox = pageBlock.getSequences()
|
||||
.stream()
|
||||
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||
.stream()
|
||||
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
|
||||
.collect(RectangleTransformations.collectBBox()))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||
|
||||
if (!isCloseToOutline(boundingBox, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
|
||||
return;
|
||||
this.outlineObject = outlineObject;
|
||||
this.mergeCandidates = new ArrayList<>();
|
||||
this.splitCandidate = null;
|
||||
}
|
||||
|
||||
if (blockText.equals(outlineTitle)) {
|
||||
|
||||
pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
|
||||
return;
|
||||
}
|
||||
|
||||
if (blockTextContainsOutlineTitle) {
|
||||
splitTextBlock(pageBlock, outlineTitle, classificationPage);
|
||||
}
|
||||
|
||||
if (outlineTitleContainsBlockText) {
|
||||
// find other blocks, merge them into current, mark them for deletion after loop
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean isCloseToOutline(Rectangle2D boundingBox, OutlineObject outlineObject) {
|
||||
|
||||
float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;
|
||||
//if (textBlock instanceof TextPageBlock) {
|
||||
// List<TextPositionSequence> sequences = ((TextPageBlock) textBlock).getSequences();
|
||||
// if (sequences != null) {
|
||||
// float textHeightSum = 0;
|
||||
// for (TextPositionSequence word : sequences) {
|
||||
// textHeightSum += word.getTextHeight();
|
||||
// }
|
||||
// threshold = textHeightSum / sequences.size();
|
||||
// }
|
||||
//}
|
||||
|
||||
return boundingBox.getMinY() - outlineObject.getPoint().getY() < threshold && boundingBox.getMinX() - outlineObject.getPoint().getX() < threshold;
|
||||
}
|
||||
|
||||
|
||||
private void splitTextBlock(TextPageBlock pageBlock, String title, ClassificationPage classificationPage) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,25 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class OutlineProcessingTest extends BuildDocumentTest {
|
||||
|
||||
@Autowired
|
||||
OutlineExtractorService outlineExtractorService;
|
||||
@Autowired
|
||||
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void test() {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -73,5 +73,6 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user