RED-7074: Design Subsection section tree structure algorithm
* first draft: further implementations
This commit is contained in:
parent
ca35feeb63
commit
894355c7cd
@ -338,6 +338,14 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// compute ToC
|
// compute ToC
|
||||||
|
List<AbstractPageBlock> headlines = classificationDocument.getPages()
|
||||||
|
.stream()
|
||||||
|
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(tb -> tb.getClassification().isHeadline()))
|
||||||
|
.toList();
|
||||||
|
// ???
|
||||||
|
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
|
|
||||||
|
|||||||
@ -4,7 +4,6 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.bloc
|
|||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
@ -13,19 +12,20 @@ import org.tinspin.index.Index;
|
|||||||
import org.tinspin.index.kdtree.KDIterator;
|
import org.tinspin.index.kdtree.KDIterator;
|
||||||
import org.tinspin.index.kdtree.KDTree;
|
import org.tinspin.index.kdtree.KDTree;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class BlockificationPostprocessingService {
|
public class BlockificationPostprocessingService {
|
||||||
|
|
||||||
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
|
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
|
||||||
private static final double BLOCK_COMPARISON_PRECISION = 1.0;
|
|
||||||
|
|
||||||
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
|
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
|
||||||
.stream()
|
.stream()
|
||||||
@ -36,59 +36,13 @@ public class BlockificationPostprocessingService {
|
|||||||
.collect(RectangleTransformations.collectBBox());
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
|
||||||
public void sanitizeOutlineBlocks(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
|
||||||
|
|
||||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
|
||||||
.stream()
|
|
||||||
.filter(block -> block instanceof TextPageBlock)
|
|
||||||
.toList()
|
|
||||||
.stream()
|
|
||||||
.map(block -> (TextPageBlock) block)
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
textBlocks.sort(Comparator.comparing(TextPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION))
|
|
||||||
.thenComparing(TextPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, BLOCK_COMPARISON_PRECISION)));
|
|
||||||
|
|
||||||
for (OutlineObject outlineObject : outlineObjects) {
|
|
||||||
|
|
||||||
boolean matchedExactly = false;
|
|
||||||
List<TextPageBlock> splitCandidates = new ArrayList<>();
|
|
||||||
List<TextPageBlock> mergeCandidates = new ArrayList<>();
|
|
||||||
|
|
||||||
for (TextPageBlock textPageBlock : textBlocks) {
|
|
||||||
matchedExactly = processOutlineObjectForTextBlock(textPageBlock, outlineObject, splitCandidates, mergeCandidates);
|
|
||||||
|
|
||||||
if (matchedExactly) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!matchedExactly) {
|
|
||||||
//selectMatch(outlineObject, kdTree, splitCandidates, mergeCandidates);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
public void sanitizeOutlineBlocksWithKdTree(ClassificationPage classificationPage, List<OutlineObject> outlineObjects) {
|
||||||
|
|
||||||
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
if (classificationPage.getTextBlocks().isEmpty() || outlineObjects.isEmpty()) {
|
||||||
.stream()
|
|
||||||
.filter(block -> block instanceof TextPageBlock)
|
|
||||||
.toList()
|
|
||||||
.stream()
|
|
||||||
.map(block -> (TextPageBlock) block)
|
|
||||||
.toList();
|
|
||||||
if (textBlocks.isEmpty() || outlineObjects.isEmpty()) {
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
KDTree<TextPageBlock> kdTree = createKdTree(classificationPage);
|
||||||
textBlocks.forEach(block -> {
|
|
||||||
var boundingBox = blockToBoundingBox.apply(block);
|
|
||||||
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
|
|
||||||
});
|
|
||||||
|
|
||||||
for (OutlineObject outlineObject : outlineObjects) {
|
for (OutlineObject outlineObject : outlineObjects) {
|
||||||
|
|
||||||
@ -99,36 +53,53 @@ public class BlockificationPostprocessingService {
|
|||||||
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
|
new double[]{Double.MAX_VALUE, Double.MAX_VALUE});
|
||||||
|
|
||||||
boolean matchedExactly = false;
|
boolean matchedExactly = false;
|
||||||
List<TextPageBlock> splitCandidates = new ArrayList<>();
|
|
||||||
List<TextPageBlock> mergeCandidates = new ArrayList<>();
|
|
||||||
|
|
||||||
|
OutlineProcessionContext context = new OutlineProcessionContext(outlineObject);
|
||||||
while (successorIterator.hasNext() && !matchedExactly) {
|
while (successorIterator.hasNext() && !matchedExactly) {
|
||||||
TextPageBlock pageBlock = successorIterator.next().value();
|
TextPageBlock pageBlock = successorIterator.next().value();
|
||||||
matchedExactly = processOutlineObjectForTextBlock(pageBlock, outlineObject, splitCandidates, mergeCandidates);
|
matchedExactly = processOutlineForTextBlock(pageBlock, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!matchedExactly) {
|
if (!matchedExactly) {
|
||||||
selectMatch(classificationPage, outlineObject, kdTree, splitCandidates, mergeCandidates);
|
selectMatch(classificationPage, kdTree, context);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void selectMatch(ClassificationPage classificationPage,
|
private static KDTree<TextPageBlock> createKdTree(ClassificationPage classificationPage) {
|
||||||
OutlineObject outlineObject,
|
|
||||||
KDTree<TextPageBlock> kdTree,
|
|
||||||
List<TextPageBlock> splitCandidates,
|
|
||||||
List<TextPageBlock> mergeCandidates) {
|
|
||||||
|
|
||||||
for (TextPageBlock splitCandidate : splitCandidates) {
|
List<TextPageBlock> textBlocks = classificationPage.getTextBlocks()
|
||||||
System.out.println(splitCandidate);
|
.stream()
|
||||||
}
|
.filter(block -> block instanceof TextPageBlock)
|
||||||
|
.toList()
|
||||||
|
.stream()
|
||||||
|
.map(block -> (TextPageBlock) block)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
KDTree<TextPageBlock> kdTree = KDTree.create(2);
|
||||||
|
textBlocks.forEach(block -> {
|
||||||
|
var boundingBox = blockToBoundingBox.apply(block);
|
||||||
|
kdTree.insert(new double[]{boundingBox.getMinX(), boundingBox.getMaxY()}, block);
|
||||||
|
});
|
||||||
|
return kdTree;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void selectMatch(ClassificationPage classificationPage, KDTree<TextPageBlock> kdTree, OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
OutlineObject outlineObject = context.outlineObject;
|
||||||
|
List<TextPageBlock> mergeCandidates = context.mergeCandidates;
|
||||||
|
TextPageBlock splitCandidate = context.splitCandidate;
|
||||||
|
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
|
||||||
|
|
||||||
if (!mergeCandidates.isEmpty()) {
|
if (!mergeCandidates.isEmpty()) {
|
||||||
|
|
||||||
List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
|
List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
|
||||||
addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
|
addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
|
||||||
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
if (mergeCandidates.size() > 1) {
|
||||||
|
addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
||||||
|
}
|
||||||
allMergeCandidates = allMergeCandidates.stream()
|
allMergeCandidates = allMergeCandidates.stream()
|
||||||
.distinct()
|
.distinct()
|
||||||
.toList();
|
.toList();
|
||||||
@ -146,36 +117,84 @@ public class BlockificationPostprocessingService {
|
|||||||
bestCombination = combination;
|
bestCombination = combination;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mergeBlocks(classificationPage, bestCombination);
|
var merged = mergeBlocks(classificationPage, bestCombination);
|
||||||
|
merged.setClassification(headlineType);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (splitCandidate != null) {
|
||||||
|
TextPageBlock other = splitBlock(classificationPage, splitCandidate, outlineObject.getTitle());
|
||||||
|
splitCandidate.setClassification(headlineType);
|
||||||
|
other.setClassification(headlineType);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
|
private TextPageBlock splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, String text) {
|
||||||
|
|
||||||
if (blocksToMerge.size() <= 1) {
|
List<TextPositionSequence> wordSequence = findWordSequence(blockToSplit.getSequences(), text);
|
||||||
return;
|
List<TextPositionSequence> remaining = blockToSplit.getSequences();
|
||||||
}
|
remaining.removeAll(wordSequence);
|
||||||
|
|
||||||
TextPageBlock firstBlock = blocksToMerge.get(0);
|
blockToSplit.setSequences(wordSequence);
|
||||||
|
|
||||||
List<TextPageBlock> mergedBlocks = new ArrayList<>();
|
TextPageBlock other = buildTextBlock(remaining, 0);
|
||||||
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
|
classificationPage.getTextBlocks().add(other);
|
||||||
|
return other;
|
||||||
|
}
|
||||||
|
|
||||||
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
|
|
||||||
|
|
||||||
if (textPageBlock.getDir() == firstBlock.getDir()) {
|
private static List<TextPositionSequence> findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
|
||||||
firstBlock.getSequences().addAll(textPageBlock.getSequences());
|
|
||||||
mergedBlocks.add(textPageBlock);
|
String target = text.replaceAll("\\s", "");
|
||||||
|
List<TextPositionSequence> inSequence = new ArrayList<>();
|
||||||
|
StringBuilder currentSequence = new StringBuilder();
|
||||||
|
|
||||||
|
for (TextPositionSequence sequence : textPositionSequences) {
|
||||||
|
|
||||||
|
if (currentSequence.toString().equals(target)) {
|
||||||
|
return inSequence;
|
||||||
|
}
|
||||||
|
currentSequence.append(sequence.toString());
|
||||||
|
inSequence.add(sequence);
|
||||||
|
|
||||||
|
if (currentSequence.length() > target.length()) {
|
||||||
|
TextPositionSequence removed = inSequence.remove(0);
|
||||||
|
currentSequence.delete(0, removed.toString().length());
|
||||||
|
|
||||||
|
while (currentSequence.length() > target.length()) {
|
||||||
|
removed = inSequence.remove(0);
|
||||||
|
currentSequence.delete(0, removed.toString().length());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
assert firstBlock != null;
|
|
||||||
buildTextBlock(firstBlock.getSequences(), 0);
|
|
||||||
firstBlock.setToDuplicate(false);
|
|
||||||
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
|
||||||
|
|
||||||
|
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
|
||||||
|
|
||||||
|
TextPageBlock firstBlock = blocksToMerge.get(0);
|
||||||
|
|
||||||
|
if (blocksToMerge.size() > 1) {
|
||||||
|
|
||||||
|
List<TextPageBlock> mergedBlocks = new ArrayList<>();
|
||||||
|
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
|
||||||
|
|
||||||
|
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
|
||||||
|
|
||||||
|
if (textPageBlock.getDir() == firstBlock.getDir()) {
|
||||||
|
firstBlock.getSequences().addAll(textPageBlock.getSequences());
|
||||||
|
mergedBlocks.add(textPageBlock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert firstBlock != null;
|
||||||
|
firstBlock.setToDuplicate(false);
|
||||||
|
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
return firstBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -225,11 +244,9 @@ public class BlockificationPostprocessingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean processOutlineObjectForTextBlock(TextPageBlock pageBlock,
|
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
|
||||||
OutlineObject outlineObject,
|
|
||||||
List<TextPageBlock> splitCandidates,
|
|
||||||
List<TextPageBlock> mergeCandidates) {
|
|
||||||
|
|
||||||
|
OutlineObject outlineObject = context.getOutlineObject();
|
||||||
String blockText = pageBlock.getText();
|
String blockText = pageBlock.getText();
|
||||||
String outlineTitle = outlineObject.getTitle();
|
String outlineTitle = outlineObject.getTitle();
|
||||||
|
|
||||||
@ -245,73 +262,33 @@ public class BlockificationPostprocessingService {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (blockTextContainsOutlineTitle) {
|
if (outlineTitleContainsBlockText) {
|
||||||
splitCandidates.add(pageBlock);
|
context.mergeCandidates.add(pageBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (outlineTitleContainsBlockText) {
|
if (blockTextContainsOutlineTitle && context.splitCandidate != null) {
|
||||||
mergeCandidates.add(pageBlock);
|
context.splitCandidate = pageBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void processOutlineObjectForTextBlockOld(ClassificationPage classificationPage, TextPageBlock pageBlock, OutlineObject outlineObject) {
|
@Data
|
||||||
|
private static class OutlineProcessionContext {
|
||||||
|
|
||||||
String blockText = pageBlock.getText();
|
private OutlineObject outlineObject;
|
||||||
String outlineTitle = outlineObject.getTitle();
|
private List<TextPageBlock> mergeCandidates;
|
||||||
|
private TextPageBlock splitCandidate;
|
||||||
|
|
||||||
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
|
||||||
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
|
||||||
|
|
||||||
Rectangle2D boundingBox = pageBlock.getSequences()
|
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||||
.stream()
|
|
||||||
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
|
||||||
.stream()
|
|
||||||
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
|
|
||||||
.collect(RectangleTransformations.collectBBox()))
|
|
||||||
.collect(RectangleTransformations.collectBBox());
|
|
||||||
|
|
||||||
if (!isCloseToOutline(boundingBox, outlineObject) || !blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
|
this.outlineObject = outlineObject;
|
||||||
return;
|
this.mergeCandidates = new ArrayList<>();
|
||||||
|
this.splitCandidate = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (blockText.equals(outlineTitle)) {
|
|
||||||
|
|
||||||
pageBlock.setClassification(PageBlockType.getHeadlineType(outlineObject.getTreeDepth()));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (blockTextContainsOutlineTitle) {
|
|
||||||
splitTextBlock(pageBlock, outlineTitle, classificationPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (outlineTitleContainsBlockText) {
|
|
||||||
// find other blocks, merge them into current, mark them for deletion after loop
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isCloseToOutline(Rectangle2D boundingBox, OutlineObject outlineObject) {
|
|
||||||
|
|
||||||
float threshold = BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD;
|
|
||||||
//if (textBlock instanceof TextPageBlock) {
|
|
||||||
// List<TextPositionSequence> sequences = ((TextPageBlock) textBlock).getSequences();
|
|
||||||
// if (sequences != null) {
|
|
||||||
// float textHeightSum = 0;
|
|
||||||
// for (TextPositionSequence word : sequences) {
|
|
||||||
// textHeightSum += word.getTextHeight();
|
|
||||||
// }
|
|
||||||
// threshold = textHeightSum / sequences.size();
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
return boundingBox.getMinY() - outlineObject.getPoint().getY() < threshold && boundingBox.getMinX() - outlineObject.getPoint().getX() < threshold;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void splitTextBlock(TextPageBlock pageBlock, String title, ClassificationPage classificationPage) {
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,25 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.OutlineExtractorService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
public class OutlineProcessingTest extends BuildDocumentTest {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
OutlineExtractorService outlineExtractorService;
|
||||||
|
@Autowired
|
||||||
|
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void test() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -73,5 +73,6 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
|
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user