Merge branch 'TAAS-104' into 'main'
TAAS-104: merge visually intersecting Paragraphs See merge request fforesight/layout-parser!73
This commit is contained in:
commit
b8ef55e6e2
@ -72,9 +72,27 @@ public abstract class AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractPageBlock atc) {
|
||||
public boolean intersectsY(AbstractPageBlock apb) {
|
||||
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY();
|
||||
}
|
||||
|
||||
|
||||
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,11 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
|
||||
// TODO: figure out why the static import below fails the build
|
||||
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -22,8 +29,12 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirA
|
||||
public class TaasBlockificationService {
|
||||
|
||||
private static final float THRESHOLD = 1f;
|
||||
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f;
|
||||
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; // multiplied with text height
|
||||
// Exactly 2 * HEIGHT_PADDING, i.e. twice our position height padding. Required to find boxes that are visually intersecting.
private static final float INTERSECTS_Y_THRESHOLD = 4;
|
||||
private static final int X_GAP_SPLIT_CONSTANT = 50;
|
||||
public static final int X_ALIGNMENT_THRESHOLD = 1;
|
||||
public static final int SMALL_Y_GAP_THRESHOLD = 5;
|
||||
public static final int NEGATIVE_X_GAP_THRESHOLD = -5;
|
||||
|
||||
|
||||
/**
|
||||
@ -39,14 +50,28 @@ public class TaasBlockificationService {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TextPageBlock> classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);
|
||||
|
||||
classificationTextBlocks = mergeFineGranularTextPageBlocks(classificationTextBlocks);
|
||||
classificationTextBlocks = mergeTextPageBlocksAligningX(classificationTextBlocks);
|
||||
classificationTextBlocks = mergeIntersectingTextBlocksUntilConvergence(classificationTextBlocks);
|
||||
|
||||
return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList()));
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeFineGranularTextPageBlocks(List<TextPageBlock> classificationTextBlocks) {
|
||||
private List<TextPageBlock> mergeIntersectingTextBlocksUntilConvergence(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
int currentSize = classificationTextBlocks.size();
|
||||
while (true) {
|
||||
classificationTextBlocks = mergeTextPageBlocksAlmostIntersecting(classificationTextBlocks);
|
||||
if (classificationTextBlocks.size() == currentSize) {
|
||||
break;
|
||||
}
|
||||
currentSize = classificationTextBlocks.size();
|
||||
}
|
||||
return classificationTextBlocks;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeTextPageBlocksAligningX(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
if (classificationTextBlocks.isEmpty()) {
|
||||
return new ArrayList<>();
|
||||
@ -61,8 +86,8 @@ public class TaasBlockificationService {
|
||||
previousTextBlock = currentTextBlock;
|
||||
continue;
|
||||
}
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1;
|
||||
boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5;
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||
boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < SMALL_Y_GAP_THRESHOLD;
|
||||
if (alignsXRight && smallYGap) {
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
} else {
|
||||
@ -76,6 +101,23 @@ public class TaasBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeTextPageBlocksAlmostIntersecting(List<TextPageBlock> textPageBlocks) {
|
||||
|
||||
Set<TextPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
|
||||
for (TextPageBlock textPageBlock : textPageBlocks) {
|
||||
if (alreadyMerged.contains(textPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
alreadyMerged.add(textPageBlock);
|
||||
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
||||
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
|
||||
.toList());
|
||||
}
|
||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||
}
|
||||
|
||||
|
||||
private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
Iterator<TextPageBlock> itty = classificationTextBlocks.iterator();
|
||||
@ -149,7 +191,7 @@ public class TaasBlockificationService {
|
||||
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
|
||||
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < NEGATIVE_X_GAP_THRESHOLD;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
@ -164,7 +206,7 @@ public class TaasBlockificationService {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!classificationTextBlocks.isEmpty()) {
|
||||
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation();
|
||||
// Index of the last block. The literal 1 is intentional: X_ALIGNMENT_THRESHOLD is a geometric tolerance that merely happens to equal 1.
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
|
||||
|
||||
@ -25,8 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
String fileName = "files/marked_content/Header-Header.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user