TAAS-104: merge visually intersecting Paragraphs
This commit is contained in:
parent
8dba392904
commit
621c3f269d
@ -71,13 +71,28 @@ public abstract class AbstractPageBlock {
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
public abstract boolean isEmpty();
|
||||
|
||||
public boolean intersectsY(AbstractPageBlock atc) {
|
||||
public boolean intersectsY(AbstractPageBlock apb) {
|
||||
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY();
|
||||
}
|
||||
|
||||
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
public abstract boolean isEmpty();
|
||||
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY();
|
||||
}
|
||||
|
||||
|
||||
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,11 +1,16 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -22,8 +27,12 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirA
|
||||
public class TaasBlockificationService {
|
||||
|
||||
private static final float THRESHOLD = 1f;
|
||||
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f;
|
||||
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; // multiplied with text height
|
||||
private static final float INTERSECTS_Y_THRESHOLD = 2 * HEIGHT_PADDING; // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting.
|
||||
private static final int X_GAP_SPLIT_CONSTANT = 50;
|
||||
public static final int X_ALIGNMENT_THRESHOLD = 1;
|
||||
public static final int SMALL_Y_GAP_THRESHOLD = 5;
|
||||
public static final int NEGATIVE_X_GAP_THRESHOLD = -5;
|
||||
|
||||
|
||||
/**
|
||||
@ -39,14 +48,28 @@ public class TaasBlockificationService {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TextPageBlock> classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);
|
||||
|
||||
classificationTextBlocks = mergeFineGranularTextPageBlocks(classificationTextBlocks);
|
||||
classificationTextBlocks = mergeTextPageBlocksAligningX(classificationTextBlocks);
|
||||
classificationTextBlocks = mergeIntersectingTextBlocksUntilConvergence(classificationTextBlocks);
|
||||
|
||||
return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList()));
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeFineGranularTextPageBlocks(List<TextPageBlock> classificationTextBlocks) {
|
||||
private List<TextPageBlock> mergeIntersectingTextBlocksUntilConvergence(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
int currentSize = classificationTextBlocks.size();
|
||||
while (true) {
|
||||
classificationTextBlocks = mergeTextPageBlocksAlmostIntersecting(classificationTextBlocks);
|
||||
if (classificationTextBlocks.size() == currentSize) {
|
||||
break;
|
||||
}
|
||||
currentSize = classificationTextBlocks.size();
|
||||
}
|
||||
return classificationTextBlocks;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeTextPageBlocksAligningX(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
if (classificationTextBlocks.isEmpty()) {
|
||||
return new ArrayList<>();
|
||||
@ -61,8 +84,8 @@ public class TaasBlockificationService {
|
||||
previousTextBlock = currentTextBlock;
|
||||
continue;
|
||||
}
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1;
|
||||
boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5;
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||
boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < SMALL_Y_GAP_THRESHOLD;
|
||||
if (alignsXRight && smallYGap) {
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
} else {
|
||||
@ -76,6 +99,23 @@ public class TaasBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> mergeTextPageBlocksAlmostIntersecting(List<TextPageBlock> textPageBlocks) {
|
||||
|
||||
Set<TextPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
|
||||
for (TextPageBlock textPageBlock : textPageBlocks) {
|
||||
if (alreadyMerged.contains(textPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
alreadyMerged.add(textPageBlock);
|
||||
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
||||
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
|
||||
.toList());
|
||||
}
|
||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||
}
|
||||
|
||||
|
||||
private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {
|
||||
|
||||
Iterator<TextPageBlock> itty = classificationTextBlocks.iterator();
|
||||
@ -149,7 +189,7 @@ public class TaasBlockificationService {
|
||||
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
|
||||
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < NEGATIVE_X_GAP_THRESHOLD;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
@ -164,7 +204,7 @@ public class TaasBlockificationService {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!classificationTextBlocks.isEmpty()) {
|
||||
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation();
|
||||
prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - X_ALIGNMENT_THRESHOLD).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
|
||||
|
||||
@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
public class TextPageBlockComparator implements Comparator<TextPageBlock>
|
||||
{
|
||||
@Override
|
||||
public int compare(TextPageBlock pos1, TextPageBlock pos2)
|
||||
{
|
||||
// only compare text that is in the same direction
|
||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||
if (cmp1 != 0)
|
||||
{
|
||||
return cmp1;
|
||||
}
|
||||
|
||||
// get the text direction adjusted coordinates
|
||||
float x1 = pos1.getMinX();
|
||||
float x2 = pos2.getMinX();
|
||||
|
||||
float pos1YBottom = pos1.getMaxY();
|
||||
float pos2YBottom = pos2.getMaxY();
|
||||
|
||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||
float pos1YTop = pos1YBottom - pos1.getHeight();
|
||||
float pos2YTop = pos2YBottom - pos2.getHeight();
|
||||
|
||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
// we will do a simple tolerance comparison
|
||||
if (yDifference < .1 ||
|
||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
||||
{
|
||||
return Float.compare(x1, x2);
|
||||
}
|
||||
else if (pos1YBottom < pos2YBottom)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -25,8 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
String fileName = "files/marked_content/Header-Header.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user