diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index c791d27..91ce8f1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -71,13 +71,28 @@ public abstract class AbstractPageBlock { return maxX - minX; } + public abstract boolean isEmpty(); - public boolean intersectsY(AbstractPageBlock atc) { + public boolean intersectsY(AbstractPageBlock apb) { - return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY(); + return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY(); + } + + public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) { + + return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold); } - public abstract boolean isEmpty(); + private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) { + + return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY(); + } + + + private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) { + + return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX(); + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java index abcbcac..428d529 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java @@ -1,11 +1,16 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockification; +import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING; + import java.util.ArrayList; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Stream; import org.springframework.stereotype.Service; @@ -22,8 +27,12 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirA public class TaasBlockificationService { private static final float THRESHOLD = 1f; - private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; + private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; // multiplied with text height + private static final float INTERSECTS_Y_THRESHOLD = 2 * HEIGHT_PADDING; // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting. 
private static final int X_GAP_SPLIT_CONSTANT = 50; + public static final int X_ALIGNMENT_THRESHOLD = 1; + public static final int SMALL_Y_GAP_THRESHOLD = 5; + public static final int NEGATIVE_X_GAP_THRESHOLD = -5; /** @@ -39,14 +48,28 @@ public class TaasBlockificationService { public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { List classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines); - - classificationTextBlocks = mergeFineGranularTextPageBlocks(classificationTextBlocks); + classificationTextBlocks = mergeTextPageBlocksAligningX(classificationTextBlocks); + classificationTextBlocks = mergeIntersectingTextBlocksUntilConvergence(classificationTextBlocks); return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList())); } - private List mergeFineGranularTextPageBlocks(List classificationTextBlocks) { + private List mergeIntersectingTextBlocksUntilConvergence(List classificationTextBlocks) { + + int currentSize = classificationTextBlocks.size(); + while (true) { + classificationTextBlocks = mergeTextPageBlocksAlmostIntersecting(classificationTextBlocks); + if (classificationTextBlocks.size() == currentSize) { + break; + } + currentSize = classificationTextBlocks.size(); + } + return classificationTextBlocks; + } + + + private List mergeTextPageBlocksAligningX(List classificationTextBlocks) { if (classificationTextBlocks.isEmpty()) { return new ArrayList<>(); @@ -61,8 +84,8 @@ public class TaasBlockificationService { previousTextBlock = currentTextBlock; continue; } - boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1; - boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5; + boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - 
previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD; + boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < SMALL_Y_GAP_THRESHOLD; if (alignsXRight && smallYGap) { currentTextBlocksToMerge.add(currentTextBlock); } else { @@ -76,6 +99,23 @@ public class TaasBlockificationService { } + private List<TextPageBlock> mergeTextPageBlocksAlmostIntersecting(List<TextPageBlock> textPageBlocks) { + + Set<TextPageBlock> alreadyMerged = new HashSet<>(); + List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>(); + for (TextPageBlock textPageBlock : textPageBlocks) { + if (alreadyMerged.contains(textPageBlock)) { + continue; + } + alreadyMerged.add(textPageBlock); // seed is marked first, so the filter below cannot select it again + textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock), + textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add)) + .toList()); + } + return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); + } + + private void assignOrientations(List classificationTextBlocks) { Iterator itty = classificationTextBlocks.iterator(); @@ -149,7 +189,7 @@ public class TaasBlockificationService { boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj()); boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine; - boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; + boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < NEGATIVE_X_GAP_THRESHOLD; boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); boolean splitByRuling = isSplitByRuling(minX, 
minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); @@ -164,7 +204,7 @@ public class TaasBlockificationService { Orientation prevOrientation = null; if (!classificationTextBlocks.isEmpty()) { - prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation(); + prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation(); // last element; X_ALIGNMENT_THRESHOLD is a coordinate tolerance, not an index offset } TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPageBlockComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPageBlockComparator.java new file mode 100644 index 0000000..56afc16 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPageBlockComparator.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.util.Comparator; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +public class TextPageBlockComparator implements Comparator<TextPageBlock> +{ + @Override + public int compare(TextPageBlock pos1, TextPageBlock pos2) + { + // only compare text that is in the same direction + int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees()); + if (cmp1 != 0) + { + return cmp1; + } + + // get the text direction adjusted coordinates + float x1 = pos1.getMinX(); + float x2 = pos2.getMinX(); + + float pos1YBottom = pos1.getMaxY(); + float pos2YBottom = pos2.getMaxY(); + + // note that the coordinates have been adjusted so 0,0 is in upper left + float pos1YTop = pos1YBottom - pos1.getHeight(); + float pos2YTop = pos2YBottom - pos2.getHeight(); + + float yDifference = Math.abs(pos1YBottom - pos2YBottom); + + // we will do a simple tolerance comparison + if (yDifference < .1 || + pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || + pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) + { + return Float.compare(x1, x2); + } + else if (pos1YBottom < pos2YBottom) + { + return -1; + } + else + { + return 1; + } + } +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 9db7867..8875e01 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -25,8 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest { LayoutGridService 
layoutGridService = new LayoutGridService(); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - String fileName = "files/marked_content/Header-Header.pdf"; - Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + String fileName = "files/bdr/notMergedParagraphs.pdf"; + Document document = buildGraph(fileName, LayoutParsingType.TAAS); String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { viewerDocumentService.createViewerDocument(pdDocument, document, out, true); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/notMergedParagraphs.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/notMergedParagraphs.pdf new file mode 100644 index 0000000..2022a4d Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/notMergedParagraphs.pdf differ