hotfix: Fixed parsing for specific taas document

2023-10-17 15:52:19 +02:00 · 2023-10-17 15:52:19 +02:00 · 567cbc178b
commit 567cbc178b
parent 3c53772765
2 changed files with 27 additions and 19 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java
@ -4,18 +4,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
 // TODO: figure out why this fails the build
 // import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;

-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Stream;
-
-import org.springframework.stereotype.Service;
-
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
 import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
@ -23,6 +11,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
 import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
+import org.springframework.stereotype.Service;
+
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Stream;

@Service
@SuppressWarnings("all")
@ -33,9 +27,10 @@ public class TaasBlockificationService {
    private static final float INTERSECTS_Y_THRESHOLD = 4;// 2 * HEIGHT_PADDING // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting.
    private static final int X_GAP_SPLIT_CONSTANT = 50;
    public static final int X_ALIGNMENT_THRESHOLD = 1;
-    public static final int SMALL_Y_GAP_THRESHOLD = 5;
    public static final int NEGATIVE_X_GAP_THRESHOLD = -5;

+    private Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE);
+

    /**
     * This method builds blocks by expanding the minX/maxX and minY/maxY values for each word that is not split off by the conditions.
@ -80,16 +75,29 @@ public class TaasBlockificationService {
        List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
        textBlocksToMerge.add(currentTextBlocksToMerge);
        TextPageBlock previousTextBlock = null;
+        Float lastLineGap = null;
        for (TextPageBlock currentTextBlock : classificationTextBlocks) {
            if (previousTextBlock == null) {
                currentTextBlocksToMerge.add(currentTextBlock);
                previousTextBlock = currentTextBlock;
                continue;
            }
+
+
+            Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
+            boolean isListIdentifier = listIdentifierPattern.find();
+
+            boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
+
+            boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
+//            boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
+
            boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
-            boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < SMALL_Y_GAP_THRESHOLD;
-            if (alignsXRight && smallYGap) {
+            boolean alignsXLeft = Math.abs(currentTextBlock.getPdfMinX() - previousTextBlock.getPdfMinX()) < X_ALIGNMENT_THRESHOLD;
+//            boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < yGap;
+            if (yGap && sameFont && !isListIdentifier) {
                currentTextBlocksToMerge.add(currentTextBlock);
+
            } else {
                currentTextBlocksToMerge = new LinkedList<>();
                currentTextBlocksToMerge.add(currentTextBlock);
@ -170,8 +178,8 @@ public class TaasBlockificationService {


    private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
-                                                                                        List<Ruling> horizontalRulingLines,
-                                                                                        List<Ruling> verticalRulingLines) {
+                                                                    List<Ruling> horizontalRulingLines,
+                                                                    List<Ruling> verticalRulingLines) {

        int indexOnPage = 0;
        List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
@ -180,13 +188,13 @@ public class TaasBlockificationService {
        float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
        TextPositionSequence prev = null;
        // TODO: make the listIdentifier pattern field a static final constant
-        var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);
+

        boolean wasSplitted = false;
        Float splitX1 = null;
        for (TextPositionSequence word : textPositions) {

-            Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString());
+            Matcher listIdentifierPattern = listIdentifier.matcher(word.toString());

            boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
            boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
--- a/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/Wie
+++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/Wie