hotfix: Fixed parsing for specific taas document

This commit is contained in:
Dominique Eifländer 2023-10-17 15:52:19 +02:00
parent 3c53772765
commit 567cbc178b
2 changed files with 27 additions and 19 deletions

View File

@ -4,18 +4,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
// TODO: figure out why this fails the build
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
@ -23,6 +11,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@Service
@SuppressWarnings("all")
@ -33,9 +27,10 @@ public class TaasBlockificationService {
/**
 * Y tolerance used when testing whether two boxes intersect.
 * This is exactly 2 times our position height padding (2 * HEIGHT_PADDING); the literal is
 * repeated here instead of referencing HEIGHT_PADDING. It is required to find boxes that are
 * visually intersecting.
 */
private static final float INTERSECTS_Y_THRESHOLD = 4;
// NOTE(review): presumably the horizontal gap between words above which a block is split — usage is not visible here, confirm.
private static final int X_GAP_SPLIT_CONSTANT = 50;
/** Two text blocks count as X-aligned when the absolute difference of the compared X coordinates is below this value. */
public static final int X_ALIGNMENT_THRESHOLD = 1;
/** The Y gap between two consecutive text blocks counts as small when its absolute value is below this value. */
public static final int SMALL_Y_GAP_THRESHOLD = 5;
// NOTE(review): presumably the lower bound for a negative (overlapping) X gap — usage is not visible here, confirm.
public static final int NEGATIVE_X_GAP_THRESHOLD = -5;
/**
 * Detects list markers; used with {@code Matcher.find()}, case-insensitively.
 * First alternative, anchored at the start of the input: a number 1-20 or a single letter
 * (roman numerals i, v, x, l, c are listed explicitly), optional whitespace, then '.' or ')'.
 * Second alternative: the character U+F0B7.
 * NOTE(review): the '^' anchor binds only to the first alternative, so U+F0B7 matches anywhere
 * in the input — confirm that this is intended.
 * Compiled once and shared: Pattern instances are immutable and safe for concurrent use, so
 * there is no reason to keep a mutable per-instance copy.
 */
private static final Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE);
/**
* This method builds blocks by expanding the minX/maxX and minY/maxY values with each word that is not split off by the conditions.
@ -80,16 +75,29 @@ public class TaasBlockificationService {
List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
textBlocksToMerge.add(currentTextBlocksToMerge);
TextPageBlock previousTextBlock = null;
Float lastLineGap = null;
for (TextPageBlock currentTextBlock : classificationTextBlocks) {
if (previousTextBlock == null) {
currentTextBlocksToMerge.add(currentTextBlock);
previousTextBlock = currentTextBlock;
continue;
}
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
boolean isListIdentifier = listIdentifierPattern.find();
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < SMALL_Y_GAP_THRESHOLD;
if (alignsXRight && smallYGap) {
boolean alignsXLeft = Math.abs(currentTextBlock.getPdfMinX() - previousTextBlock.getPdfMinX()) < X_ALIGNMENT_THRESHOLD;
// boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < yGap;
if (yGap && sameFont && !isListIdentifier) {
currentTextBlocksToMerge.add(currentTextBlock);
} else {
currentTextBlocksToMerge = new LinkedList<>();
currentTextBlocksToMerge.add(currentTextBlock);
@ -170,8 +178,8 @@ public class TaasBlockificationService {
private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
@ -180,13 +188,13 @@ public class TaasBlockificationService {
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
// TODO: make static final constant
var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString());
Matcher listIdentifierPattern = listIdentifier.matcher(word.toString());
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());