hotfix: Fixed parsing for specific taas document
This commit is contained in:
parent
3c53772765
commit
567cbc178b
@ -4,18 +4,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
|
||||
// TODO: figure out why this fails the build
|
||||
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
@ -23,6 +11,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
@ -33,9 +27,10 @@ public class TaasBlockificationService {
|
||||
private static final float INTERSECTS_Y_THRESHOLD = 4;// 2 * HEIGHT_PADDING // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting.
|
||||
private static final int X_GAP_SPLIT_CONSTANT = 50;
|
||||
public static final int X_ALIGNMENT_THRESHOLD = 1;
|
||||
public static final int SMALL_Y_GAP_THRESHOLD = 5;
|
||||
public static final int NEGATIVE_X_GAP_THRESHOLD = -5;
|
||||
|
||||
private Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
|
||||
/**
|
||||
* This method builds blocks by expanding the minX/maxX and minY/maxY values for each word that is not split by the conditions.
|
||||
@ -80,16 +75,29 @@ public class TaasBlockificationService {
|
||||
List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
|
||||
textBlocksToMerge.add(currentTextBlocksToMerge);
|
||||
TextPageBlock previousTextBlock = null;
|
||||
Float lastLineGap = null;
|
||||
for (TextPageBlock currentTextBlock : classificationTextBlocks) {
|
||||
if (previousTextBlock == null) {
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
previousTextBlock = currentTextBlock;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
||||
boolean isListIdentifier = listIdentifierPattern.find();
|
||||
|
||||
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||
boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < SMALL_Y_GAP_THRESHOLD;
|
||||
if (alignsXRight && smallYGap) {
|
||||
boolean alignsXLeft = Math.abs(currentTextBlock.getPdfMinX() - previousTextBlock.getPdfMinX()) < X_ALIGNMENT_THRESHOLD;
|
||||
// boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < yGap;
|
||||
if (yGap && sameFont && !isListIdentifier) {
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
|
||||
} else {
|
||||
currentTextBlocksToMerge = new LinkedList<>();
|
||||
currentTextBlocksToMerge.add(currentTextBlock);
|
||||
@ -170,8 +178,8 @@ public class TaasBlockificationService {
|
||||
|
||||
|
||||
private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
|
||||
@ -180,13 +188,13 @@ public class TaasBlockificationService {
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
// TODO: make static final constant
|
||||
var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString());
|
||||
Matcher listIdentifierPattern = listIdentifier.matcher(word.toString());
|
||||
|
||||
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user