diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java index f16198d..287d2ba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java @@ -4,18 +4,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica // TODO: figure out, why this fails the build // import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Stream; - -import org.springframework.stereotype.Service; - import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; @@ -23,6 +11,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; +import org.springframework.stereotype.Service; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Stream; @Service @SuppressWarnings("all") @@ -33,9 +27,10 @@ public class TaasBlockificationService { private static final float INTERSECTS_Y_THRESHOLD = 4;// 2 * HEIGHT_PADDING // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting. private static final int X_GAP_SPLIT_CONSTANT = 50; public static final int X_ALIGNMENT_THRESHOLD = 1; - public static final int SMALL_Y_GAP_THRESHOLD = 5; public static final int NEGATIVE_X_GAP_THRESHOLD = -5; + private Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE); + /** * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. @@ -80,16 +75,29 @@ public class TaasBlockificationService { List currentTextBlocksToMerge = new LinkedList<>(); textBlocksToMerge.add(currentTextBlocksToMerge); TextPageBlock previousTextBlock = null; + Float lastLineGap = null; for (TextPageBlock currentTextBlock : classificationTextBlocks) { if (previousTextBlock == null) { currentTextBlocksToMerge.add(currentTextBlock); previousTextBlock = currentTextBlock; continue; } + + + Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText()); + boolean isListIdentifier = listIdentifierPattern.find(); + + boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER; + + boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize(); +// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; + boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD; - boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < SMALL_Y_GAP_THRESHOLD; - if (alignsXRight && smallYGap) { + boolean alignsXLeft = Math.abs(currentTextBlock.getPdfMinX() - previousTextBlock.getPdfMinX()) < X_ALIGNMENT_THRESHOLD; +// boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < yGap; + if (yGap && sameFont && !isListIdentifier) { currentTextBlocksToMerge.add(currentTextBlock); + } else { currentTextBlocksToMerge = new LinkedList<>(); currentTextBlocksToMerge.add(currentTextBlock); @@ -170,8 +178,8 @@ public class TaasBlockificationService { private List constructFineGranularTextPageBlocks(List textPositions, - List horizontalRulingLines, - List verticalRulingLines) { + List horizontalRulingLines, + List verticalRulingLines) { int indexOnPage = 0; List wordClusterToCombine = new ArrayList<>(); @@ -180,13 +188,13 @@ public class TaasBlockificationService { float minX = 1000, maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; // TODO: make static final constant - var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE); + boolean wasSplitted = false; Float splitX1 = null; for (TextPositionSequence word : textPositions) { - Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString()); + Matcher listIdentifierPattern = listIdentifier.matcher(word.toString()); boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/Wie weiter bei Kristeneinrichtungen.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/Wie weiter bei Kristeneinrichtungen.pdf new file mode 100644 index 0000000..1ca7c46 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/Wie weiter bei Kristeneinrichtungen.pdf differ