From 385d4b399e499b852ff45d90dce91cde29ca4405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Fri, 23 Feb 2024 10:01:28 +0100 Subject: [PATCH] RED-7141: Improved basic block combination logic --- .../processor/model/text/TextPageBlock.java | 2 +- .../DocstrumBlockificationService.java | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index e09026b..47c024d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -371,7 +371,7 @@ public class TextPageBlock extends AbstractPageBlock { TextPositionSequence previous = null; for (TextPositionSequence word : sequences) { if (previous != null) { - if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { + if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) { numberOfLines++; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 2b3ceda..c0f61fd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -6,7 +6,6 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; import java.util.ListIterator; -import java.util.regex.Pattern; import java.util.stream.Collectors; import org.springframework.stereotype.Service; @@ -33,7 +32,6 @@ public class DocstrumBlockificationService { private final DocstrumSegmentationService docstrumSegmentationService; static final float THRESHOLD = 2f; - Pattern pattern = Pattern.compile("^(\\p{Digit}{1,3}\\.){0,3}\\p{Digit}{1,3}[\\p{Lower}.]?", Pattern.CASE_INSENSITIVE); public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines, boolean xyOder) { @@ -69,13 +67,6 @@ public class DocstrumBlockificationService { if (previous != null) { - if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) { - previous.getSequences().addAll(current.getSequences()); - previous = buildTextBlock(previous.getSequences(), 0); - itty.remove(); - continue; - } - if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); @@ -87,6 +78,13 @@ public class DocstrumBlockificationService { continue; } + if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) { + previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + itty.remove(); + continue; + } + if (previous.containsBlock(current, THRESHOLD)) { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);