diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 0442af6..e09026b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } - + public static TextPageBlock merge(List textBlocksToMerge) { @@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock { return fromTextPositionSequences(sequences); } + public static TextPageBlock fromTextPositionSequences(List wordBlockList) { TextPageBlock textBlock = null; @@ -133,7 +134,6 @@ public class TextPageBlock extends AbstractPageBlock { } - /** * Returns the minX value in pdf coordinate system. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
@@ -362,7 +362,22 @@ public class TextPageBlock extends AbstractPageBlock { } return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); + } + + /** Estimates how many text lines this block spans. Walks {@code sequences} in their stored order and counts a new line each time the {@code getMaxYDirAdj()} values of two consecutive sequences differ by more than the {@code getTextHeight()} of the later one. The counter starts at 1, so a block with zero or one sequence reports 1. @return the estimated number of lines, always at least 1 */ public int getNumberOfLines() { + + int numberOfLines = 1; + TextPositionSequence previous = null; + for (TextPositionSequence word : sequences) { + if (previous != null) { + /* a vertical offset larger than the current word's text height is treated as a line break; smaller offsets are counted as the same line */ if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { + numberOfLines++; + } + } + /* remember this word so the next iteration compares against it */ previous = word; + } + return numberOfLines; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 5c45105..a7ffd7e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -6,7 +6,6 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; import java.util.ListIterator; -import java.util.regex.Matcher; import java.util.regex.Pattern; import org.springframework.stereotype.Service; @@ -21,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator; 
import lombok.RequiredArgsConstructor; @@ -31,7 +31,7 @@ public class DocstrumBlockificationService { private final DocstrumSegmentationService docstrumSegmentationService; - static final float THRESHOLD = 1f; + static final float THRESHOLD = 2f; Pattern pattern = Pattern.compile("^(\\p{Digit}{1,3}\\.){0,3}\\p{Digit}{1,3}[\\p{Lower}.]?", Pattern.CASE_INSENSITIVE); @@ -67,12 +67,25 @@ public class DocstrumBlockificationService { TextPageBlock current = (TextPageBlock) block; if (previous != null) { - Matcher matcher = pattern.matcher(previous.getText().toString()); - if (matcher.matches() && Math.abs(previous.getMinY() - current.getMinY()) < 1) { + + if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 && current.getNumberOfLines() <= 5 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); itty.remove(); + continue; } + + if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 && current.getNumberOfLines() <= 5 && previous.getNumberOfLines() <= current.getNumberOfLines())) { + previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + itty.remove(); + + // Might be a left/right mapping add one sorted as well + var sortedDublicate = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).toList(), 0); + itty.add(sortedDublicate); + continue; + } + } previous = current; }