RED-7141: Improved basic block combination logic

This commit is contained in:
Dominique Eifländer 2024-02-23 10:01:28 +01:00
parent d0e1af3a44
commit 385d4b399e
2 changed files with 8 additions and 10 deletions

View File

@ -371,7 +371,7 @@ public class TextPageBlock extends AbstractPageBlock {
TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) {
numberOfLines++;
}
}

View File

@ -6,7 +6,6 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -33,7 +32,6 @@ public class DocstrumBlockificationService {
private final DocstrumSegmentationService docstrumSegmentationService;
static final float THRESHOLD = 2f;
Pattern pattern = Pattern.compile("^(\\p{Digit}{1,3}\\.){0,3}\\p{Digit}{1,3}[\\p{Lower}.]?", Pattern.CASE_INSENSITIVE);
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, boolean xyOder) {
@ -69,13 +67,6 @@ public class DocstrumBlockificationService {
if (previous != null) {
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
continue;
}
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
@ -87,6 +78,13 @@ public class DocstrumBlockificationService {
continue;
}
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
continue;
}
if (previous.containsBlock(current, THRESHOLD)) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);