RED-7141: Added basic block combination logic
This commit is contained in:
parent
240ef82def
commit
d06933ed17
@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
return sequences.get(0).getPageWidth();
|
return sequences.get(0).getPageWidth();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
return fromTextPositionSequences(sequences);
|
return fromTextPositionSequences(sequences);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
TextPageBlock textBlock = null;
|
||||||
@ -133,7 +134,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the minX value in pdf coordinate system.
|
* Returns the minX value in pdf coordinate system.
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||||
@ -362,7 +362,22 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public int getNumberOfLines() {
|
||||||
|
|
||||||
|
int numberOfLines = 1;
|
||||||
|
TextPositionSequence previous = null;
|
||||||
|
for (TextPositionSequence word : sequences) {
|
||||||
|
if (previous != null) {
|
||||||
|
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
|
||||||
|
numberOfLines++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
previous = word;
|
||||||
|
}
|
||||||
|
return numberOfLines;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -6,7 +6,6 @@ import java.util.ArrayList;
|
|||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ListIterator;
|
import java.util.ListIterator;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
@ -21,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@ -31,7 +31,7 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
private final DocstrumSegmentationService docstrumSegmentationService;
|
private final DocstrumSegmentationService docstrumSegmentationService;
|
||||||
|
|
||||||
static final float THRESHOLD = 1f;
|
static final float THRESHOLD = 2f;
|
||||||
Pattern pattern = Pattern.compile("^(\\p{Digit}{1,3}\\.){0,3}\\p{Digit}{1,3}[\\p{Lower}.]?", Pattern.CASE_INSENSITIVE);
|
Pattern pattern = Pattern.compile("^(\\p{Digit}{1,3}\\.){0,3}\\p{Digit}{1,3}[\\p{Lower}.]?", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
|
|
||||||
@ -67,12 +67,25 @@ public class DocstrumBlockificationService {
|
|||||||
TextPageBlock current = (TextPageBlock) block;
|
TextPageBlock current = (TextPageBlock) block;
|
||||||
|
|
||||||
if (previous != null) {
|
if (previous != null) {
|
||||||
Matcher matcher = pattern.matcher(previous.getText().toString());
|
|
||||||
if (matcher.matches() && Math.abs(previous.getMinY() - current.getMinY()) < 1) {
|
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 && current.getNumberOfLines() <= 5 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
|
||||||
previous.getSequences().addAll(current.getSequences());
|
previous.getSequences().addAll(current.getSequences());
|
||||||
previous = buildTextBlock(previous.getSequences(), 0);
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
itty.remove();
|
itty.remove();
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 && current.getNumberOfLines() <= 5 && previous.getNumberOfLines() <= current.getNumberOfLines())) {
|
||||||
|
previous.getSequences().addAll(current.getSequences());
|
||||||
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
|
itty.remove();
|
||||||
|
|
||||||
|
// Might be a left/right mapping, so add a sorted duplicate as well
|
||||||
|
var sortedDublicate = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).toList(), 0);
|
||||||
|
itty.add(sortedDublicate);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
previous = current;
|
previous = current;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user