Merge branch 'RED-7141' into 'main'
RED-7141: Fixed overlapping blocks. See merge request fforesight/layout-parser!118
This commit is contained in:
commit
c55984aa67
@@ -24,9 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@@ -75,10 +73,16 @@ public class DocstrumBlockificationService {
|
||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
||||
});
|
||||
|
||||
|
||||
if (xyOrder) {
|
||||
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||
@Override
|
||||
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||
|
||||
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return abstractPageBlocks;
|
||||
@@ -118,12 +122,12 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
||||
combineBlocksAndResetIterator(previous, current, itty, false);
|
||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
||||
combineBlocksAndResetIterator(previous, current, itty, false);
|
||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -148,7 +152,7 @@ public class DocstrumBlockificationService {
|
||||
ClassificationPage page) {
|
||||
|
||||
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||
}
|
||||
|
||||
@@ -223,10 +227,6 @@ public class DocstrumBlockificationService {
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
if (current.isToDuplicate()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
if (toRemove.contains(blocks.get(i))) {
|
||||
@@ -241,15 +241,12 @@ public class DocstrumBlockificationService {
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
||||
|
||||
if (inner.isToDuplicate()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
current.getSequences().addAll(inner.getSequences());
|
||||
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
|
||||
current = buildTextBlock(current.getSequences(), 0);
|
||||
current.setToDuplicate(toDuplicate);
|
||||
toRemove.add(inner);
|
||||
itty.set(current);
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user