Merge branch 'RED-7141' into 'main'

RED-7141: Fixed overlapping blocks

See merge request fforesight/layout-parser!118
This commit is contained in:
Dominique Eifländer 2024-03-14 09:09:52 +01:00
commit c55984aa67
5 changed files with 14 additions and 17 deletions

View File

@ -24,9 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.RequiredArgsConstructor;
@ -75,10 +73,16 @@ public class DocstrumBlockificationService {
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
});
if (xyOrder) {
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
/**
 * Orders two blocks left-to-right when they sit on (roughly) the same line.
 *
 * Blocks whose minY values differ by less than 5 are treated as being on the same line
 * and are ordered by minX. All other pairs compare as equal, so the stable sort keeps
 * the order established before this pass.
 *
 * The result is sign-symmetric (compare(a, b) == -compare(b, a)). The earlier form only
 * ever returned -1 or 0, which violates the Comparator contract and can make List.sort
 * throw "Comparison method violates its general contract!".
 * NOTE(review): tolerance-based grouping is still not strictly transitive for chains of
 * blocks whose minY values drift by just under 5 each - confirm this is acceptable.
 */
@Override
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
boolean sameLine = Math.abs(o1.getMinY() - o2.getMinY()) < 5;
if (!sameLine) {
// Different lines: leave the relative order untouched.
return 0;
}
return Double.compare(o1.getMinX(), o2.getMinX());
}
});
}
return abstractPageBlocks;
@ -118,12 +122,12 @@ public class DocstrumBlockificationService {
}
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, false);
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, false);
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
@ -148,7 +152,7 @@ public class DocstrumBlockificationService {
ClassificationPage page) {
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
}
@ -223,10 +227,6 @@ public class DocstrumBlockificationService {
TextPageBlock current = (TextPageBlock) block;
if (current.isToDuplicate()) {
continue;
}
for (int i = 0; i < blocks.size(); i++) {
if (toRemove.contains(blocks.get(i))) {
@ -241,15 +241,12 @@ public class DocstrumBlockificationService {
TextPageBlock inner = (TextPageBlock) blocks.get(i);
if (inner.isToDuplicate()) {
continue;
}
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences());
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
current = buildTextBlock(current.getSequences(), 0);
current.setToDuplicate(toDuplicate);
toRemove.add(inner);
itty.set(current);
}

View File

@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();