Merge branch 'RED-7141' into 'main'
RED-7141: Fixed overlapping blocks. See merge request fforesight/layout-parser!118
This commit is contained in:
commit
c55984aa67
@ -24,9 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@ -75,10 +73,16 @@ public class DocstrumBlockificationService {
|
|||||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
if (xyOrder) {
|
if (xyOrder) {
|
||||||
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||||
|
@Override
|
||||||
|
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||||
|
|
||||||
|
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return abstractPageBlocks;
|
return abstractPageBlocks;
|
||||||
@ -118,12 +122,12 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
||||||
combineBlocksAndResetIterator(previous, current, itty, false);
|
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
||||||
combineBlocksAndResetIterator(previous, current, itty, false);
|
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,7 +152,7 @@ public class DocstrumBlockificationService {
|
|||||||
ClassificationPage page) {
|
ClassificationPage page) {
|
||||||
|
|
||||||
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -223,10 +227,6 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
TextPageBlock current = (TextPageBlock) block;
|
TextPageBlock current = (TextPageBlock) block;
|
||||||
|
|
||||||
if (current.isToDuplicate()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < blocks.size(); i++) {
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
|
|
||||||
if (toRemove.contains(blocks.get(i))) {
|
if (toRemove.contains(blocks.get(i))) {
|
||||||
@ -241,15 +241,12 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
||||||
|
|
||||||
if (inner.isToDuplicate()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||||
|
|
||||||
|
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||||
current.getSequences().addAll(inner.getSequences());
|
current.getSequences().addAll(inner.getSequences());
|
||||||
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
|
|
||||||
current = buildTextBlock(current.getSequences(), 0);
|
current = buildTextBlock(current.getSequences(), 0);
|
||||||
|
current.setToDuplicate(toDuplicate);
|
||||||
toRemove.add(inner);
|
toRemove.add(inner);
|
||||||
itty.set(current);
|
itty.set(current);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user