diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 1676696..535baf6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -24,9 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; -import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator; import lombok.RequiredArgsConstructor; @@ -75,10 +73,16 @@ public class DocstrumBlockificationService { abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings)); }); - if (xyOrder) { abstractPageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + .thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + 
abstractPageBlocks.sort(new Comparator<AbstractPageBlock>() { + @Override + public int compare(AbstractPageBlock o1, AbstractPageBlock o2) { + // Blocks whose minY differ by less than 5 are treated as one row and ordered left-to-right by minX; Double.compare keeps the result sign-symmetric (compare(a, b) == -compare(b, a)) as the Comparator contract requires. Blocks on different rows compare equal so the stable sort keeps their previous order. + return Math.abs(o1.getMinY() - o2.getMinY()) < 5 ? Double.compare(o1.getMinX(), o2.getMinX()) : 0; + } + }); } return abstractPageBlocks; @@ -118,12 +122,12 @@ public class DocstrumBlockificationService { } if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) { - combineBlocksAndResetIterator(previous, current, itty, false); + combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) { - combineBlocksAndResetIterator(previous, current, itty, false); + combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } @@ -148,7 +152,7 @@ public class DocstrumBlockificationService { ClassificationPage page) { return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // - && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; } @@ -223,10 +227,6 @@ public class DocstrumBlockificationService { TextPageBlock current = (TextPageBlock) block; - if (current.isToDuplicate()) { - continue; - } - for (int i = 0; i < blocks.size(); i++) { if (toRemove.contains(blocks.get(i))) { @@ -241,15 +241,12 @@ public class DocstrumBlockificationService { TextPageBlock inner = (TextPageBlock) blocks.get(i); - if (inner.isToDuplicate()) { - continue; - } - if (current.getDir() == inner.getDir() 
&& current.almostIntersects(inner, yThreshold, xThreshold)) { + boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); current.getSequences().addAll(inner.getSequences()); - QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator()); current = buildTextBlock(current.getSequences(), 0); + current.setToDuplicate(toDuplicate); toRemove.add(inner); itty.set(current); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index ecc9bdf..bd187ed 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/new/270 rotated text on non rotated pages.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/100 Trinexapac-ethyl_RAR_20_Volume_3CP_B-9_ 2018-01-10.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/100 Trinexapac-ethyl_RAR_20_Volume_3CP_B-9_ 2018-01-10.pdf new file mode 100644 index 0000000..de729ce Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/100 Trinexapac-ethyl_RAR_20_Volume_3CP_B-9_ 2018-01-10.pdf differ diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/resources/files/S101.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S101.pdf new file mode 100644 index 0000000..52c4130 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S101.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/S35.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S35.pdf new file mode 100644 index 0000000..112499e Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S35.pdf differ