diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index c04ccf7..7921d37 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -280,8 +280,9 @@ public class LayoutParsingPipeline { redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> - docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations()); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations()); + docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); }; classificationPage.setCleanRulings(cleanRulings); @@ -311,12 +312,6 @@ public class LayoutParsingPipeline { tableExtractionService.extractTables(emptyTableCells, classificationPage); - if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) { - docstrumBlockificationService.combineBlocks(classificationPage); - } else if (layoutParsingType == LayoutParsingType.CLARIFYND) { - docstrumBlockificationService.mergeIntersectingBlocks(classificationPage, 0, 6.5f); - } - buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, classificationDocument); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java index 26987c7..2565413 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java @@ -37,7 +37,7 @@ public class TextRulingsClassifier { float strikethroughCenterX = (float) word.getBoundingBox().getCenterX(); float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2); - float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMinX() : word.getBoundingBox().getMaxX()); + float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMaxX() : word.getBoundingBox().getMinX()); float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2); float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 618e20e..d5e4ed1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -9,6 +9,7 @@ import java.util.ListIterator; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; @@ -36,7 +37,11 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage blockify(List textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) { + public ClassificationPage blockify(List textPositions, + CleanRulings rulings, + boolean xyOrder, + LayoutparsingVisualizations visualizations, + LayoutParsingType layoutParsingType) { CleanRulings usedRulings = rulings.withoutTextRulings(); @@ -59,6 +64,12 @@ public class DocstrumBlockificationService { mergeIntersectingBlocks(classificationPage, 0, 0); + combineBlocks(classificationPage); + + if (layoutParsingType == LayoutParsingType.CLARIFYND) { + mergeIntersectingBlocks(classificationPage, 0, 6.5f); + } + return classificationPage; } @@ -77,8 +88,7 @@ public class DocstrumBlockificationService { } - private List toAbstractPageBlocks(List zones, boolean xyOrder, - CleanRulings usedRulings) { + private List toAbstractPageBlocks(List zones, boolean xyOrder, CleanRulings usedRulings) { List abstractPageBlocks = new ArrayList<>(); zones.forEach(zone -> { @@ -103,6 +113,7 @@ public class DocstrumBlockificationService { TextPageBlock previous = new TextPageBlock(); ListIterator itty = page.getTextBlocks().listIterator(); + CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings(); while (itty.hasNext()) { AbstractPageBlock block = itty.next(); @@ -114,7 +125,7 @@ public class DocstrumBlockificationService { if (previous != null && !previous.getSequences().isEmpty()) { - if (current.getDir() != previous.getDir()) { + if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current.getBBox(), previous.getBBox())) { previous = current; continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java index 8a89d9a..9add9b3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java @@ -21,6 +21,9 @@ import lombok.SneakyThrows; @RequiredArgsConstructor public class GraphicExtractorService { + private static final int MIN_GRAPHICS_SIDE_LENGTH = 30; + private static final int MIN_GRAPHICS_AREA = 500; + private final GraphicsClusteringService graphicsClusteringService; private final FindGraphicsRaster findGraphicsRaster; @@ -55,7 +58,7 @@ public class GraphicExtractorService { List clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14); return clusters.stream() - .filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50) + .filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH) .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java index f50f753..cb90b5c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java @@ -127,8 +127,8 @@ public class LayoutparsingVisualizations { } VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings); visualizationsOnPage.getColoredLines() - .addAll(Stream.of(cleanRulings.getHorizontals(), cleanRulings.getVerticals()) - .flatMap(Collection::stream) + .addAll(cleanRulings.buildAll() + .stream() .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1)) .toList()); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index f69a399..d04af07 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Dokumente/TestFiles/large number of prod files/101 S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String filePath = "/home/kschuettler/Dokumente/TestFiles/RotateTextWithRulingsTestFile.pdf"; runForFile(filePath); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextRulingsClassifierTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextRulingsClassifierTest.java new file mode 100644 index 0000000..dc6e8ae --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextRulingsClassifierTest.java @@ -0,0 +1,46 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; +import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; + +import lombok.SneakyThrows; + +public class TextRulingsClassifierTest { + + @Test + @SneakyThrows + public void textRulingExtractionTest() { + + String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf"; + List pageContents = PageContentExtractor.getSortedPageContents(fileName); + RulingCleaningService rulingCleaningService = new RulingCleaningService(); + + for (PageContents pageContent : pageContents) { + CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); + RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings); + + assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Underlined")).allMatch(TextPositionSequence::isUnderline)); + assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Striketrough")).allMatch(TextPositionSequence::isStrikethrough)); + + assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH)).count()); + assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE)).count()); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf new file mode 100644 index 0000000..da05904 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Minimal Examples/RotateTextWithRulingsTestFile.pdf differ diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java index c761c69..04233da 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocumentService.java @@ -185,8 +185,7 @@ public class ViewerDocumentService { contentStream.setFont(font, placedText.fontSize()); contentStream.beginText(); contentStream.setNonStrokingColor(placedText.color()); - if (placedText.renderingMode() - .isPresent()) { + if (placedText.renderingMode().isPresent()) { contentStream.setRenderingMode(placedText.renderingMode().get()); } else { contentStream.setRenderingMode(RenderingMode.FILL);