diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index dfa0537..6323b44 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -258,7 +258,7 @@ public class LayoutParsingPipeline { ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> - redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index c3666a6..9c087a1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -2,13 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica import static java.util.stream.Collectors.toSet; -import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashSet; import java.util.List; import java.util.ListIterator; -import java.util.Set; import org.springframework.stereotype.Service; @@ -19,11 +16,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; import lombok.RequiredArgsConstructor; @@ -40,19 +39,10 @@ public class DocstrumBlockificationService { public ClassificationPage blockify(List textPositions, List cells, boolean xyOrder) { - // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. - List usedHorizonalRulings = new ArrayList<>(); - List usedVerticalRulings = new ArrayList<>(); - - cells.forEach(cell -> { - usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y))); - usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); - usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height))); - usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); - }); + CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); - var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder); + var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder); var classificationPage = new ClassificationPage(pageBlocks); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 3062c78..9addf27 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -13,10 +13,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; @SuppressWarnings("all") @@ -31,12 +34,12 @@ public class RedactManagerBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The words of a page. - * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. + * @param textPositions The words of a page. * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + public ClassificationPage blockify(List textPositions, List cells) { + + CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); int indexOnPage = 0; List chunkWords = new ArrayList<>(); @@ -54,7 +57,7 @@ public class RedactManagerBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 5331590..14df80a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -2,8 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import static java.lang.String.format; +import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -19,6 +21,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; @@ -147,7 +151,27 @@ public class RectangleTransformations { previousRectangle = currentRectangle; } } - return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList(); + return rectangleListsWithGaps.stream() + .map(RectangleTransformations::rectangle2DBBox) + .toList(); + } + + + public static CleanRulings extractRulings(List rectangles) { + + // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. + List horizontalRulings = new ArrayList<>(); + List verticalRulings = new ArrayList<>(); + + rectangles.forEach(rectangle -> { + horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y))); + horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y + rectangle.height), + new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); + verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x, rectangle.y + rectangle.height))); + verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y), + new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); + }); + return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 3af1376..9861a52 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -26,9 +26,22 @@ public class LayoutparserEnd2EndTest extends AbstractTest { public void testLayoutParserEndToEnd() { prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); + Arrays.stream(finishedEvent.message().split("\n")) + .forEach(log::info); + } + + + @Test + @SneakyThrows + public void testLayoutParserEndToEnd_RED_8747() { + + prepareStorage("files/SinglePages/MergedEntities.pdf"); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); - Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); + Arrays.stream(finishedEvent.message().split("\n")) + .forEach(log::info); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 739d0fa..f6b13f6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -667,6 +667,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test + public void testMergedEntities_Page26() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/MergedEntities.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 1); + + validateTable(document, 0, 6, 6, 5, 0); + + } + @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/MergedEntities.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/MergedEntities.pdf new file mode 100644 index 0000000..31090a9 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/MergedEntities.pdf differ