diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 9f4afcb..675a76c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -250,8 +251,10 @@ public class LayoutParsingPipeline { PDRectangle cropbox = pdPage.getCropBox(); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); + List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case REDACT_MANAGER -> 
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); }; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 3062c78..644f60e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica import static java.util.stream.Collectors.toSet; +import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; @@ -13,10 +14,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import 
com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; @SuppressWarnings("all") @@ -31,12 +35,12 @@ public class RedactManagerBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The words of a page. - * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. + * @param textPositions The words of a page. * @return Page object that contains the Textblock and text statistics. 
*/ - public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + public ClassificationPage blockify(List textPositions, List emptyCells) { + + CleanRulings usedRulings = RectangleTransformations.extractRulings(emptyCells); int indexOnPage = 0; List chunkWords = new ArrayList<>(); @@ -54,7 +58,7 @@ public class RedactManagerBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 70e9460..5183261 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -3,8 +3,10 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import static java.lang.String.format; import java.awt.geom.Area; +import 
java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -20,6 +22,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; @@ -149,7 +153,27 @@ public class RectangleTransformations { previousRectangle = currentRectangle; } } - return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList(); + return rectangleListsWithGaps.stream() + .map(RectangleTransformations::rectangle2DBBox) + .toList(); + } + + + public static CleanRulings extractRulings(List rectangles) { + + // Underlines and strikethroughs are also in the rulings, but we don't want to split blocks with them, so we use cells.
+ List horizontalRulings = new ArrayList<>(); + List verticalRulings = new ArrayList<>(); + + rectangles.forEach(rectangle -> { + horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y))); + horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y + rectangle.height), + new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); + verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x, rectangle.y + rectangle.height))); + verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y), + new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); + }); + return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 0751be3..9db0cdb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -28,7 +28,20 @@ public class LayoutparserEnd2EndTest extends AbstractTest { prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); - Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); + 
Arrays.stream(finishedEvent.message().split("\n")) + .forEach(log::info); + } + + + @Test + @SneakyThrows + public void testLayoutParserEndToEnd_RED_8747() { + + prepareStorage("files/SinglePages/MergedEntities.pdf"); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); + Arrays.stream(finishedEvent.message().split("\n")) + .forEach(log::info); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index a9400a6..18e0e8c 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -751,6 +751,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test + public void testMergedEntities_Page26() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/MergedEntities.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 1); + + validateTable(document, 0, 6, 6, 5, 0); + + } + @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/MergedEntities.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/MergedEntities.pdf new file mode 100644 
index 0000000..31090a9 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/MergedEntities.pdf differ