diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index dfa0537..6323b44 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -258,7 +258,7 @@ public class LayoutParsingPipeline { ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> - redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationUtils.java new file mode 100644 index 0000000..b382f0e --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationUtils.java @@ -0,0 +1,35 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import java.awt.geom.Point2D; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; + +import lombok.experimental.UtilityClass; + +/** + * Shared helpers for the blockification services. + */ +@UtilityClass +public class BlockificationUtils { + + /** + * Appends the four border lines of every cell to the given ruling lists. + * For each cell, the edges at y and y + height are added to {@code usedHorizonalRulings} and the edges at x and x + width are added to {@code usedVerticalRulings}. + * + * @param cells The cells whose borders are converted into rulings. + * @param usedHorizonalRulings Mutable list that receives two horizontal rulings per cell. + * @param usedVerticalRulings Mutable list that receives two vertical rulings per cell. + */ + public void extractedRullingsFromCells(List<Cell> cells, List<Ruling> usedHorizonalRulings, List<Ruling> usedVerticalRulings) { + + cells.forEach(cell -> { + usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y))); + usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); + usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height))); + usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); + }); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index c3666a6..07ca8c5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -2,13 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica import static java.util.stream.Collectors.toSet; -import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashSet; import java.util.List; import java.util.ListIterator; -import java.util.Set; import org.springframework.stereotype.Service; @@ -44,12 +41,7 @@ public class DocstrumBlockificationService { List usedHorizonalRulings = new ArrayList<>(); List usedVerticalRulings = new ArrayList<>(); - cells.forEach(cell -> { - usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y))); - usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); - usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height))); - usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); - }); + BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 3062c78..1a6525f 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -13,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -31,12 +32,17 @@ public class RedactManagerBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The words of a page. - * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. + * @param textPositions The words of a page. + * @param cells The table cells of a page; their borders are used as the rulings that may split a block. * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + public ClassificationPage blockify(List textPositions, List<Cell> cells) { + + // Underlines and strikethroughs are also reported as rulings, but they must not split blocks, so the rulings are derived from the table cells instead. 
+ List<Ruling> usedHorizonalRulings = new ArrayList<>(); + List<Ruling> usedVerticalRulings = new ArrayList<>(); + + BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings); int indexOnPage = 0; List chunkWords = new ArrayList<>(); @@ -54,7 +60,7 @@ public class RedactManagerBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedHorizontalRulings, usedVerticalRulings); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 3af1376..a115ae9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -26,9 +26,22 @@ public class LayoutparserEnd2EndTest extends AbstractTest { public void testLayoutParserEndToEnd() { prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + 
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); + Arrays.stream(finishedEvent.message().split("\n")) + .forEach(log::info); + } + + + @Test + @SneakyThrows + public void testLayoutParserEndToEnd_RED_8747() { + + prepareStorage("files/localTests/MergedEntities.pdf"); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); - Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); + Arrays.stream(finishedEvent.message().split("\n")) + .forEach(log::info); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/localTests/MergedEntities.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/localTests/MergedEntities.pdf new file mode 100644 index 0000000..31090a9 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/localTests/MergedEntities.pdf differ