From 976f40823785412f122975f61b8353e7cfe3ce5c Mon Sep 17 00:00:00 2001 From: Corina Olariu Date: Tue, 9 Apr 2024 14:38:48 +0300 Subject: [PATCH] RED-8747 - Entities not merged properly - fp - rework the extraction of rulings from the table cells --- .../blockification/BlockificationUtils.java | 24 ----------------- .../DocstrumBlockificationService.java | 10 +++---- .../RedactManagerBlockificationService.java | 10 +++---- .../utils/RectangleTransformations.java | 26 ++++++++++++++++++- 4 files changed, 33 insertions(+), 37 deletions(-) delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationUtils.java diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationUtils.java deleted file mode 100644 index b382f0e..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationUtils.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.blockification; - -import java.awt.geom.Point2D; -import java.util.List; - -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class BlockificationUtils { - - public void extractedRullingsFromCells(List cells, List usedHorizonalRulings, List usedVerticalRulings) { - - cells.forEach(cell -> { - usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y))); - 
usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); - usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height))); - usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); - }); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 07ca8c5..9c087a1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -16,11 +16,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; 
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; import lombok.RequiredArgsConstructor; @@ -37,14 +39,10 @@ public class DocstrumBlockificationService { public ClassificationPage blockify(List textPositions, List cells, boolean xyOrder) { - // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. - List usedHorizonalRulings = new ArrayList<>(); - List usedVerticalRulings = new ArrayList<>(); - - BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings); + CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); - var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder); + var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder); var classificationPage = new ClassificationPage(pageBlocks); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 6f50eb3..9addf27 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ 
-14,10 +14,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; @SuppressWarnings("all") @@ -37,11 +39,7 @@ public class RedactManagerBlockificationService { */ public ClassificationPage blockify(List textPositions, List cells) { - // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. 
- List usedHorizonalRulings = new ArrayList<>(); - List usedVerticalRulings = new ArrayList<>(); - - BlockificationUtils.extractedRullingsFromCells(cells, usedHorizonalRulings, usedVerticalRulings); + CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); int indexOnPage = 0; List chunkWords = new ArrayList<>(); @@ -59,7 +57,7 @@ public class RedactManagerBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedHorizonalRulings, usedVerticalRulings); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 5331590..14df80a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -2,8 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import static java.lang.String.format; +import java.awt.geom.Point2D; import 
java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -19,6 +21,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; @@ -147,7 +151,27 @@ public class RectangleTransformations { previousRectangle = currentRectangle; } } - return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList(); + return rectangleListsWithGaps.stream() + .map(RectangleTransformations::rectangle2DBBox) + .toList(); + } + + + public static CleanRulings extractRulings(List rectangles) { + + // Builds the rulings from the four edges of every given rectangle (top and bottom as horizontal, left and right as vertical). Underlines and strikethroughs also show up in the rulings, but we don't want them to split blocks, so the cell borders are used instead.
+ List horizontalRulings = new ArrayList<>(); + List verticalRulings = new ArrayList<>(); + + rectangles.forEach(rectangle -> { + horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y))); + horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y + rectangle.height), + new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); + verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x, rectangle.y + rectangle.height))); + verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y), + new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); + }); + return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build(); }