diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index a97c826..5f713ab 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -45,6 +45,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; @@ -263,26 +264,21 @@ public class LayoutParsingPipeline { CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); + classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); - var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, - pdPage, - pageNumber, - cleanRulings, - stripper.getTextPositionSequences(), - emptyTableCells, - false); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); + + var graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) .addAll(graphics.stream() - .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) - .toList()); - - classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); + .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber())) + .toList()); ClassificationPage classificationPage = switch (layoutParsingType) { case REDACT_MANAGER_OLD -> - redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, classificationDocument.getVisualizations()); - case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings.getHorizontals(), cleanRulings.getVerticals()); + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); + case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations()); case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index b78e2df..e5aaa34 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -68,6 +68,7 @@ public class ZoneBuilderService { if (rulings.lineBetween(outerLine.getBBox(), innerLine.getBBox())) { return; } + unionFind.union(outerLine, innerLine); }); @@ -151,7 +152,9 @@ public class ZoneBuilderService { outputZone.add(new Line(characters, characterSpacing)); } - return new Zone(outputZone.stream().sorted(Comparator.comparing(Line::getY0)).collect(Collectors.toList())); + return new Zone(outputZone.stream() + .sorted(Comparator.comparing(Line::getY0)) + .collect(Collectors.toList())); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java index 7895945..86056e7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java @@ -1,25 +1,37 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; +import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; import java.util.List; +import java.util.stream.Stream; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; +import lombok.Getter; -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor +@Getter public class CleanRulings { - List horizontals; - List verticals; + List horizontals; // unmodifiable sorted by Y list + List verticals; // unmodifiable sorted by X list + + + public CleanRulings(List horizontals, List verticals) { + + this.horizontals = horizontals.stream() + .peek(Ruling::assertHorizontal) + .sorted(Comparator.comparing(Line2D.Float::getY1)) + .toList(); + this.verticals = verticals.stream() + .peek(Ruling::assertVertical) + .sorted(Comparator.comparing(Line2D.Float::getX1)) + .toList(); + } public CleanRulings getTableLines() { @@ -33,6 +45,28 @@ public class CleanRulings { } + public CleanRulings withoutTextRulings() { + + return new CleanRulings(horizontals.stream() + .filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification() + .equals(Ruling.Classification.STRIKETROUGH))) + .toList(), + verticals.stream() + .filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification() + .equals(Ruling.Classification.STRIKETROUGH))) + .toList()); + } + + + public List buildAll() { + + ArrayList rulings = new ArrayList<>(horizontals.size() + verticals.size()); + rulings.addAll(horizontals); + rulings.addAll(verticals); + return rulings; + } + + public boolean lineBetween(Character a, Character b) { return lineBetween(a.getTextPosition().getInitialUserSpacePosition(), b.getTextPosition().getInitialUserSpacePosition()); @@ -53,28 +87,122 @@ public class CleanRulings { Ruling ruling = new Ruling(p1, p2); if (ruling.isHorizontal()) { - return verticals.stream() + return getVerticalsInXInterval(ruling.x1, ruling.x2).stream() .anyMatch(vertical -> vertical.intersectsLine(ruling)); } if (ruling.isVertical()) { - return horizontals.stream() + return getHorizontalsInYInterval(ruling.y1, ruling.y2).stream() .anyMatch(horizontal -> horizontal.intersectsLine(ruling)); } - return buildAll().stream() + return Stream.of(getVerticalsInXInterval(ruling.x1, ruling.x2), getHorizontalsInYInterval(ruling.y1, ruling.y2)) + .flatMap(Collection::stream) .anyMatch(other -> other.intersectsLine(ruling)); } - public List buildAll() { + public List getHorizontalsInYInterval(float y1, float y2) { - ArrayList rulings = new ArrayList<>(horizontals.size() + verticals.size()); - rulings.addAll(horizontals); - rulings.addAll(verticals); - return rulings; + float startY = Math.min(y1, y2); + float endY = Math.max(y1, y2); + + if (horizontals.isEmpty() || Float.isNaN(startY) || Float.isNaN(endY)) { + return Collections.emptyList(); + } + + int firstGreaterThanIdx = findFirstHorizontalRulingIdxAbove(startY); + + if (firstGreaterThanIdx == -1) { + return Collections.emptyList(); + } + + List result = new ArrayList<>(); + for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) { + Ruling horizontal = horizontals.get(i); + if (horizontal.y1 > endY) { + break; + } + result.add(horizontal); + } + return result; + } + + + private int findFirstHorizontalRulingIdxAbove(float y) { + + int low = 0; + int high = horizontals.size() - 1; + + while (low <= high) { + int mid = low + (high - low) / 2; + Line2D.Float midLine = horizontals.get(mid); + float midY = midLine.y1; + + if (midY == y) { + return mid; + } else if (midY > y) { + high = mid - 1; + } else { + low = mid + 1; + } + } + + // Return the index of the first element greater than y or -1 if not found + return horizontals.size() > low && horizontals.get(low).y1 > y ? low : -1; + } + + + public List getVerticalsInXInterval(float x1, float x2) { + + float startX = Math.min(x1, x2); + float endX = Math.max(x1, x2); + + if (verticals.isEmpty() || Float.isNaN(startX) || Float.isNaN(endX)) { + return Collections.emptyList(); + } + + int firstGreaterThanIdx = findFirstVerticalRulingIdxRightOf(startX); + + if (firstGreaterThanIdx == -1) { + return Collections.emptyList(); + } + + List result = new ArrayList<>(); + for (int i = firstGreaterThanIdx; i < verticals.size(); i++) { + Ruling horizontal = verticals.get(i); + if (horizontal.x1 > endX) { + break; + } + result.add(horizontal); + } + return result; + } + + + private int findFirstVerticalRulingIdxRightOf(float x) { + + int low = 0; + int high = verticals.size() - 1; + + while (low <= high) { + int mid = low + (high - low) / 2; + Line2D.Float midLine = verticals.get(mid); + float midX = midLine.x1; + + if (midX == x) { + return mid; + } else if (midX > x) { + high = mid - 1; + } else { + low = mid + 1; + } + } + + // Return the index of the first element greater than y or -1 if not found + return verticals.size() > low && verticals.get(low).x1 > x ? low : -1; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index 346509c..e910ff1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -72,6 +72,25 @@ public class Ruling extends Line2D.Float { } + public void assertHorizontal() { + + if (isHorizontal()) { + return; + } + throw new IllegalArgumentException("Ruling " + this + " is not horizontal"); + + } + + + public void assertVertical() { + + if (isVertical()) { + return; + } + throw new IllegalArgumentException("Ruling " + this + " is not vertical"); + } + + public boolean isVertical() { return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index fccecd5..1a2e1eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -44,6 +44,8 @@ public class TextPositionSequence implements CharSequence { private float pageHeight; private float pageWidth; private boolean isParagraphStart; + private boolean strikethrough; + private boolean underline; public TextPositionSequence(List textPositions, int pageNumber, boolean isParagraphStart) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java index ac7db1d..62fc633 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java @@ -44,9 +44,9 @@ public class GapDetectionService { if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) { yGapContext.addGap(mainBodyTextFrame.getMinX(), - previousTextPositionBBox.getMaxY(), - mainBodyTextFrame.getWidth(), - -(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY())); + previousTextPositionBBox.getMaxY(), + mainBodyTextFrame.getWidth(), + -(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY())); } if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) { @@ -72,29 +72,34 @@ public class GapDetectionService { return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle())); } + private static Rectangle2D mirrorY(Rectangle2D rectangle2D) { return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight())); } + private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) { context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(), - previousTextPosition.getMinY(), - currentTextPosition.getMinX() - previousTextPosition.getMaxX(), - (previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2)); + previousTextPosition.getMinY(), + currentTextPosition.getMinX() - previousTextPosition.getMaxX(), + (previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2)); } private static void assertAllTextPositionsHaveSameDir(List textPositionSequences) { - assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); + assert textPositionSequences.stream() + .map(TextPositionSequence::getDir) + .allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); } private static double getAvgTextPositionHeight(List textPositionSequences) { - return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + return textPositionSequences.stream() + .mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); } @@ -142,9 +147,9 @@ public class GapDetectionService { public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) { Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(), - textPosition.getMinY(), - mainBodyTextFrame.getMaxX() - textPosition.getMaxX(), - textPosition.getHeight()); + textPosition.getMinY(), + mainBodyTextFrame.getMaxX() - textPosition.getMaxX(), + textPosition.getHeight()); gapsInCurrentLine.add(leftGap); } @@ -152,9 +157,9 @@ public class GapDetectionService { public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) { Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(), - textPosition.getMinY(), - textPosition.getMinX() - mainBodyTextFrame.getMinX(), - textPosition.getHeight()); + textPosition.getMinY(), + textPosition.getMinX() - mainBodyTextFrame.getMinX(), + textPosition.getHeight()); gapsInCurrentLine.add(leftGap); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index ca021fe..675098b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -45,7 +45,7 @@ public class RulingCleaningService { verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR); verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines); - return CleanRulings.builder().verticals(verticalAndHorizontalRulingLines.verticalLines()).horizontals(verticalAndHorizontalRulingLines.horizontalLines()).build(); + return new CleanRulings(verticalAndHorizontalRulingLines.horizontalLines(), verticalAndHorizontalRulingLines.verticalLines()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java new file mode 100644 index 0000000..72579b3 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextRulingsClassifier.java @@ -0,0 +1,99 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class TextRulingsClassifier { + + private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines + private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines + private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width, subtracted from word width + + + public static void classifyUnderlinedAndStrikethroughText(List words, CleanRulings cleanRulings) { + + for (TextPositionSequence word : words) { + if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) { + handleHorizontalText(cleanRulings, word); + } else { + handleVerticalText(cleanRulings, word); + } + } + } + + + private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) { + + float lowerY = (float) (word.getBoundingBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float upperY = (float) (word.getBoundingBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + + float strikethroughCenterX = (float) word.getBoundingBox().getCenterX(); + float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2); + + float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMinX() : word.getBoundingBox().getMaxX()); + float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2); + + float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight); + float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight); + + List rulingsIntersectingWord = cleanRulings.getVerticalsInXInterval(leftX, rightX) + .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER)) + .filter(ruling -> ruling.y1 <= lowerY && upperY <= ruling.y2) + .toList(); + + for (Ruling ruling : rulingsIntersectingWord) { + if (strikethroughCenterX - strikethroughBoxHeight < ruling.x1 && ruling.x1 < strikethroughCenterX + strikethroughBoxHeight) { + ruling.setClassification(Ruling.Classification.STRIKETROUGH); + word.setStrikethrough(true); + } + + if (underlineCenterX - underlineBoxHeight < ruling.x1 && ruling.x1 < underlineCenterX + underlineBoxHeight) { + ruling.setClassification(Ruling.Classification.UNDERLINE); + word.setUnderline(true); + } + } + } + + + private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) { + + float leftX = (float) (word.getBoundingBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + float rightX = (float) (word.getBoundingBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth()); + + float strikethroughCenterY = (float) word.getBoundingBox().getCenterY(); + float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2); + + float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBoundingBox().getMinY() : word.getBoundingBox().getMaxY()); + float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2); + + float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight); + float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight); + + List rulingsIntersectingWord = cleanRulings.getHorizontalsInYInterval(lowerY, upperY) + .stream() + .filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER)) + .filter(ruling -> ruling.x1 <= leftX && rightX <= ruling.x2) + .toList(); + + for (Ruling ruling : rulingsIntersectingWord) { + if (strikethroughCenterY - strikethroughBoxHeight < ruling.y1 && ruling.y1 < strikethroughCenterY + strikethroughBoxHeight) { + ruling.setClassification(Ruling.Classification.STRIKETROUGH); + word.setStrikethrough(true); + } + + if (underlineCenterY - underlineBoxHeight < ruling.y1 && ruling.y1 < underlineCenterY + underlineBoxHeight) { + ruling.setClassification(Ruling.Classification.UNDERLINE); + word.setUnderline(true); + } + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 0028800..3e67cfb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -38,7 +38,7 @@ public class DocstrumBlockificationService { public ClassificationPage blockify(List textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) { - CleanRulings usedRulings = rulings.getTableLines(); + CleanRulings usedRulings = rulings.withoutTextRulings(); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index fd84be5..77b8b82 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -15,11 +15,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; @Service public class DocuMineBlockificationService { @@ -34,16 +33,17 @@ public class DocuMineBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The textPositions of a page. - * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. + * @param textPositions The textPositions of a page. + * @param cleanRulings All rulings on a page * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings) { List chunkWords = new ArrayList<>(); List chunkBlockList1 = new ArrayList<>(); + CleanRulings usedRulings = cleanRulings.withoutTextRulings(); + float minX = 1000; float maxX = 0; float minY = 1000; @@ -59,12 +59,15 @@ public class DocuMineBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle() - .contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); + .contains("bold") + && !prev.getFontStyle() + .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); - Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString()); + Matcher matcher = pattern.matcher(chunkWords.stream() + .collect(Collectors.joining(" ")).toString()); boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches(); if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) { @@ -86,7 +89,11 @@ public class DocuMineBlockificationService { wasSplitted = false; cb1.setOrientation(Orientation.RIGHT); splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation + || !startFromTop + || !splitByX + || !newLineAfterSplit + || !isSplitByRuling)) { cb1.setOrientation(Orientation.LEFT); } @@ -149,11 +156,11 @@ public class DocuMineBlockificationService { if (textBlock == null) { textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); } else { TextPageBlock spatialEntity = textBlock.union(wordBlock); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); @@ -169,68 +176,17 @@ public class DocuMineBlockificationService { textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); } - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + if (textBlock != null + && textBlock.getSequences() != null + && textBlock.getSequences() + .stream() + .map(t -> round(t.getMinYDirAdj(), 3)) + .collect(toSet()).size() == 1) { textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); } return textBlock; } - - private boolean isSplitByRuling(float minX, - float minY, - float maxX, - float maxY, - TextPositionSequence word, - List horizontalRulingLines, - List verticalRulingLines) { - - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); // - } - - - private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { - - for (Ruling ruling : rulingLines) { - var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); - if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { - return true; - } - } - return false; - } - - private double round(float value, int decimalPoints) { var d = Math.pow(10, decimalPoints); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 1c7dbb5..87db5dc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -13,14 +13,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; @SuppressWarnings("all") @@ -39,9 +35,9 @@ public class RedactManagerBlockificationService { * @param visualizations * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, List cells, LayoutparsingVisualizations visualizations) { + public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) { - CleanRulings usedRulings = RectangleTransformations.extractRulings(cells); + CleanRulings usedRulings = cleanRulings.withoutTextRulings(); int indexOnPage = 0; List chunkWords = new ArrayList<>(); @@ -59,7 +55,7 @@ public class RedactManagerBlockificationService { boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontals(), usedRulings.getVerticals()); + boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox()); boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { @@ -83,7 +79,11 @@ public class RedactManagerBlockificationService { wasSplitted = false; cb1.setOrientation(Orientation.RIGHT); splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation + || !startFromTop + || !splitByX + || !newLineAfterSplit + || !isSplitByRuling)) { cb1.setOrientation(Orientation.LEFT); } @@ -152,8 +152,11 @@ public class RedactManagerBlockificationService { TextPageBlock block = (TextPageBlock) itty.next(); if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), - previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() - .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { + previous.getMaxY()) + || previous != null + && previous.getOrientation().equals(Orientation.LEFT) + && block.getOrientation().equals(Orientation.RIGHT) + && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { previous.add(block); itty.remove(); continue; @@ -162,7 +165,9 @@ public class RedactManagerBlockificationService { previous = block; } - visualizations.addTextBlockVisualizations(chunkBlockList.stream().map(tb -> (TextPageBlock) tb).toList(), textPositions.get(0).getPage()); + visualizations.addTextBlockVisualizations(chunkBlockList.stream() + .map(tb -> (TextPageBlock) tb) + .toList(), textPositions.get(0).getPage()); return new ClassificationPage(chunkBlockList); } @@ -194,11 +199,11 @@ public class RedactManagerBlockificationService { if (textBlock == null) { textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); } else { TextPageBlock spatialEntity = textBlock.union(wordBlock); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); @@ -214,68 +219,18 @@ public class RedactManagerBlockificationService { textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); } - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + if (textBlock != null + && textBlock.getSequences() != null + && textBlock.getSequences() + .stream() + .map(t -> round(t.getMinYDirAdj(), 3)) + .collect(toSet()).size() == 1) { textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); } return textBlock; } - private boolean isSplitByRuling(float minX, - float minY, - float maxX, - float maxY, - TextPositionSequence word, - List horizontalRulingLines, - List verticalRulingLines) { - - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); - } - - - private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { - - for (Ruling ruling : rulingLines) { - var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); - if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { - return true; - } - } - return false; - } - - private double round(float value, int decimalPoints) { var d = Math.pow(10, decimalPoints); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 1b6f6ca..36ee3eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -80,14 +80,18 @@ public class DocumentGraphFactory { } - public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List textBlocksToMerge) { + public void addParagraphOrHeadline(GenericSemanticNode parentNode, + TextPageBlock originalTextBlock, + Context context, + List textBlocksToMerge, + LayoutParsingType layoutParsingType) { Page page = context.getPage(originalTextBlock.getPage()); GenericSemanticNode node; if (originalTextBlock.isHeadline()) { node = Headline.builder().documentTree(context.getDocumentTree()).build(); - } else if (originalTextBlock.isToDuplicate()) { + } else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) { node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build(); } else { node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); @@ -269,8 +273,7 @@ public class DocumentGraphFactory { return pages.keySet() .stream() .filter(page -> page.getNumber() == pageIndex) - .findFirst() - .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); + .findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index f4b26eb..cca8558 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -49,8 +49,7 @@ public class SectionNodeFactory { Map> blocksPerPage = pageBlocks.stream() .collect(groupingBy(AbstractPageBlock::getPage)); - Section section = Section.builder().documentTree(context.getDocumentTree()) - .build(); + Section section = Section.builder().documentTree(context.getDocumentTree()).build(); context.getSections().add(section); blocksPerPage.keySet() @@ -121,12 +120,12 @@ public class SectionNodeFactory { case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { alreadyMerged.add(abstractPageBlock); remainingBlocks.remove(abstractPageBlock); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>()); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType); } default -> { List textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); alreadyMerged.addAll(textBlocks); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType); } } } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index f71669c..b08d402 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -159,7 +159,7 @@ public class TableNodeFactory { tableCell.setLeafTextBlock(textBlock); } else { cell.getTextBlocks() - .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList())); + .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType)); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java index 37a7122..8a89d9a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.graphics; import java.awt.geom.Rectangle2D; -import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; @@ -9,8 +8,8 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; @@ -32,33 +31,32 @@ public class GraphicExtractorService { int pageNumber, CleanRulings cleanRulings, List textPositionSequences, - List emptyTableCells, boolean graphicsRaster) { - var characterBBoxes = getCharacterBBoxes(textPositionSequences); - var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells); - var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes); - var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes); + List characterBBoxes = getCharacterBBoxes(textPositionSequences); + List classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings); GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true); - var graphicBBoxes = graphicBBDetector.findGraphicBB(); + List graphicBBoxes = graphicBBDetector.findGraphicBB(); if (graphicsRaster) { // This should only be used if ocr was performed, it is currently in an early stage and needs to be improved. graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument, - characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()), - PageInformation.fromPDPage(pageNumber, pdPage))); + characterBBoxes.stream() + .map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)) + .collect(Collectors.toList()), + PageInformation.fromPDPage(pageNumber, pdPage))); } - var filteredGraphicBBoxes = graphicBBoxes.stream() - .filter(box -> !box.intersectsAny(tableLineBBoxes, 4)) - .filter(box -> !box.intersectsAny(underLineBBoxes, 4)) - .filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4)) + List filteredGraphicBBoxes = graphicBBoxes.stream() + .filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4)) .collect(Collectors.toList()); - var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14); + List clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14); - return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList(); + return clusters.stream() + .filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50) + .toList(); } @@ -74,34 +72,13 @@ public class GraphicExtractorService { } - private List getLineBBoxesFromTableCells(List emptyTableCells) { + private List getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) { - List expandedTableLines = new ArrayList<>(); - - emptyTableCells.forEach(cell -> { - expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2))); - expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2))); - expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height))); - expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height))); - }); - - return expandedTableLines; - } - - - private List getUnderlineBBoxes(CleanRulings cleanRulings, List characterBBoxes) { - - return cleanRulings.getHorizontal() + return cleanRulings.buildAll() .stream() + .filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.OTHER)) .map(h -> new Box(h.x1, h.y1, h.x2, h.y2)) - .filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6)) .collect(Collectors.toList()); } - - private List getStrikeThroughBBoxes(CleanRulings cleanRulings, List characterBBoxes) { - - return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList()); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 174823b..189fd2e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -144,8 +144,7 @@ public class RectangleTransformations { return Collections.emptyList(); } double splitThreshold = rectangle2DList.stream() - .mapToDouble(RectangularShape::getWidth).average() - .orElse(5) * 5.0; + .mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0; List> rectangleListsWithGaps = new LinkedList<>(); List rectangleListWithoutGaps = new LinkedList<>(); @@ -182,7 +181,7 @@ public class RectangleTransformations { verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height))); }); - return CleanRulings.builder().verticals(verticalRulings).horizontals(horizontalRulings).build(); + return new CleanRulings(verticalRulings, horizontalRulings); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index d9af8bf..53bb180 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -9,6 +9,7 @@ import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; @@ -29,21 +30,21 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Autowired private LayoutParsingPipeline layoutParsingPipeline; - + @Disabled @Test public void testLayoutParserEndToEnd() { - String filePath = "files/SinglePages/VV-931175_Page1.pdf"; + String filePath = "/home/kschuettler/Dokumente/TestFiles/tables with striketrough text.pdf"; runForFile(filePath); } @Test -// @Disabled + @Disabled @SneakyThrows public void testLayoutParserEndToEndWithFolder() { - String folder = "/home/kschuettler/iqser/fforesight/layout-parser/layoutparser/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages"; + String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files"; List pdfFiles = Files.walk(Path.of(folder)) .filter(path -> path.getFileName().toString().endsWith(".pdf")) .sorted(Comparator.comparing(Path::getFileName)) @@ -77,7 +78,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { Arrays.stream(finishedEvent.message().split("\n")) .forEach(log::info); - File tmpFile = new File("/tmp/layout-E2E/" + fileName + "_VIEWER.pdf"); + File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf"); assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs(); storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/CleanRulingsTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/CleanRulingsTest.java index fd306fa..4081aa6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/CleanRulingsTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/CleanRulingsTest.java @@ -1,11 +1,14 @@ package com.knecon.fforesight.service.layoutparser.server.model; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; +import java.util.Collections; import java.util.List; +import java.util.stream.IntStream; import org.junit.jupiter.api.Test; @@ -35,4 +38,81 @@ class CleanRulingsTest { assertTrue(cleanRulings.lineBetween(a, f)); } + + @Test + public void testSingleLineInRange() { + + List horizontals = List.of(new Ruling(new Point2D.Float(0, 1), new Point2D.Float(100, 1))); + List verticals = List.of(new Ruling(new Point2D.Float(1, 0), new Point2D.Float(1, 100))); + + CleanRulings cleanRulings = new CleanRulings(horizontals, verticals); + + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(1, 10).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(100, 101).size()); + assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(1 - 1e-5f, 1 + 1e-5f).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size()); + + assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size()); + assertEquals(1, cleanRulings.getHorizontalsInYInterval(1, 10).size()); + assertEquals(0, cleanRulings.getHorizontalsInYInterval(100, 1001).size()); + } + + + @Test + public void testLinesInRange() { + + List horizontals = IntStream.range(0, 101).boxed() + .map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y))) + .toList(); + List verticals = IntStream.range(0, 101).boxed() + .map(x -> new Ruling(new Point2D.Float(x, 0), new Point2D.Float(x, 100))) + .toList(); + CleanRulings cleanRulings = new CleanRulings(horizontals, verticals); + + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size()); + assertEquals(10, cleanRulings.getVerticalsInXInterval(1, 10).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(100, 101).size()); + assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(-1e-5f, 1e-5f).size()); + assertEquals(1, cleanRulings.getVerticalsInXInterval(0, 0).size()); + assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size()); + + assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size()); + assertEquals(10, cleanRulings.getHorizontalsInYInterval(1, 10).size()); + assertEquals(1, cleanRulings.getHorizontalsInYInterval(100, 1001).size()); + } + + + @Test + public void testLinesInRangePerformance() { + + List horizontals = IntStream.range(0, (int) 1e6).boxed() + .map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y))) + .toList(); + CleanRulings cleanRulings = new CleanRulings(horizontals, Collections.emptyList()); + + float startY = 29; + float endY = 3000; + long start = System.currentTimeMillis(); + var result = cleanRulings.getHorizontalsInYInterval(startY, endY); + long time = System.currentTimeMillis() - start; + + start = System.currentTimeMillis(); + var result2 = cleanRulings.getHorizontals() + .stream() + .filter(ruling -> ruling.getY1() >= startY && ruling.getY1() <= endY) + .toList(); + long time2 = System.currentTimeMillis() - start; + + assertEquals(result, result2); + assertTrue(time < time2); + + } + } \ No newline at end of file