diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/AbstractPageBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/AbstractPageBlock.java index a0d2caef..d4a49133 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/AbstractPageBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/AbstractPageBlock.java @@ -26,6 +26,8 @@ public abstract class AbstractPageBlock { @JsonIgnore protected int page; + int columnIndex; + @JsonIgnore private Orientation orientation = Orientation.NONE; @@ -77,4 +79,10 @@ public abstract class AbstractPageBlock { return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY(); } + + public boolean intersectsX(AbstractPageBlock atc) { + + return this.minX <= atc.getMaxX() && this.maxX >= atc.getMinX(); + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/Column.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/Column.java new file mode 100644 index 00000000..ffd4a37d --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/Column.java @@ -0,0 +1,14 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; + +import java.awt.geom.Rectangle2D; + +import lombok.AllArgsConstructor; + +@AllArgsConstructor +public class Column { + + int index; + ColumnType columnType; + Rectangle2D bBox; + +} diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ColumnType.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ColumnType.java new file mode 100644 index 00000000..d2212c42 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ColumnType.java @@ -0,0 +1,6 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; + +public enum ColumnType { + RULING, + DISTANCE +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ColumnDetectionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ColumnDetectionService.java new file mode 100644 index 00000000..27a15912 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ColumnDetectionService.java @@ -0,0 +1,149 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; + +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.stream.IntStream; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class ColumnDetectionService { + + private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6; + private static final int MAX_NUMBER_OF_COLUMNS = 4; + + + public List 
detectColumns(List textPositionSequences, Rectangle2D mainBodyTextFrame) { + + if (textPositionSequences.size() < 2) { + return List.of(mainBodyTextFrame); + } + + List> linesWithGaps = LineDetectionService.findLinesWithGaps(textPositionSequences); + + Map> linesWithMatchingGapIndices = new HashMap<>(); + for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) { + linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns)); + } + + int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size()); + if (optimalNumberOfColumns == 1) { + return List.of(mainBodyTextFrame); + } + return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns); + } + + + private static List findConsecutiveLinesWithMatchingGaps(List> linesWithGaps, double width, int numberOfColumns) { + + List booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns); + return findConsecutiveTrueIndicesWithMaxLengthRun(booleans); + } + + + private List lineHasMatchingGap(List> linesWithGaps, double width, int numberOfColumns) { + + return linesWithGaps.stream() + .map(blocksWithGaps -> IntStream.range(1, numberOfColumns) + .allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex)))) + .toList(); + } + + + private List findConsecutiveTrueIndicesWithMaxLengthRun(List booleans) { + + List maxConsecutiveTrueIndices = new LinkedList<>(); + List currentConsecutiveTrueIndices = new LinkedList<>(); + for (int i = 0; i < booleans.size(); i++) { + if (!booleans.get(i)) { + if (currentConsecutiveTrueIndices.isEmpty()) { + continue; + } + if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) { + maxConsecutiveTrueIndices = currentConsecutiveTrueIndices; + } + 
currentConsecutiveTrueIndices = new LinkedList<>(); + continue; + } + currentConsecutiveTrueIndices.add(i); + } + if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) { + return currentConsecutiveTrueIndices; + } + return maxConsecutiveTrueIndices; + } + + + private static int findOptimalNumberOfColumns(Map> linesWithMatchingGapIndices, Integer numberOfLines) { + + return linesWithMatchingGapIndices.entrySet() + .stream() + .max(comparePercentages(numberOfLines)) + .filter(entry -> percentageIsAboveThreshold(entry, numberOfLines)) + .map(Map.Entry::getKey) + .orElse(1); + } + + + private List buildColumns(Rectangle2D mainBodyTextFrame, List rectanglesToMerge, int optimalColumnCount) { + + if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) { + return List.of(mainBodyTextFrame); + } + + double maxY = rectanglesToMerge.get(0).getMaxY(); + double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY(); + + List columns = new LinkedList<>(); + double width = mainBodyTextFrame.getWidth() / optimalColumnCount; + double height = maxY - minY; + for (int i = 0; i < optimalColumnCount; i++) { + columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX() + i * width, minY, width, height)); /* column i starts i column-widths right of the frame's left edge */ + } + return columns; + } + + + private Comparator>> comparePercentages(Integer numberOfLines) { + + return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines)); + } + + + private List getLinesWithMatchingGaps(List linesWithMatchingGapIndices, List> linesWithGaps) { + + return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList(); + } + + + private boolean percentageIsAboveThreshold(Map.Entry> entry, Integer numberOfLines) { + + return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD; + } + + + private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) { + + return ((double) 
numberOfMatchingLines) / ((double) numberOfLines); + } + + + private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) { + + return (pageWidth / numberOfColumns) * columnIndex; + } + + + private Boolean noBlocksIntersectX(List blocksWithGaps, double x) { + + return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x); + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/LineDetectionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/LineDetectionService.java new file mode 100644 index 00000000..758b9890 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/LineDetectionService.java @@ -0,0 +1,115 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; + +import java.awt.geom.Rectangle2D; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionSequenceComparator; + +import lombok.AllArgsConstructor; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class LineDetectionService { + + private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines + + + public static List> findLinesWithGaps(List textPositionSequences) { + + if (textPositionSequences.isEmpty()) { + return Collections.emptyList(); + } + + final double 
avgTextPositionHeight = textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + + Context context = Context.init(); + + List sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList(); + + var previousTextPosition = sortedTextPositionSequence.get(0); + context.textPositionsToMerge.add(previousTextPosition); + for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) { + if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) { + addBlockToLine(context); + startNewLine(currentTextPosition, context); + } else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) { + addBlockToLine(context); + startNewBlock(currentTextPosition, context); + } else { + context.textPositionsToMerge.add(currentTextPosition); + } + previousTextPosition = currentTextPosition; + } + addBlockToLine(context); + return context.linesWithGaps; + } + + + private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + + return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); + } + + + private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) { + + return !previousTextPosition.getDir().equals(currentTextPosition.getDir()); + } + + + private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + + return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight; + } + + + private static void startNewBlock(TextPositionSequence currentTextPosition, Context 
context) { + + context.textPositionsToMerge = new LinkedList<>(); + context.textPositionsToMerge.add(currentTextPosition); + } + + + private static void addBlockToLine(Context context) { + + context.blocksInLine.add(textPositionBBox(context.textPositionsToMerge)); + } + + + private static void startNewLine(TextPositionSequence current, Context context) { + + context.blocksInLine = new LinkedList<>(); + startNewBlock(current, context); + context.linesWithGaps.add(context.blocksInLine); + } + + + private Rectangle2D textPositionBBox(List textPositionSequences) { + + return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList()); + } + + + @AllArgsConstructor + private class Context { + + List> linesWithGaps; + List blocksInLine; + List textPositionsToMerge; + + + public static Context init() { + + List> initialLinesWithGaps = new LinkedList<>(); + List initialBlocksInLine = new LinkedList<>(); + initialLinesWithGaps.add(initialBlocksInLine); + return new Context(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>()); + } + + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java index 913db6c9..94227fbc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java @@ -29,6 +29,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.mo import 
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.FileUtils; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations; import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; import lombok.RequiredArgsConstructor; @@ -129,6 +130,7 @@ public class PdfSegmentationService { stripper.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight()); + // var columns = ColumnDetectionService.detectColumns(stripper.getTextPositionSequences(), RectangleTransformations.toRectangle2D(pdPage.getCropBox())); ClassificationPage page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); page.setRotation(rotation); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RedactManagerBlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RedactManagerBlockificationService.java index d303e79e..5c4fcf1c 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RedactManagerBlockificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RedactManagerBlockificationService.java @@ -23,7 +23,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.ut @Service @SuppressWarnings("all") @ConditionalOnProperty(prefix = "application", name = "type", 
havingValue = "RedactManager") -public class RedactManagerBlockificationService implements BlockificationService{ +public class RedactManagerBlockificationService implements BlockificationService { static final float THRESHOLD = 1f; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java index f517a93a..a9701164 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java @@ -82,7 +82,6 @@ public class DocumentGraphFactory { page.getMainBody().add(node); List textBlocks = new ArrayList<>(textBlocksToMerge); - textBlocks.add(originalTextBlock); AtomicTextBlock textBlock = context.textBlockFactory.fromContext(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); List treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node); node.setLeafTextBlock(textBlock); @@ -181,7 +180,7 @@ public class DocumentGraphFactory { Page page = context.getPage(pageIndex); Header header = Header.builder().documentTree(context.getDocumentTree()).build(); - AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); + AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlockFromInteger(header, 0, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); header.setTreeId(tocId); header.setLeafTextBlock(textBlock); diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java index d162304d..4f675298 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java @@ -9,6 +9,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Stream; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage; @@ -80,9 +81,10 @@ public class SectionNodeFactory { remainingBlocks.removeAll(alreadyMerged); if (abstractPageBlock instanceof TextPageBlock) { - List textBlocks = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(abstractPageBlock, remainingBlocks); - alreadyMerged.addAll(textBlocks); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); +// List textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientationUntilConvergence((TextPageBlock) abstractPageBlock, remainingBlocks); + List textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of((TextPageBlock) abstractPageBlock), remainingBlocks); + alreadyMerged.addAll(textBlocksToMerge); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocksToMerge); } else if (abstractPageBlock instanceof TablePageBlock 
tablePageBlock) { List tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); alreadyMerged.addAll(tablesToMerge); @@ -162,15 +164,30 @@ public class SectionNodeFactory { } - private List findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(AbstractPageBlock atc, List pageBlocks) { + private List findTextBlocksWithSameClassificationAndAlignsYAndSameOrientationUntilConvergence(TextPageBlock originalTextBlocks, + List pageBlocks) { - return pageBlocks.stream() - .filter(abstractPageBlock -> !abstractPageBlock.equals(atc)) - .filter(abstractPageBlock -> abstractPageBlock.getPage() == atc.getPage()) - .filter(abstractPageBlock -> abstractPageBlock.getOrientation().equals(atc.getOrientation())) - .filter(abstractPageBlock -> abstractPageBlock.intersectsY(atc)) - .filter(abstractPageBlock -> abstractPageBlock instanceof TextPageBlock) - .map(abstractPageBlock -> (TextPageBlock) abstractPageBlock) + int previousCount = 1; + List alignedBlocks = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of(originalTextBlocks), pageBlocks); + while (previousCount < alignedBlocks.size()) { + previousCount = alignedBlocks.size(); /* remember the size before re-expanding so the loop repeats until no new block is added */ + alignedBlocks = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(alignedBlocks, pageBlocks); + } + return alignedBlocks; + } + + + private static List findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List textBlocksToMerge, List pageBlocks) { + + return Stream.concat(pageBlocks.stream() + .filter(abstractPageBlock -> !textBlocksToMerge.contains(abstractPageBlock)) + .filter(abstractPageBlock -> textBlocksToMerge.stream().allMatch(textBlockToMerge -> abstractPageBlock.getPage() == textBlockToMerge.getPage())) + .filter(abstractPageBlock -> textBlocksToMerge.stream().allMatch(textBlockToMerge -> abstractPageBlock.getOrientation().equals(textBlockToMerge.getOrientation()))) + .filter(abstractPageBlock -> 
textBlocksToMerge.stream().anyMatch(abstractPageBlock::intersectsY)) + //.filter(abstractPageBlock -> textBlocksToMerge.stream().anyMatch(abstractPageBlock::intersectsX)) + .filter(abstractPageBlock -> abstractPageBlock instanceof TextPageBlock) + .map(abstractPageBlock -> (TextPageBlock) abstractPageBlock), // + textBlocksToMerge.stream())// .toList(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/TextBlockFactory.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/TextBlockFactory.java index 12c0157f..dd79c6d2 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/TextBlockFactory.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/TextBlockFactory.java @@ -43,7 +43,7 @@ public class TextBlockFactory { } - public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) { + public AtomicTextBlock emptyTextBlockFromInteger(SemanticNode parent, Integer numberOnPage, Page page) { long idx = textBlockIdx; textBlockIdx++; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java index 4c101e57..122bfaab 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java @@ -104,6 +104,12 @@ public class RectangleTransformations { } + public static Rectangle2D toRectangle2D(PDRectangle cropBox) { + + return new Rectangle2D.Double(cropBox.getLowerLeftX(), cropBox.getLowerLeftY(), cropBox.getWidth(), cropBox.getHeight()); + } + + private static class Rectangle2DBBoxCollector implements Collector { @Override @@ -133,7 +139,7 @@ public class RectangleTransformations { @Override public Function finisher() { - return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY); + return BBox::toRectangle2D; } @@ -154,6 +160,15 @@ public class RectangleTransformations { Double upperRightY; + public Rectangle2D toRectangle2D() { + + if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) { + return new Rectangle2D.Double(0, 0, 0, 0); + } + return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY); + } + + public void addRectangle(Rectangle2D rectangle2D) { double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ColumnDetectionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ColumnDetectionServiceTest.java new file mode 100644 index 00000000..0d0712dc --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ColumnDetectionServiceTest.java @@ -0,0 +1,58 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; + 
+import java.awt.geom.Rectangle2D; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.util.List; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.PdfVisualisationUtility; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations; + +import lombok.SneakyThrows; + +class ColumnDetectionServiceTest { + + @Test + @SneakyThrows + public void testColumnDetection() { + + String filename = "files/Documine/Flora/ProblemDocs/S37Struktur.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + + PDDocument pdDocument = PDDocument.load(inputStream); + System.out.println("start column detection"); + long start = System.currentTimeMillis(); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); + + List columns = ColumnDetectionService.detectColumns(stripper.getTextPositionSequences(), RectangleTransformations.toRectangle2D(pdPage.getCropBox())); + System.out.printf("found %d columns on page %d%n", columns.size(), pageNumber); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, columns, PdfVisualisationUtility.Options.builder().stroke(true).build()); + } + + System.out.printf("finished col detection, took %d ms", System.currentTimeMillis() - start); 
+ + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + } + + } + +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/LineDetectionServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/LineDetectionServiceTest.java new file mode 100644 index 00000000..b8480776 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/LineDetectionServiceTest.java @@ -0,0 +1,79 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; + +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.util.List; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.PdfVisualisationUtility; + +import lombok.SneakyThrows; + +class LineDetectionServiceTest { + + @Test + @Disabled + @SneakyThrows + public void testLineDetection() { + + String filename = "files/BDR/Plenarprotokoll 1 (keine Druchsache!) 
(1).pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_LINES.pdf"; + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + + PDDocument pdDocument = PDDocument.load(inputStream); + System.out.println("start column detection"); + long start = System.currentTimeMillis(); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); + + List> linesWithGaps = LineDetectionService.findLinesWithGaps(stripper.getTextPositionSequences()); + System.out.printf("found %d lines on page %d%n", linesWithGaps.size(), pageNumber); + for (int i = 0; i < linesWithGaps.size(); i++) { + PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, linesWithGaps.get(i), PdfVisualisationUtility.Options.builder().stroke(true).build()); + PdfVisualisationUtility.drawText(String.format("%d", i), + pdDocument, + new Point2D.Double(linesWithGaps.get(i).get(0).getX() - (5 + (5 * countNumberOfDigits(i))), linesWithGaps.get(i).get(0).getY() + 2), + pageNumber, + PdfVisualisationUtility.Options.builder().stroke(true).build()); + + } + } + + System.out.printf("finished line detection, took %d ms", System.currentTimeMillis() - start); + + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + } + } + + + private int countNumberOfDigits(int num) { + + if (num == 0) { + return 1; + } + int count = 0; + for (; num != 0; num /= 10, ++count) { + } + return count; + } + +} \ No newline at end of file diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/S37Struktur.pdf 
b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/S37Struktur.pdf new file mode 100644 index 00000000..8123bf3c Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Documine/Flora/ProblemDocs/S37Struktur.pdf differ