Merge branch 'experimental_features' into 'master'
DM-305: port rules to new schema See merge request redactmanager/redaction-service!44
This commit is contained in:
commit
36fcc88671
@ -26,6 +26,8 @@ public abstract class AbstractPageBlock {
|
||||
@JsonIgnore
|
||||
protected int page;
|
||||
|
||||
int columnIndex;
|
||||
|
||||
@JsonIgnore
|
||||
private Orientation orientation = Orientation.NONE;
|
||||
|
||||
@ -77,4 +79,10 @@ public abstract class AbstractPageBlock {
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(AbstractPageBlock atc) {
|
||||
|
||||
return this.minX <= atc.getMaxX() && this.maxX >= atc.getMinX();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,14 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
|
||||
/**
 * Describes a single column by its index, its {@link ColumnType} and its bounding box.
 * All three values are passed through the Lombok-generated all-args constructor, in field order.
 */
@AllArgsConstructor
|
||||
public class Column {
|
||||
|
||||
// Index of the column. NOTE(review): ordering convention (e.g. 0-based, left to right) is not visible here - confirm against callers.
int index;
|
||||
// Kind of column, see ColumnType.
ColumnType columnType;
|
||||
// Bounding box of the column.
Rectangle2D bBox;
|
||||
|
||||
}
|
||||
@ -0,0 +1,6 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model;
|
||||
|
||||
/**
 * Kinds of columns.
 * NOTE(review): the constant names suggest columns delimited by ruling lines versus columns
 * delimited by whitespace distance - confirm against the code that creates Column instances.
 */
public enum ColumnType {
|
||||
RULING,
|
||||
DISTANCE
|
||||
}
|
||||
@ -0,0 +1,149 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ColumnDetectionService {
|
||||
|
||||
private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6;
|
||||
private static final int MAX_NUMBER_OF_COLUMNS = 4;
|
||||
|
||||
|
||||
public List<Rectangle2D> detectColumns(List<TextPositionSequence> textPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (textPositionSequences.size() < 2) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
|
||||
List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findLinesWithGaps(textPositionSequences);
|
||||
|
||||
Map<Integer, List<Integer>> linesWithMatchingGapIndices = new HashMap<>();
|
||||
for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) {
|
||||
linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns));
|
||||
}
|
||||
|
||||
int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size());
|
||||
if (optimalNumberOfColumns == 1) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> findConsecutiveLinesWithMatchingGaps(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
||||
|
||||
List<Boolean> booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns);
|
||||
return findConsecutiveTrueIndicesWithMaxLengthRun(booleans);
|
||||
}
|
||||
|
||||
|
||||
private List<Boolean> lineHasMatchingGap(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
||||
|
||||
return linesWithGaps.stream()
|
||||
.map(blocksWithGaps -> IntStream.range(1, numberOfColumns)
|
||||
.allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex))))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> findConsecutiveTrueIndicesWithMaxLengthRun(List<Boolean> booleans) {
|
||||
|
||||
List<Integer> maxConsecutiveTrueIndices = new LinkedList<>();
|
||||
List<Integer> currentConsecutiveTrueIndices = new LinkedList<>();
|
||||
for (int i = 0; i < booleans.size(); i++) {
|
||||
if (!booleans.get(i)) {
|
||||
if (currentConsecutiveTrueIndices.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
||||
maxConsecutiveTrueIndices = currentConsecutiveTrueIndices;
|
||||
}
|
||||
currentConsecutiveTrueIndices = new LinkedList<>();
|
||||
continue;
|
||||
}
|
||||
currentConsecutiveTrueIndices.add(i);
|
||||
}
|
||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
||||
return currentConsecutiveTrueIndices;
|
||||
}
|
||||
return maxConsecutiveTrueIndices;
|
||||
}
|
||||
|
||||
|
||||
private static int findOptimalNumberOfColumns(Map<Integer, List<Integer>> linesWithMatchingGapIndices, Integer numberOfLines) {
|
||||
|
||||
return linesWithMatchingGapIndices.entrySet()
|
||||
.stream()
|
||||
.max(comparePercentages(numberOfLines))
|
||||
.filter(entry -> percentageIsAboveThreshold(entry, numberOfLines))
|
||||
.map(Map.Entry::getKey)
|
||||
.orElse(1);
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle2D> buildColumns(Rectangle2D mainBodyTextFrame, List<Rectangle2D> rectanglesToMerge, int optimalColumnCount) {
|
||||
|
||||
if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
|
||||
double maxY = rectanglesToMerge.get(0).getMaxY();
|
||||
double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY();
|
||||
|
||||
List<Rectangle2D> columns = new LinkedList<>();
|
||||
double width = mainBodyTextFrame.getWidth() / optimalColumnCount;
|
||||
double height = maxY - minY;
|
||||
for (int i = 0; i < optimalColumnCount; i++) {
|
||||
columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinY() + i * width, minY, width, height));
|
||||
}
|
||||
return columns;
|
||||
}
|
||||
|
||||
|
||||
private Comparator<Map.Entry<Integer, List<Integer>>> comparePercentages(Integer numberOfLines) {
|
||||
|
||||
return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines));
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle2D> getLinesWithMatchingGaps(List<Integer> linesWithMatchingGapIndices, List<List<Rectangle2D>> linesWithGaps) {
|
||||
|
||||
return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
|
||||
private boolean percentageIsAboveThreshold(Map.Entry<Integer, List<Integer>> entry, Integer numberOfLines) {
|
||||
|
||||
return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) {
|
||||
|
||||
return ((double) numberOfMatchingLines) / ((double) numberOfLines);
|
||||
}
|
||||
|
||||
|
||||
private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) {
|
||||
|
||||
return (pageWidth / numberOfColumns) * columnIndex;
|
||||
}
|
||||
|
||||
|
||||
private Boolean noBlocksIntersectX(List<Rectangle2D> blocksWithGaps, double x) {
|
||||
|
||||
return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,115 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.TextPositionSequenceComparator;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class LineDetectionService {
|
||||
|
||||
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
|
||||
|
||||
public static List<List<Rectangle2D>> findLinesWithGaps(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
if (textPositionSequences.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
final double avgTextPositionHeight = textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
|
||||
Context context = Context.init();
|
||||
|
||||
List<TextPositionSequence> sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList();
|
||||
|
||||
var previousTextPosition = sortedTextPositionSequence.get(0);
|
||||
context.textPositionsToMerge.add(previousTextPosition);
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) {
|
||||
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
|
||||
addBlockToLine(context);
|
||||
startNewLine(currentTextPosition, context);
|
||||
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
|
||||
addBlockToLine(context);
|
||||
startNewBlock(currentTextPosition, context);
|
||||
} else {
|
||||
context.textPositionsToMerge.add(currentTextPosition);
|
||||
}
|
||||
previousTextPosition = currentTextPosition;
|
||||
}
|
||||
addBlockToLine(context);
|
||||
return context.linesWithGaps;
|
||||
}
|
||||
|
||||
|
||||
private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) {
|
||||
|
||||
return !previousTextPosition.getDir().equals(currentTextPosition.getDir());
|
||||
}
|
||||
|
||||
|
||||
private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight;
|
||||
}
|
||||
|
||||
|
||||
private static void startNewBlock(TextPositionSequence currentTextPosition, Context context) {
|
||||
|
||||
context.textPositionsToMerge = new LinkedList<>();
|
||||
context.textPositionsToMerge.add(currentTextPosition);
|
||||
}
|
||||
|
||||
|
||||
private static void addBlockToLine(Context context) {
|
||||
|
||||
context.blocksInLine.add(textPositionBBox(context.textPositionsToMerge));
|
||||
}
|
||||
|
||||
|
||||
private static void startNewLine(TextPositionSequence current, Context context) {
|
||||
|
||||
context.blocksInLine = new LinkedList<>();
|
||||
startNewBlock(current, context);
|
||||
context.linesWithGaps.add(context.blocksInLine);
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
||||
}
|
||||
|
||||
|
||||
@AllArgsConstructor
|
||||
private class Context {
|
||||
|
||||
List<List<Rectangle2D>> linesWithGaps;
|
||||
List<Rectangle2D> blocksInLine;
|
||||
List<TextPositionSequence> textPositionsToMerge;
|
||||
|
||||
|
||||
public static Context init() {
|
||||
|
||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
||||
initialLinesWithGaps.add(initialBlocksInLine);
|
||||
return new Context(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -29,6 +29,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.mo
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.FileUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations;
|
||||
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -129,6 +130,7 @@ public class PdfSegmentationService {
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
// var columns = ColumnDetectionService.detectColumns(stripper.getTextPositionSequences(), RectangleTransformations.toRectangle2D(pdPage.getCropBox()));
|
||||
ClassificationPage page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
page.setRotation(rotation);
|
||||
|
||||
@ -23,7 +23,7 @@ import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.ut
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "RedactManager")
|
||||
public class RedactManagerBlockificationService implements BlockificationService{
|
||||
public class RedactManagerBlockificationService implements BlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
@ -82,7 +82,6 @@ public class DocumentGraphFactory {
|
||||
page.getMainBody().add(node);
|
||||
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
|
||||
textBlocks.add(originalTextBlock);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.fromContext(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
@ -181,7 +180,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlockFromInteger(header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage;
|
||||
@ -80,9 +81,10 @@ public class SectionNodeFactory {
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
// List<TextPageBlock> textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientationUntilConvergence((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
List<TextPageBlock> textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of((TextPageBlock) abstractPageBlock), remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocksToMerge);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocksToMerge);
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
@ -162,15 +164,30 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsYAndSameOrientationUntilConvergence(TextPageBlock originalTextBlocks,
|
||||
List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractPageBlock -> !abstractPageBlock.equals(atc))
|
||||
.filter(abstractPageBlock -> abstractPageBlock.getPage() == atc.getPage())
|
||||
.filter(abstractPageBlock -> abstractPageBlock.getOrientation().equals(atc.getOrientation()))
|
||||
.filter(abstractPageBlock -> abstractPageBlock.intersectsY(atc))
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TextPageBlock)
|
||||
.map(abstractPageBlock -> (TextPageBlock) abstractPageBlock)
|
||||
int previousCount = 1;
|
||||
List<TextPageBlock> alignedBlocks = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of(originalTextBlocks), pageBlocks);
|
||||
while (previousCount < alignedBlocks.size()) {
|
||||
alignedBlocks = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(alignedBlocks, pageBlocks);
|
||||
previousCount = alignedBlocks.size();
|
||||
}
|
||||
return alignedBlocks;
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List<TextPageBlock> textBlocksToMerge, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return Stream.concat(pageBlocks.stream()
|
||||
.filter(abstractPageBlock -> !textBlocksToMerge.contains(abstractPageBlock))
|
||||
.filter(abstractPageBlock -> textBlocksToMerge.stream().allMatch(textBlockToMerge -> abstractPageBlock.getPage() == textBlockToMerge.getPage()))
|
||||
.filter(abstractPageBlock -> textBlocksToMerge.stream().allMatch(textBlockToMerge -> abstractPageBlock.getOrientation().equals(textBlockToMerge.getOrientation())))
|
||||
.filter(abstractPageBlock -> textBlocksToMerge.stream().anyMatch(abstractPageBlock::intersectsY))
|
||||
//.filter(abstractPageBlock -> textBlocksToMerge.stream().anyMatch(abstractPageBlock::intersectsX))
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TextPageBlock)
|
||||
.map(abstractPageBlock -> (TextPageBlock) abstractPageBlock), //
|
||||
textBlocksToMerge.stream())//
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -43,7 +43,7 @@ public class TextBlockFactory {
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
public AtomicTextBlock emptyTextBlockFromInteger(SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
|
||||
@ -104,6 +104,12 @@ public class RectangleTransformations {
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(PDRectangle cropBox) {
|
||||
|
||||
return new Rectangle2D.Double(cropBox.getLowerLeftX(), cropBox.getLowerLeftY(), cropBox.getWidth(), cropBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
@ -133,7 +139,7 @@ public class RectangleTransformations {
|
||||
@Override
|
||||
public Function<BBox, Rectangle2D> finisher() {
|
||||
|
||||
return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY);
|
||||
return BBox::toRectangle2D;
|
||||
}
|
||||
|
||||
|
||||
@ -154,6 +160,15 @@ public class RectangleTransformations {
|
||||
Double upperRightY;
|
||||
|
||||
|
||||
public Rectangle2D toRectangle2D() {
|
||||
|
||||
if (lowerLeftX == null || lowerLeftY == null || upperRightX == null || upperRightY == null) {
|
||||
return new Rectangle2D.Double(0, 0, 0, 0);
|
||||
}
|
||||
return new Rectangle2D.Double(lowerLeftX, lowerLeftY, upperRightX - lowerLeftX, upperRightY - lowerLeftY);
|
||||
}
|
||||
|
||||
|
||||
public void addRectangle(Rectangle2D rectangle2D) {
|
||||
|
||||
double lowerLeftX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
|
||||
|
||||
@ -0,0 +1,58 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.PdfVisualisationUtility;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.RectangleTransformations;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class ColumnDetectionServiceTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testColumnDetection() {
|
||||
|
||||
String filename = "files/Documine/Flora/ProblemDocs/S37Struktur.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
|
||||
PDDocument pdDocument = PDDocument.load(inputStream);
|
||||
System.out.println("start column detection");
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
List<Rectangle2D> columns = ColumnDetectionService.detectColumns(stripper.getTextPositionSequences(), RectangleTransformations.toRectangle2D(pdPage.getCropBox()));
|
||||
System.out.printf("found %d columns on page %d%n", columns.size(), pageNumber);
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, columns, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
|
||||
System.out.printf("finished col detection, took %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
try (var out = new FileOutputStream(tmpFileName)) {
|
||||
pdDocument.save(out);
|
||||
pdDocument.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,79 @@
|
||||
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper;
|
||||
import com.iqser.red.service.redaction.v1.server.layoutparsing.document.utils.PdfVisualisationUtility;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class LineDetectionServiceTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testLineDetection() {
|
||||
|
||||
String filename = "files/BDR/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_LINES.pdf";
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
|
||||
PDDocument pdDocument = PDDocument.load(inputStream);
|
||||
System.out.println("start column detection");
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findLinesWithGaps(stripper.getTextPositionSequences());
|
||||
System.out.printf("found %d lines on page %d%n", linesWithGaps.size(), pageNumber);
|
||||
for (int i = 0; i < linesWithGaps.size(); i++) {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, linesWithGaps.get(i), PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawText(String.format("%d", i),
|
||||
pdDocument,
|
||||
new Point2D.Double(linesWithGaps.get(i).get(0).getX() - (5 + (5 * countNumberOfDigits(i))), linesWithGaps.get(i).get(0).getY() + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
System.out.printf("finished line detection, took %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
try (var out = new FileOutputStream(tmpFileName)) {
|
||||
pdDocument.save(out);
|
||||
pdDocument.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private int countNumberOfDigits(int num) {
|
||||
|
||||
if (num == 0) {
|
||||
return 1;
|
||||
}
|
||||
int count = 0;
|
||||
for (; num != 0; num /= 10, ++count) {
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
}
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user