TAAS-41: add (inactive) experimental services

This commit is contained in:
Kilian Schuettler 2023-07-24 15:58:06 +02:00
parent 241a32cb4f
commit 526b1c5ad3
17 changed files with 503 additions and 281 deletions

View File

@ -12,12 +12,16 @@ import lombok.experimental.FieldDefaults;
@Getter @Getter
@AllArgsConstructor @AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class Gaps { public class GapInformation {
List<List<Rectangle2D>> xGaps ;
List<List<Rectangle2D>> yGaps ; List<List<Rectangle2D>> xGaps;
List<List<Rectangle2D>> yGaps;
public GapInformation() {
public Gaps() {
xGaps = new LinkedList<>(); xGaps = new LinkedList<>();
yGaps = new LinkedList<>(); yGaps = new LinkedList<>();
} }
} }

View File

@ -1,5 +1,23 @@
package com.knecon.fforesight.service.layoutparser.processor.model; package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Getter
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LineInformation { public class LineInformation {
List<Rectangle2D> lineBBox;
List<List<TextPositionSequence>> sequencesByLines;
List<List<Rectangle2D>> bBoxWithGapsByLines;
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
} }

View File

@ -12,9 +12,10 @@ import lombok.Getter;
@Getter @Getter
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
public class PageInformation { public class PageContents {
List<TextPositionSequence> sortedTextPositionSequences; List<TextPositionSequence> sortedTextPositionSequences;
Rectangle2D cropBox; Rectangle2D cropBox;
Rectangle2D mediaBox;
} }

View File

@ -1,5 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.model; package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import lombok.AllArgsConstructor;
import lombok.Getter;
@Getter
@AllArgsConstructor
public class PageInformation { public class PageInformation {
PageContents pageContents;
LineInformation lineInformation;
Rectangle2D mainBodyTextFrame;
GapInformation gapInformation;
} }

View File

@ -1,149 +1,87 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.stream.Stream;
import java.util.stream.IntStream;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class DividingColumnDetectionService { public class DividingColumnDetectionService {
private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6; private static final int MAX_NUMBER_OF_COLUMNS = 200;
private static final int MAX_NUMBER_OF_COLUMNS = 4;
private static final int LINE_COUNT_THRESHOLD = 5;
public List<Rectangle2D> detectColumns(List<TextPositionSequence> textPositionSequences, Rectangle2D mainBodyTextFrame) { public List<Rectangle2D> detectColumns(PageContents pageContents) {
if (textPositionSequences.size() < 2) {
return List.of(mainBodyTextFrame); if (pageContents.getSortedTextPositionSequences().size() < 2) {
return List.of(pageContents.getCropBox());
} }
List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findTextBlockInLines(textPositionSequences); GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox());
Map<Integer, List<Integer>> linesWithMatchingGapIndices = new HashMap<>(); return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) {
linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns));
}
int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size());
if (optimalNumberOfColumns == 1) {
return List.of(mainBodyTextFrame);
}
return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns);
} }
private static List<Integer> findConsecutiveLinesWithMatchingGaps(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) { public List<Rectangle2D> detectColumnsFromLines(List<List<Rectangle2D>> gaps, Rectangle2D mainBodyTextFrame) {
List<Boolean> booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns); List<List<Line2D>> allColumnParts = new ArrayList<>(MAX_NUMBER_OF_COLUMNS);
return findConsecutiveTrueIndicesWithMaxLengthRun(booleans); for (int columnIndex = 1; columnIndex < MAX_NUMBER_OF_COLUMNS; columnIndex++) {
double x = calculateGapLocation(columnIndex, MAX_NUMBER_OF_COLUMNS, mainBodyTextFrame.getWidth()) + mainBodyTextFrame.getMinX();
double currentMinY = mainBodyTextFrame.getMaxY();
double currentMaxY = 0;
int currentLineCount = 0;
List<Line2D> columnParts = new LinkedList<>();
allColumnParts.add(columnParts);
for (int lineNumber = 0; lineNumber < gaps.size(); lineNumber++) {
List<Rectangle2D> textBlocksInLine = gaps.get(lineNumber);
if (anyBlockIntersectX(textBlocksInLine, x)) {
if (lineNumber == gaps.size() - 1) {
currentMaxY = mainBodyTextFrame.getMinY();
} else {
currentMaxY = gaps.get(lineNumber + 1).get(0).getMinY();
}
currentLineCount++;
} else {
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
}
currentMinY = gaps.get(lineNumber).get(0).getMaxY();
currentMaxY = currentMinY;
currentLineCount = 0;
}
}
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
}
}
return Stream.concat(Stream.of(mainBodyTextFrame),
allColumnParts.stream()
.flatMap(columnParts -> columnParts.stream().map(line -> new Rectangle2D.Double(line.getX2(), line.getY2(), 1, Math.abs(line.getY2() - line.getY1()))))
.map(r -> (Rectangle2D) r)).toList();
} }
private List<Boolean> lineHasMatchingGap(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) { private static boolean anyBlockIntersectX(List<Rectangle2D> textBlocksInLine, double x) {
return linesWithGaps.stream() return textBlocksInLine.stream().anyMatch(gap -> gap.getMinX() < x && x < gap.getMaxX());
.map(blocksWithGaps -> IntStream.range(1, numberOfColumns)
.allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex))))
.toList();
} }
private List<Integer> findConsecutiveTrueIndicesWithMaxLengthRun(List<Boolean> booleans) { private double calculateGapLocation(int columnIndex, int numberOfColumns, double pageWidth) {
List<Integer> maxConsecutiveTrueIndices = new LinkedList<>();
List<Integer> currentConsecutiveTrueIndices = new LinkedList<>();
for (int i = 0; i < booleans.size(); i++) {
if (!booleans.get(i)) {
if (currentConsecutiveTrueIndices.isEmpty()) {
continue;
}
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
maxConsecutiveTrueIndices = currentConsecutiveTrueIndices;
}
currentConsecutiveTrueIndices = new LinkedList<>();
continue;
}
currentConsecutiveTrueIndices.add(i);
}
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
return currentConsecutiveTrueIndices;
}
return maxConsecutiveTrueIndices;
}
/**
 * Picks the column count whose run of matching lines covers the largest share of all lines.
 *
 * @param linesWithMatchingGapIndices candidate column count mapped to the indices of its matching lines
 * @param numberOfLines               total number of lines the share is computed against
 * @return the best candidate column count, or 1 when there is no candidate or the best candidate
 *         does not pass the threshold check
 */
private static int findOptimalNumberOfColumns(Map<Integer, List<Integer>> linesWithMatchingGapIndices, Integer numberOfLines) {

    Comparator<Map.Entry<Integer, List<Integer>>> byPercentage = comparePercentages(numberOfLines);

    Map.Entry<Integer, List<Integer>> best = null;
    for (Map.Entry<Integer, List<Integer>> candidate : linesWithMatchingGapIndices.entrySet()) {
        // Only a strictly better candidate replaces the current one, so on ties the first entry wins.
        if (best == null || byPercentage.compare(best, candidate) < 0) {
            best = candidate;
        }
    }

    if (best == null || !percentageIsAboveThreshold(best, numberOfLines)) {
        return 1;
    }
    Integer columnCount = best.getKey();
    return columnCount == null ? 1 : columnCount;
}
/**
 * Splits the main body text frame into equally wide columns that span the vertical extent of the given rectangles.
 *
 * @param mainBodyTextFrame   frame whose width is divided and whose left edge is the origin of the first column
 * @param rectanglesToMerge   rectangles defining the vertical extent: the first element supplies the upper bound
 *                            (its max Y) and the last element the lower bound (its min Y)
 *                            NOTE(review): this presumes the list is ordered top to bottom - confirm with the caller
 * @param optimalColumnCount  number of columns to create
 * @return one rectangle per column, left to right; the unchanged frame when fewer than two columns are requested
 *         or there are no rectangles
 */
private List<Rectangle2D> buildColumns(Rectangle2D mainBodyTextFrame, List<Rectangle2D> rectanglesToMerge, int optimalColumnCount) {

    // Fewer than two columns means there is nothing to split; this also avoids dividing the width by zero.
    if (optimalColumnCount <= 1 || rectanglesToMerge.isEmpty()) {
        return List.of(mainBodyTextFrame);
    }

    double maxY = rectanglesToMerge.get(0).getMaxY();
    double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY();
    double width = mainBodyTextFrame.getWidth() / optimalColumnCount;
    double height = maxY - minY;

    List<Rectangle2D> columns = new LinkedList<>();
    for (int i = 0; i < optimalColumnCount; i++) {
        // The horizontal origin must come from the frame's X axis; the Y origin was used here by mistake before.
        columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX() + i * width, minY, width, height));
    }
    return columns;
}
/**
 * Creates a comparator that orders candidate entries by the share of lines they match.
 *
 * @param numberOfLines total number of lines the share is computed against
 * @return comparator over (column count, matching line indices) entries, ascending by share
 */
private Comparator<Map.Entry<Integer, List<Integer>>> comparePercentages(Integer numberOfLines) {

    return (left, right) -> Double.compare(calculatePercentage(left.getValue().size(), numberOfLines),
            calculatePercentage(right.getValue().size(), numberOfLines));
}
/**
 * Collects the rectangles of all lines selected by index into one flat list.
 *
 * @param linesWithMatchingGapIndices indices into {@code linesWithGaps}, processed in the given order
 * @param linesWithGaps               rectangles grouped per line
 * @return unmodifiable list containing the rectangles of every selected line, in order
 */
private List<Rectangle2D> getLinesWithMatchingGaps(List<Integer> linesWithMatchingGapIndices, List<List<Rectangle2D>> linesWithGaps) {

    var selectedLines = linesWithMatchingGapIndices.stream().map(linesWithGaps::get);
    return selectedLines.flatMap(List::stream).toList();
}
/**
 * Checks whether the share of lines matched by a candidate exceeds the splittable-line threshold.
 *
 * @param entry         candidate column count mapped to the indices of its matching lines
 * @param numberOfLines total number of lines the share is computed against
 * @return {@code true} if the share is strictly greater than the threshold
 */
private boolean percentageIsAboveThreshold(Map.Entry<Integer, List<Integer>> entry, Integer numberOfLines) {

    int matchingLineCount = entry.getValue().size();
    double percentage = calculatePercentage(matchingLineCount, numberOfLines);
    return percentage > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD;
}
/**
 * Computes the fraction of matching lines among all lines using floating point division.
 *
 * @param numberOfMatchingLines number of lines that match
 * @param numberOfLines         total number of lines; zero yields NaN or an infinity, not an exception
 * @return {@code numberOfMatchingLines / numberOfLines} as a double
 */
private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) {

    double matching = numberOfMatchingLines.doubleValue();
    double total = numberOfLines.doubleValue();
    return matching / total;
}
private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) {
return (pageWidth / numberOfColumns) * columnIndex; return (pageWidth / numberOfColumns) * columnIndex;
} }
/**
 * Tells whether the vertical line at {@code x} passes through none of the given rectangles.
 * A rectangle only counts as hit when {@code x} lies strictly between its left and right edge.
 *
 * @param blocksWithGaps rectangles to test
 * @param x              x coordinate of the vertical line
 * @return {@code true} if no rectangle strictly contains {@code x} horizontally (also for an empty list)
 */
private Boolean noBlocksIntersectX(List<Rectangle2D> blocksWithGaps, double x) {

    for (Rectangle2D block : blocksWithGaps) {
        boolean strictlyInside = block.getMinX() < x && x < block.getMaxX();
        if (strictlyInside) {
            return false;
        }
    }
    return true;
}
} }

View File

@ -4,7 +4,9 @@ import java.awt.geom.Rectangle2D;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -12,17 +14,16 @@ import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class GapDetectionService { public class GapDetectionService {
private static final double X_GAP_FACTOR = 0.3; // multiplied with average text height, determines the minimum distance of gaps in lines private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
private static final double Y_GAP_FACTOR = 1; private static final double Y_GAP_FACTOR = 1;
private static final double NEW_LINE_FACTOR = 0.2; private static final double NEW_LINE_FACTOR = 0.2;
public static Gaps findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { public static GapInformation findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
if (sortedTextPositionSequences.isEmpty()) { if (sortedTextPositionSequences.isEmpty()) {
return new Gaps(); return new GapInformation();
} }
//assertAllTextPositionsHaveSameDir(textPositionSequences);
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
@ -32,30 +33,29 @@ public class GapDetectionService {
var previousTextPosition = sortedTextPositionSequences.get(0); var previousTextPosition = sortedTextPositionSequences.get(0);
Rectangle2D rectangle = toRectangle2D(previousTextPosition); Rectangle2D rectangle = toRectangle2D(previousTextPosition);
yGapContext.addGapFromTopOfMainBody(rectangle);
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle); xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
double yGap = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()); double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition); Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition); Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
if (yGap > avgTextPositionHeight * Y_GAP_FACTOR) { if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
yGapContext.addGap(mainBodyTextFrame.getMinX(),
yGapContext.addGap(mainBodyTextFrame.getMinX(), currentTextPositionBBox.getMaxY(), mainBodyTextFrame.getWidth(), yGap); previousTextPositionBBox.getMaxY(),
mainBodyTextFrame.getWidth(),
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
} }
if (yGap > avgTextPositionHeight * NEW_LINE_FACTOR) { if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox); xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
xGapContext.gapsInCurrentLine = new LinkedList<>(); xGapContext.gapsInCurrentLine = new LinkedList<>();
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox); xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
} else if (xGap > avgTextPositionHeight * X_GAP_FACTOR) {
} else if (xGap <= avgTextPositionHeight * X_GAP_FACTOR) {
addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext); addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
} }
previousTextPosition = currentTextPosition; previousTextPosition = currentTextPosition;
@ -63,15 +63,19 @@ public class GapDetectionService {
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1))); xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
return new Gaps(xGapContext.gapsPerLine, yGapContext.gapsPerLine); return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
} }
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) { private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
return RectangleTransformations.toRectangle2D(textPosition.getRectangle()); return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
} }
/**
 * Returns a copy of the rectangle whose height is non-negative.
 * Despite the name, nothing is mirrored around an axis: the Y origin becomes the smaller of the two
 * vertical edges and the height becomes its absolute value, so a rectangle with a negative height is
 * turned into the equivalent one with a positive height. X and width are copied unchanged.
 *
 * @param rectangle2D rectangle that may carry a negative height
 * @return new rectangle covering the same vertical span with a non-negative height
 */
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {

    double lowerY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
    double positiveHeight = Math.abs(rectangle2D.getHeight());
    return new Rectangle2D.Double(rectangle2D.getX(), lowerY, rectangle2D.getWidth(), positiveHeight);
}
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) { private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
@ -111,19 +115,9 @@ public class GapDetectionService {
} }
public void addGapFromTopOfMainBody(Rectangle2D rectangle) { public void addGap(double x1, double y1, double w, double h) {
gapsInCurrentLine.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX(), gapsInCurrentLine.add(new Rectangle2D.Double(x1, y1, w, h));
rectangle.getMaxY(),
mainBodyTextFrame.getWidth(),
mainBodyTextFrame.getMaxY() - rectangle.getMaxY()));
}
public void addGap(double x, double y, double w, double h) {
gapsInCurrentLine.add(new Rectangle2D.Double(x, y, w, h));
} }
} }

View File

@ -15,13 +15,14 @@ import lombok.RequiredArgsConstructor;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class GapFindingColumnDetectionService implements ColumnDetectionService { public class GapsAcrossLinesService {
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
private static final double DISTANCE_TO_BORDER_THRESHOLD = 1;
public List<Rectangle2D> detectColumns(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) { public List<Rectangle2D> detectXGapsAcrossLines(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
if (gapInformation.getXGaps().size() < 2) { if (gapInformation.getXGaps().size() < 2) {
return List.of(mainBodyTextFrame); return List.of(mainBodyTextFrame);
@ -35,40 +36,41 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
.orElseThrow(); .orElseThrow();
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size()); ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
gapInformation.getXGaps().get(0).stream().map(Column::new).forEach(columnFactory::addToQueue); gapInformation.getXGaps().get(0).stream().map(GapAcrossLines::new).forEach(columnFactory::addToQueue);
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps(); List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
for (var gaps : xGaps.subList(1, xGaps.size())) { for (var gaps : xGaps.subList(1, xGaps.size())) {
while (columnFactory.hasColumnsToProcess()) { while (columnFactory.hasGapsToProcess()) {
Column column = columnFactory.getNext(); GapAcrossLines gapAcrossLines = columnFactory.getNext();
rememberColumnIfValid(columnFactory, column); rememberColumnIfValid(columnFactory, gapAcrossLines);
elongateColumnsAndFilterForWidth(column, gaps, columnFactory).forEach(columnFactory::setToStillInProgress); elongateColumnsAndFilterForWidth(gapAcrossLines, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
} }
columnFactory.addStillInProgressToQueue(); columnFactory.addStillInProgressToQueue();
columnFactory.addGapsToQueue(gaps); columnFactory.addGapsToQueue(gaps);
} }
return columnFactory.outputColumns.stream() return columnFactory.outputGaps.stream()
.filter(column -> columnFactory.outputColumns.stream().filter(column::intersectsX).noneMatch(column1 -> column1.lineCount > column.lineCount)) .filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
.filter(column -> ) .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
.map(Column::getRectangle2D) .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
.map(GapAcrossLines::getRectangle2D)
.toList(); .toList();
} }
private static void rememberColumnIfValid(ColumnFactory columnFactory, Column column) { private static void rememberColumnIfValid(ColumnFactory columnFactory, GapAcrossLines gapAcrossLines) {
if (column.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) { if (gapAcrossLines.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
columnFactory.outputColumns.add(column); columnFactory.outputGaps.add(gapAcrossLines);
} }
} }
private static Stream<Column> elongateColumnsAndFilterForWidth(Column column, List<Rectangle2D> gaps, ColumnFactory columnFactory) { private static Stream<GapAcrossLines> elongateColumnsAndFilterForWidth(GapAcrossLines gapAcrossLines, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
return gaps.stream()// return gaps.stream()//
.filter(gap -> column.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)// .filter(gap -> gapAcrossLines.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
.map(column::addNewLineAndShrink); .map(gapAcrossLines::addNewLineAndShrink);
} }
@ -85,13 +87,13 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
@Getter @Getter
@AllArgsConstructor @AllArgsConstructor
private class Column { private class GapAcrossLines {
Rectangle2D rectangle2D; Rectangle2D rectangle2D;
int lineCount = 1; int lineCount = 1;
public Column(Rectangle2D rectangle2D) { public GapAcrossLines(Rectangle2D rectangle2D) {
this.rectangle2D = correctRectangle(rectangle2D); this.rectangle2D = correctRectangle(rectangle2D);
} }
@ -103,9 +105,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
} }
public boolean intersectsX(Column column) { public boolean intersectsX(GapAcrossLines gapAcrossLines) {
return this.intersectsX(column.getRectangle2D()); return this.intersectsX(gapAcrossLines.getRectangle2D());
} }
@ -120,7 +122,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
} }
public Column addNewLineAndShrink(Rectangle2D rectangle2D) { public GapAcrossLines addNewLineAndShrink(Rectangle2D rectangle2D) {
var correctedRectangle = correctRectangle(rectangle2D); var correctedRectangle = correctRectangle(rectangle2D);
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX()); double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
@ -129,7 +131,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
double max_y = this.rectangle2D.getMaxY(); double max_y = this.rectangle2D.getMaxY();
double width = max_x - min_x; double width = max_x - min_x;
double height = max_y - min_y; double height = max_y - min_y;
return new Column(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1); return new GapAcrossLines(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
} }
} }
@ -140,9 +142,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
final double avgHeight; final double avgHeight;
final int lineCount; final int lineCount;
List<Column> outputColumns = new LinkedList<>(); List<GapAcrossLines> outputGaps = new LinkedList<>();
Queue<Column> columnQueue = new LinkedList<>(); Queue<GapAcrossLines> gapsQueue = new LinkedList<>();
List<Column> columnsToQueue = new LinkedList<>(); List<GapAcrossLines> gapsToQueue = new LinkedList<>();
public static ColumnFactory init(double avgHeight, int lineCount) { public static ColumnFactory init(double avgHeight, int lineCount) {
@ -151,40 +153,40 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
} }
public Column getNext() { public GapAcrossLines getNext() {
return columnQueue.remove(); return gapsQueue.remove();
} }
public void addToQueue(Column column) { public void addToQueue(GapAcrossLines gapAcrossLines) {
columnQueue.add(column); gapsQueue.add(gapAcrossLines);
} }
public void addToQueue(Rectangle2D gap) { public void addToQueue(Rectangle2D gap) {
columnQueue.add(new Column(gap)); gapsQueue.add(new GapAcrossLines(gap));
} }
private boolean hasColumnsToProcess() { private boolean hasGapsToProcess() {
return columnQueue.peek() != null; return gapsQueue.peek() != null;
} }
public void setToStillInProgress(Column column) { public void setToStillInProgress(GapAcrossLines gapAcrossLines) {
columnsToQueue.add(column); gapsToQueue.add(gapAcrossLines);
} }
private void addStillInProgressToQueue() { private void addStillInProgressToQueue() {
for (int i = columnsToQueue.size() - 1; i >= 0; i--) { for (int i = gapsToQueue.size() - 1; i >= 0; i--) {
columnQueue.add(columnsToQueue.remove(i)); gapsQueue.add(gapsToQueue.remove(i));
} }
} }

View File

@ -1,5 +1,63 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass
public class InvisibleTableDetectionService { public class InvisibleTableDetectionService {
/**
 * Builds a grid of cells for a table without visible borders.
 * Vertical boundaries are the horizontal centers of the gaps found across lines; horizontal boundaries are
 * the Y origins of the detected line bounding boxes. The table bounding box closes the grid on all four sides.
 *
 * @param textPositionSequences text of the table; passed on unchanged to line and gap detection
 *                              NOTE(review): those services name their parameter "sorted" - confirm callers sort first
 * @param tableBBox             bounding box of the table
 * @return the cells grouped by column: one inner list per column (number of gaps + 1), each holding
 *         one cell per row (number of lines + 1)
 */
public List<List<Rectangle2D>> detectTable(List<TextPositionSequence> textPositionSequences, Rectangle2D tableBBox) {

    LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences);
    GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox);
    List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox);

    List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList();
    List<Rectangle2D> lineBBoxes = lineInformation.getLineBBox();
    int colCount = columnXCoords.size();
    int rowCount = lineBBoxes.size();

    List<List<Rectangle2D>> cells = new LinkedList<>();
    for (int col = 0; col <= colCount; col++) {
        // The outermost columns are closed by the table bounding box, inner ones by the gap centers.
        double x1 = col == 0 ? tableBBox.getX() : columnXCoords.get(col - 1);
        double x2 = col == colCount ? tableBBox.getMaxX() : columnXCoords.get(col);

        // Create the list per column and add it once it is filled, so no empty trailing list is returned.
        List<Rectangle2D> cellsInColumn = new LinkedList<>();
        for (int row = 0; row <= rowCount; row++) {
            double y2 = row == 0 ? tableBBox.getMaxY() : lineBBoxes.get(row - 1).getY();
            double y1 = row == rowCount ? tableBBox.getY() : lineBBoxes.get(row).getY();
            cellsInColumn.add(new Rectangle2D.Double(x1, y1, x2 - x1, y2 - y1));
        }
        cells.add(cellsInColumn);
    }
    return cells;
}
} }

View File

@ -1,14 +1,16 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
@ -17,38 +19,50 @@ public class LineDetectionService {
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
public static List<List<Rectangle2D>> findTextBlockInLines(List<TextPositionSequence> textPositionSequences) { public LineInformation calculateLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
if (textPositionSequences.isEmpty()) { if (sortedTextPositionSequences.isEmpty()) {
return Collections.emptyList(); return LineFactory.init().build();
} }
final double avgTextPositionHeight = getAvgTextPositionHeight(textPositionSequences); return buildLineInformation(sortedTextPositionSequences);
}
TextBlockContext context = TextBlockContext.init();
List<TextPositionSequence> sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList(); public List<List<Rectangle2D>> findLinesWithGaps(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
var previousTextPosition = sortedTextPositionSequence.get(0); return calculateLineInformation(sortedTextPositionSequences).getBBoxWithGapsByLines();
context.textPositionsToMerge.add(previousTextPosition); }
for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) {
/**
 * Groups the given text position sequences by the line they were assigned to.
 *
 * @param sortedTextPositionSequences sequences handed to the line detection
 * @param mainBodyTextFrame           not used by this method
 * @return the sequences of each detected line, one inner list per line
 */
public List<List<TextPositionSequence>> orderByLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {

    LineInformation lineInformation = calculateLineInformation(sortedTextPositionSequences);
    return lineInformation.getSequencesByLines();
}
private static LineInformation buildLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
LineFactory lineFactory = LineFactory.init();
var previousTextPosition = sortedTextPositionSequences.get(0);
lineFactory.addToCurrentLine(previousTextPosition);
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) { if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
addBlockToLine(context); lineFactory.startNewLine();
startNewLine(currentTextPosition, context);
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) { } else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
addBlockToLine(context); lineFactory.startNewBlock();
startNewBlock(currentTextPosition, context);
} else {
context.textPositionsToMerge.add(currentTextPosition);
} }
lineFactory.addToCurrentLine(currentTextPosition);
previousTextPosition = currentTextPosition; previousTextPosition = currentTextPosition;
} }
addBlockToLine(context); lineFactory.addFinalLine();
return context.textBlocksInLines; return lineFactory.build();
} }
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) { private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
@ -73,24 +87,94 @@ public class LineDetectionService {
} }
private static void startNewBlock(TextPositionSequence currentTextPosition, TextBlockContext context) { @Getter
@AllArgsConstructor
private class LineFactory {
context.textPositionsToMerge = new LinkedList<>(); List<Rectangle2D> lineBBox;
context.textPositionsToMerge.add(currentTextPosition);
List<List<Rectangle2D>> bBoxWithGapsByLines;
List<Rectangle2D> bBoxWithGapsInCurrentLine;
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine;
List<TextPositionSequence> currentSequencesWithoutGaps;
List<List<TextPositionSequence>> sequencesByLines;
List<TextPositionSequence> sequencesInCurrentLine;
List<List<Rectangle2D>> xGaps;
List<List<Rectangle2D>> yGaps;
public static LineFactory init() {
List<Rectangle2D> lineBBox = new LinkedList<>();
List<List<Rectangle2D>> bBoxWithGapsByLines = new LinkedList<>();
List<Rectangle2D> bBoxWithGapsInCurrentLine = new LinkedList<>();
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines = new LinkedList<>();
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine = new LinkedList<>();
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
List<TextPositionSequence> currentSequencesWithoutGaps = new LinkedList<>();
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
List<List<TextPositionSequence>> sequencesByLines = new LinkedList<>();
List<TextPositionSequence> sequencesInCurrentLine = new LinkedList<>();
sequencesByLines.add(sequencesInCurrentLine);
return new LineFactory(lineBBox,
bBoxWithGapsByLines,
bBoxWithGapsInCurrentLine,
sequencesWithGapsByLines,
sequencesWithGapsInCurrentLine,
currentSequencesWithoutGaps,
sequencesByLines,
sequencesInCurrentLine,
null,
null);
} }
private static void addBlockToLine(TextBlockContext context) { public void addGaps(GapInformation gapInformation) {
context.blocksInCurrentLine.add(textPositionBBox(context.textPositionsToMerge)); this.xGaps = gapInformation.getXGaps();
this.yGaps = gapInformation.getYGaps();
} }
private static void startNewLine(TextPositionSequence current, TextBlockContext context) { public LineInformation build() {
context.blocksInCurrentLine = new LinkedList<>(); return new LineInformation(lineBBox, sequencesByLines, bBoxWithGapsByLines, sequencesWithGapsByLines);
startNewBlock(current, context); }
context.textBlocksInLines.add(context.blocksInCurrentLine);
/**
 * Finishes the current gap-free run of sequences and opens an empty one in the same line.
 * The bounding box of the sequences collected so far is appended to the current line's
 * boxes first; afterwards a fresh list becomes the current run and is registered in the
 * current line.
 */
public void startNewBlock() {

    Rectangle2D finishedBlockBBox = textPositionBBox(currentSequencesWithoutGaps);
    bBoxWithGapsInCurrentLine.add(finishedBlockBBox);

    List<TextPositionSequence> nextBlock = new LinkedList<>();
    currentSequencesWithoutGaps = nextBlock;
    sequencesWithGapsInCurrentLine.add(nextBlock);
}
public void startNewLine() {
lineBBox.add(textPositionBBox(sequencesInCurrentLine));
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
bBoxWithGapsInCurrentLine = new LinkedList<>();
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
sequencesWithGapsInCurrentLine = new LinkedList<>();
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
currentSequencesWithoutGaps = new LinkedList<>();
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
sequencesInCurrentLine = new LinkedList<>();
sequencesByLines.add(sequencesInCurrentLine);
} }
@ -100,23 +184,19 @@ public class LineDetectionService {
} }
@AllArgsConstructor public void addToCurrentLine(TextPositionSequence current) {
private class TextBlockContext {
List<List<Rectangle2D>> textBlocksInLines; sequencesInCurrentLine.add(current);
List<Rectangle2D> blocksInCurrentLine; currentSequencesWithoutGaps.add(current);
List<TextPositionSequence> textPositionsToMerge; }
public static TextBlockContext init() { public void addFinalLine() {
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>(); lineBBox.add(textPositionBBox(sequencesInCurrentLine));
List<Rectangle2D> initialBlocksInLine = new LinkedList<>(); bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
initialLinesWithGaps.add(initialBlocksInLine);
return new TextBlockContext(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>());
} }
} }
} }

View File

@ -1,5 +1,25 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.experimental.UtilityClass;
@UtilityClass
public class MainBodyTextFrameExtractionService { public class MainBodyTextFrameExtractionService {
private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;
/**
 * Derives the main body text frame from the line bounding boxes.
 * All line boxes are collected into a single bounding box, which is then handed to
 * {@code RectangleTransformations.pad} together with padding amounts of
 * {@code width * TEXT_FRAME_PAD_WIDTH} and {@code height * TEXT_FRAME_PAD_HEIGHT}.
 *
 * @param lineInformation source of the line bounding boxes
 * @return the padded bounding box of all lines
 */
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {

    Rectangle2D unionOfLines = lineInformation.getLineBBox().stream().collect(RectangleTransformations.collectBBox());
    double horizontalPadding = unionOfLines.getWidth() * TEXT_FRAME_PAD_WIDTH;
    double verticalPadding = unionOfLines.getHeight() * TEXT_FRAME_PAD_HEIGHT;
    return RectangleTransformations.pad(unionOfLines, horizontalPadding, verticalPadding);
}
} }

View File

@ -1,2 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.services;public class PageInformationService { package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import lombok.experimental.UtilityClass;
@UtilityClass
public class PageInformationService {
/**
 * Assembles the {@link PageInformation} for one page.
 * Line information is calculated first, the main body text frame is derived from it, and
 * gap detection then runs on the same sequences using that frame.
 *
 * @param pageContents the page whose sorted text position sequences are analysed
 * @return the page contents bundled with line information, main body text frame and gap information
 */
public PageInformation build(PageContents pageContents) {

    var sortedSequences = pageContents.getSortedTextPositionSequences();

    LineInformation lines = LineDetectionService.calculateLineInformation(sortedSequences);
    Rectangle2D textFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lines);
    GapInformation gaps = GapDetectionService.findGapsInLines(sortedSequences, textFrame);

    return new PageInformation(pageContents, lines, textFrame, gaps);
}
} }

View File

@ -13,20 +13,19 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.TextPositionsWithPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class TextPositionSequenceExtractionService { public class TextPositionSequenceSorter {
public List<TextPositionsWithPage> getSortedTextPositionsWithPages(String filename) throws IOException { public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
List<TextPositionsWithPage> textPositionSequencesPerPage = new LinkedList<>(); List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
PDDocument pdDocument = Loader.loadPDF(inputStream); PDDocument pdDocument = Loader.loadPDF(inputStream);
@ -36,20 +35,21 @@ public class TextPositionSequenceExtractionService {
PDFLinesTextStripper stripper = new PDFLinesTextStripper(); PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1); PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber); stripper.setPageNumber(pageNumber);
stripper.setSortByPosition(true);
stripper.setStartPage(pageNumber); stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber); stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage); stripper.setPdpage(pdPage);
stripper.getText(pdDocument); stripper.getText(pdDocument);
// var sortedTextPositionSequences = stripper.getTextPositionSequences();
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
.stream() .stream()
.sorted(new TextPositionSequenceComparator())
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation()); var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
textPositionSequencesPerPage.add(new TextPositionsWithPage(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox()))); textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
} }
pdDocument.close(); pdDocument.close();

View File

@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.Pd
import lombok.SneakyThrows; import lombok.SneakyThrows;
class ColumnDetectionServiceTest { class GapAcrossLinesDetectionServiceTest {
@Test @Test
@SneakyThrows @SneakyThrows

View File

@ -1,23 +1,64 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.geom.Rectangle2D;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows; import lombok.SneakyThrows;
class InvisibleTableDetectionServiceTest { class InvisibleTableDetectionServiceTest {
@Test @Test
@SneakyThrows @SneakyThrows
public void detectInvisibleTableTest() { public void detectInvisibleTableTest() {
String fileName = "files/test-two-pages_ocred-2.pdf"; String fileName = "files/new/test-two-pages_ocred-2.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
List<PageContents> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); int pageNumber = 1;
Rectangle2D tableBBox = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
.subList(45, 152)
.stream()
.map(TextPositionSequence::getRectangle)
.map(RectangleTransformations::toRectangle2D)
.map(this::mirrorY)
.collect(RectangleTransformations.collectBBox());
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
.stream()
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
.toList();
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
} }
/**
 * Returns the rectangle unchanged when its height is non-negative. Otherwise builds an
 * equivalent rectangle with positive height: y is shifted by the (negative) height and
 * the height's sign is flipped, while x and width are kept.
 *
 * @param rectangle2D the rectangle to normalise
 * @return the same instance, or a new {@code Rectangle2D.Double} with flipped height
 */
private Rectangle2D mirrorY(Rectangle2D rectangle2D) {

    double height = rectangle2D.getHeight();
    return height >= 0
            ? rectangle2D
            : new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + height, rectangle2D.getWidth(), -height);
}
} }

View File

@ -1,7 +1,26 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.server.services;
import static org.junit.jupiter.api.Assertions.*; import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import lombok.SneakyThrows;
class MainBodyTextFrameExtractionServiceTest { class MainBodyTextFrameExtractionServiceTest {
/**
 * Placeholder test: runs text position extraction for the sample PDF and asserts nothing.
 * It can only fail if the extraction itself throws.
 */
@Test
@SneakyThrows
public void testMainBodyDetection() {
String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
// Output path under /tmp derived from the PDF's file name; not used further in this method.
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
// The extraction result is not inspected here.
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
}
} }

View File

@ -1,49 +1,62 @@
package com.knecon.fforesight.service.layoutparser.server.services; package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.Collection;
import java.util.LinkedList;
import java.util.List; import java.util.List;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.MainBodyTextFrameExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows; import lombok.SneakyThrows;
class GapDetectionServiceTest { class PageInformationServiceTest {
@Test @Test
@Disabled @Disabled
@SneakyThrows @SneakyThrows
public void testGapDetection() { public void testGapDetection() {
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String filename = "files/new/test-two-pages_ocred-2.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction"); System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename); List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
List<GapInformation> gapInformationInLinesPerPage = new LinkedList<>();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start gap detection"); System.out.println("start gap detection");
start = System.currentTimeMillis(); start = System.currentTimeMillis();
for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
// List<List<TextPositionSequence>> lines = LineDetectionService.orderByLine(textPositionsWithPage.getSortedTextPositionSequences());
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(pageContents);
gapInformationInLinesPerPage.add(GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame));
}
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start); System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start draw rectangles"); System.out.println("start draw rectangles");
start = System.currentTimeMillis(); start = System.currentTimeMillis();
PdfDraw.drawRectanglesAndLinesPerPage(filename, PdfDraw.drawRectanglesAndLinesPerPage(filename,
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(), pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), tmpFileName); pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(),
tmpFileName);
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
}
@Test
@Disabled
@SneakyThrows
public void testLineDetection() {
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start gap detection");
start = System.currentTimeMillis();
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start draw rectangles");
start = System.currentTimeMillis();
PdfDraw.drawRectanglesPerPageNumberedByLine(filename,
pageInformations.stream().map(PageInformation::getLineInformation).map(gaps -> gaps.getBBoxWithGapsByLines().stream().toList()).toList(),
tmpFileName);
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
} }

View File

@ -7,7 +7,7 @@ import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;