TAAS-41: add (inactive) experimental services

This commit is contained in:
Kilian Schuettler 2023-07-24 15:58:06 +02:00
parent 241a32cb4f
commit 526b1c5ad3
17 changed files with 503 additions and 281 deletions

View File

@ -12,12 +12,16 @@ import lombok.experimental.FieldDefaults;
@Getter
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class Gaps {
List<List<Rectangle2D>> xGaps ;
List<List<Rectangle2D>> yGaps ;
public class GapInformation {
List<List<Rectangle2D>> xGaps;
List<List<Rectangle2D>> yGaps;
public GapInformation() {
public Gaps() {
xGaps = new LinkedList<>();
yGaps = new LinkedList<>();
}
}

View File

@ -1,5 +1,23 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Result holder for the line structure detected on a page.
 * Instances are created through the Lombok-generated all-args constructor, whose parameter
 * order follows the field order below; the lists are stored as passed in, without copying.
 */
@Getter
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LineInformation {
// One bounding box per detected line.
List<Rectangle2D> lineBBox;
// The text position sequences of each line, grouped per line.
List<List<TextPositionSequence>> sequencesByLines;
// Per line: the bounding boxes of the text blocks that are separated by horizontal gaps.
List<List<Rectangle2D>> bBoxWithGapsByLines;
// Per line: the text blocks separated by horizontal gaps, each block as its list of sequences.
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
}

View File

@ -12,9 +12,10 @@ import lombok.Getter;
@Getter
@Builder
@AllArgsConstructor
public class PageInformation {
public class PageContents {
List<TextPositionSequence> sortedTextPositionSequences;
Rectangle2D cropBox;
Rectangle2D mediaBox;
}

View File

@ -1,5 +1,17 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * Bundles everything that is derived for a single page: the raw page contents plus the
 * line, main-body-frame and gap data computed from them.
 * The Lombok-generated all-args constructor takes the fields in declaration order.
 */
@Getter
@AllArgsConstructor
public class PageInformation {
// The page contents the other fields were derived from.
PageContents pageContents;
// Line structure of the page.
LineInformation lineInformation;
// Frame around the main body text of the page.
Rectangle2D mainBodyTextFrame;
// Horizontal and vertical gaps found on the page.
GapInformation gapInformation;
}

View File

@ -1,149 +1,87 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import lombok.experimental.UtilityClass;
@UtilityClass
public class DividingColumnDetectionService {
private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6;
private static final int MAX_NUMBER_OF_COLUMNS = 4;
private static final int MAX_NUMBER_OF_COLUMNS = 200;
private static final int LINE_COUNT_THRESHOLD = 5;
public List<Rectangle2D> detectColumns(List<TextPositionSequence> textPositionSequences, Rectangle2D mainBodyTextFrame) {
public List<Rectangle2D> detectColumns(PageContents pageContents) {
if (textPositionSequences.size() < 2) {
return List.of(mainBodyTextFrame);
if (pageContents.getSortedTextPositionSequences().size() < 2) {
return List.of(pageContents.getCropBox());
}
List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findTextBlockInLines(textPositionSequences);
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox());
Map<Integer, List<Integer>> linesWithMatchingGapIndices = new HashMap<>();
for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) {
linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns));
}
int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size());
if (optimalNumberOfColumns == 1) {
return List.of(mainBodyTextFrame);
}
return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns);
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
}
private static List<Integer> findConsecutiveLinesWithMatchingGaps(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
public List<Rectangle2D> detectColumnsFromLines(List<List<Rectangle2D>> gaps, Rectangle2D mainBodyTextFrame) {
List<Boolean> booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns);
return findConsecutiveTrueIndicesWithMaxLengthRun(booleans);
}
private List<Boolean> lineHasMatchingGap(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
return linesWithGaps.stream()
.map(blocksWithGaps -> IntStream.range(1, numberOfColumns)
.allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex))))
.toList();
}
private List<Integer> findConsecutiveTrueIndicesWithMaxLengthRun(List<Boolean> booleans) {
List<Integer> maxConsecutiveTrueIndices = new LinkedList<>();
List<Integer> currentConsecutiveTrueIndices = new LinkedList<>();
for (int i = 0; i < booleans.size(); i++) {
if (!booleans.get(i)) {
if (currentConsecutiveTrueIndices.isEmpty()) {
continue;
List<List<Line2D>> allColumnParts = new ArrayList<>(MAX_NUMBER_OF_COLUMNS);
for (int columnIndex = 1; columnIndex < MAX_NUMBER_OF_COLUMNS; columnIndex++) {
double x = calculateGapLocation(columnIndex, MAX_NUMBER_OF_COLUMNS, mainBodyTextFrame.getWidth()) + mainBodyTextFrame.getMinX();
double currentMinY = mainBodyTextFrame.getMaxY();
double currentMaxY = 0;
int currentLineCount = 0;
List<Line2D> columnParts = new LinkedList<>();
allColumnParts.add(columnParts);
for (int lineNumber = 0; lineNumber < gaps.size(); lineNumber++) {
List<Rectangle2D> textBlocksInLine = gaps.get(lineNumber);
if (anyBlockIntersectX(textBlocksInLine, x)) {
if (lineNumber == gaps.size() - 1) {
currentMaxY = mainBodyTextFrame.getMinY();
} else {
currentMaxY = gaps.get(lineNumber + 1).get(0).getMinY();
}
currentLineCount++;
} else {
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
}
currentMinY = gaps.get(lineNumber).get(0).getMaxY();
currentMaxY = currentMinY;
currentLineCount = 0;
}
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
maxConsecutiveTrueIndices = currentConsecutiveTrueIndices;
}
currentConsecutiveTrueIndices = new LinkedList<>();
continue;
}
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
}
currentConsecutiveTrueIndices.add(i);
}
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
return currentConsecutiveTrueIndices;
}
return maxConsecutiveTrueIndices;
return Stream.concat(Stream.of(mainBodyTextFrame),
allColumnParts.stream()
.flatMap(columnParts -> columnParts.stream().map(line -> new Rectangle2D.Double(line.getX2(), line.getY2(), 1, Math.abs(line.getY2() - line.getY1()))))
.map(r -> (Rectangle2D) r)).toList();
}
private static int findOptimalNumberOfColumns(Map<Integer, List<Integer>> linesWithMatchingGapIndices, Integer numberOfLines) {
private static boolean anyBlockIntersectX(List<Rectangle2D> textBlocksInLine, double x) {
return linesWithMatchingGapIndices.entrySet()
.stream()
.max(comparePercentages(numberOfLines))
.filter(entry -> percentageIsAboveThreshold(entry, numberOfLines))
.map(Map.Entry::getKey)
.orElse(1);
return textBlocksInLine.stream().anyMatch(gap -> gap.getMinX() < x && x < gap.getMaxX());
}
private List<Rectangle2D> buildColumns(Rectangle2D mainBodyTextFrame, List<Rectangle2D> rectanglesToMerge, int optimalColumnCount) {
if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) {
return List.of(mainBodyTextFrame);
}
double maxY = rectanglesToMerge.get(0).getMaxY();
double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY();
List<Rectangle2D> columns = new LinkedList<>();
double width = mainBodyTextFrame.getWidth() / optimalColumnCount;
double height = maxY - minY;
for (int i = 0; i < optimalColumnCount; i++) {
columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinY() + i * width, minY, width, height));
}
return columns;
}
private Comparator<Map.Entry<Integer, List<Integer>>> comparePercentages(Integer numberOfLines) {
return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines));
}
private List<Rectangle2D> getLinesWithMatchingGaps(List<Integer> linesWithMatchingGapIndices, List<List<Rectangle2D>> linesWithGaps) {
return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList();
}
private boolean percentageIsAboveThreshold(Map.Entry<Integer, List<Integer>> entry, Integer numberOfLines) {
return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD;
}
private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) {
return ((double) numberOfMatchingLines) / ((double) numberOfLines);
}
private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) {
private double calculateGapLocation(int columnIndex, int numberOfColumns, double pageWidth) {
return (pageWidth / numberOfColumns) * columnIndex;
}
private Boolean noBlocksIntersectX(List<Rectangle2D> blocksWithGaps, double x) {
return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x);
}
}

View File

@ -4,7 +4,9 @@ import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass;
@ -12,17 +14,16 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class GapDetectionService {
private static final double X_GAP_FACTOR = 0.3; // multiplied with average text height, determines the minimum distance of gaps in lines
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
private static final double Y_GAP_FACTOR = 1;
private static final double NEW_LINE_FACTOR = 0.2;
public static Gaps findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
public static GapInformation findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
if (sortedTextPositionSequences.isEmpty()) {
return new Gaps();
return new GapInformation();
}
//assertAllTextPositionsHaveSameDir(textPositionSequences);
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
@ -32,30 +33,29 @@ public class GapDetectionService {
var previousTextPosition = sortedTextPositionSequences.get(0);
Rectangle2D rectangle = toRectangle2D(previousTextPosition);
yGapContext.addGapFromTopOfMainBody(rectangle);
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
double yGap = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
if (yGap > avgTextPositionHeight * Y_GAP_FACTOR) {
yGapContext.addGap(mainBodyTextFrame.getMinX(), currentTextPositionBBox.getMaxY(), mainBodyTextFrame.getWidth(), yGap);
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
yGapContext.addGap(mainBodyTextFrame.getMinX(),
previousTextPositionBBox.getMaxY(),
mainBodyTextFrame.getWidth(),
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
}
if (yGap > avgTextPositionHeight * NEW_LINE_FACTOR) {
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
xGapContext.gapsInCurrentLine = new LinkedList<>();
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
} else if (xGap <= avgTextPositionHeight * X_GAP_FACTOR) {
} else if (xGap > avgTextPositionHeight * X_GAP_FACTOR) {
addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
}
previousTextPosition = currentTextPosition;
@ -63,15 +63,19 @@ public class GapDetectionService {
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
return new Gaps(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
}
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
return RectangleTransformations.toRectangle2D(textPosition.getRectangle());
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
}
/**
 * Returns a copy of the rectangle whose height is non-negative: the y origin becomes the
 * smaller of the two vertical edges and the height its absolute value. X and width are
 * carried over unchanged.
 */
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {

    double lowerEdge = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
    double positiveHeight = Math.abs(rectangle2D.getHeight());
    return new Rectangle2D.Double(rectangle2D.getX(), lowerEdge, rectangle2D.getWidth(), positiveHeight);
}
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
@ -111,19 +115,9 @@ public class GapDetectionService {
}
public void addGapFromTopOfMainBody(Rectangle2D rectangle) {
public void addGap(double x1, double y1, double w, double h) {
gapsInCurrentLine.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
rectangle.getMaxY(),
mainBodyTextFrame.getWidth(),
mainBodyTextFrame.getMaxY() - rectangle.getMaxY()));
}
public void addGap(double x, double y, double w, double h) {
gapsInCurrentLine.add(new Rectangle2D.Double(x, y, w, h));
gapsInCurrentLine.add(new Rectangle2D.Double(x1, y1, w, h));
}
}

View File

@ -15,13 +15,14 @@ import lombok.RequiredArgsConstructor;
import lombok.experimental.UtilityClass;
@UtilityClass
public class GapFindingColumnDetectionService implements ColumnDetectionService {
public class GapsAcrossLinesService {
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
private static final double DISTANCE_TO_BORDER_THRESHOLD = 1;
public List<Rectangle2D> detectColumns(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
public List<Rectangle2D> detectXGapsAcrossLines(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
if (gapInformation.getXGaps().size() < 2) {
return List.of(mainBodyTextFrame);
@ -35,40 +36,41 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
.orElseThrow();
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
gapInformation.getXGaps().get(0).stream().map(Column::new).forEach(columnFactory::addToQueue);
gapInformation.getXGaps().get(0).stream().map(GapAcrossLines::new).forEach(columnFactory::addToQueue);
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
for (var gaps : xGaps.subList(1, xGaps.size())) {
while (columnFactory.hasColumnsToProcess()) {
Column column = columnFactory.getNext();
rememberColumnIfValid(columnFactory, column);
elongateColumnsAndFilterForWidth(column, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
while (columnFactory.hasGapsToProcess()) {
GapAcrossLines gapAcrossLines = columnFactory.getNext();
rememberColumnIfValid(columnFactory, gapAcrossLines);
elongateColumnsAndFilterForWidth(gapAcrossLines, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
}
columnFactory.addStillInProgressToQueue();
columnFactory.addGapsToQueue(gaps);
}
return columnFactory.outputColumns.stream()
.filter(column -> columnFactory.outputColumns.stream().filter(column::intersectsX).noneMatch(column1 -> column1.lineCount > column.lineCount))
.filter(column -> )
.map(Column::getRectangle2D)
return columnFactory.outputGaps.stream()
.filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
.map(GapAcrossLines::getRectangle2D)
.toList();
}
private static void rememberColumnIfValid(ColumnFactory columnFactory, Column column) {
private static void rememberColumnIfValid(ColumnFactory columnFactory, GapAcrossLines gapAcrossLines) {
if (column.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
columnFactory.outputColumns.add(column);
if (gapAcrossLines.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
columnFactory.outputGaps.add(gapAcrossLines);
}
}
private static Stream<Column> elongateColumnsAndFilterForWidth(Column column, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
private static Stream<GapAcrossLines> elongateColumnsAndFilterForWidth(GapAcrossLines gapAcrossLines, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
return gaps.stream()//
.filter(gap -> column.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
.map(column::addNewLineAndShrink);
.filter(gap -> gapAcrossLines.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
.map(gapAcrossLines::addNewLineAndShrink);
}
@ -85,13 +87,13 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
@Getter
@AllArgsConstructor
private class Column {
private class GapAcrossLines {
Rectangle2D rectangle2D;
int lineCount = 1;
public Column(Rectangle2D rectangle2D) {
public GapAcrossLines(Rectangle2D rectangle2D) {
this.rectangle2D = correctRectangle(rectangle2D);
}
@ -103,9 +105,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
}
public boolean intersectsX(Column column) {
public boolean intersectsX(GapAcrossLines gapAcrossLines) {
return this.intersectsX(column.getRectangle2D());
return this.intersectsX(gapAcrossLines.getRectangle2D());
}
@ -120,7 +122,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
}
public Column addNewLineAndShrink(Rectangle2D rectangle2D) {
public GapAcrossLines addNewLineAndShrink(Rectangle2D rectangle2D) {
var correctedRectangle = correctRectangle(rectangle2D);
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
@ -129,7 +131,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
double max_y = this.rectangle2D.getMaxY();
double width = max_x - min_x;
double height = max_y - min_y;
return new Column(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
return new GapAcrossLines(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
}
}
@ -140,9 +142,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
final double avgHeight;
final int lineCount;
List<Column> outputColumns = new LinkedList<>();
Queue<Column> columnQueue = new LinkedList<>();
List<Column> columnsToQueue = new LinkedList<>();
List<GapAcrossLines> outputGaps = new LinkedList<>();
Queue<GapAcrossLines> gapsQueue = new LinkedList<>();
List<GapAcrossLines> gapsToQueue = new LinkedList<>();
public static ColumnFactory init(double avgHeight, int lineCount) {
@ -151,40 +153,40 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
}
public Column getNext() {
public GapAcrossLines getNext() {
return columnQueue.remove();
return gapsQueue.remove();
}
public void addToQueue(Column column) {
public void addToQueue(GapAcrossLines gapAcrossLines) {
columnQueue.add(column);
gapsQueue.add(gapAcrossLines);
}
public void addToQueue(Rectangle2D gap) {
columnQueue.add(new Column(gap));
gapsQueue.add(new GapAcrossLines(gap));
}
private boolean hasColumnsToProcess() {
private boolean hasGapsToProcess() {
return columnQueue.peek() != null;
return gapsQueue.peek() != null;
}
public void setToStillInProgress(Column column) {
public void setToStillInProgress(GapAcrossLines gapAcrossLines) {
columnsToQueue.add(column);
gapsToQueue.add(gapAcrossLines);
}
private void addStillInProgressToQueue() {
for (int i = columnsToQueue.size() - 1; i >= 0; i--) {
columnQueue.add(columnsToQueue.remove(i));
for (int i = gapsToQueue.size() - 1; i >= 0; i--) {
gapsQueue.add(gapsToQueue.remove(i));
}
}

View File

@ -1,5 +1,63 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
/**
 * Experimental service that derives a grid of cells for a table without visible borders,
 * using the detected text lines as row separators and the horizontal gaps that run across
 * lines as column separators.
 */
@UtilityClass
public class InvisibleTableDetectionService {

    /**
     * Splits {@code tableBBox} into cells.
     * <p>
     * Vertical cell borders are the table's left edge, the center x of every gap returned by
     * {@link GapsAcrossLinesService#detectXGapsAcrossLines}, and the table's right edge.
     * Horizontal cell borders are the table's max y, the y origin of every line bounding box,
     * and the table's y origin.
     * NOTE(review): the height {@code y2 - y1} is only positive if line bounding boxes are
     * ordered with decreasing y - confirm against the sorting of the input sequences.
     *
     * @param textPositionSequences the text of the table, passed on to line and gap detection
     * @param tableBBox             the bounding box of the table
     * @return one list per column (left to right as given by the detected gaps), each holding
     *         {@code lineCount + 1} cells; the outer list has {@code gapCount + 1} entries
     */
    public List<List<Rectangle2D>> detectTable(List<TextPositionSequence> textPositionSequences, Rectangle2D tableBBox) {

        LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences);
        GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox);
        List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox);

        List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList();
        List<Rectangle2D> lineBBoxes = lineInformation.getLineBBox();
        int colCount = columnXCoords.size();
        int rowCount = lineBBoxes.size();

        List<List<Rectangle2D>> cells = new LinkedList<>();
        for (int col = 0; col <= colCount; col++) {
            // The horizontal extent only depends on the column, so compute it once per column.
            double x1 = col == 0 ? tableBBox.getX() : columnXCoords.get(col - 1);
            double x2 = col == colCount ? tableBBox.getMaxX() : columnXCoords.get(col);

            // A list is only created for columns that actually exist; previously a fresh list was
            // appended after every column, leaving a spurious empty list at the end of the result.
            List<Rectangle2D> cellsInColumn = new LinkedList<>();
            for (int row = 0; row <= rowCount; row++) {
                double y2 = row == 0 ? tableBBox.getMaxY() : lineBBoxes.get(row - 1).getY();
                double y1 = row == rowCount ? tableBBox.getY() : lineBBoxes.get(row).getY();
                cellsInColumn.add(new Rectangle2D.Double(x1, y1, x2 - x1, y2 - y1));
            }
            cells.add(cellsInColumn);
        }
        return cells;
    }
}

View File

@ -1,14 +1,16 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.UtilityClass;
@UtilityClass
@ -17,37 +19,49 @@ public class LineDetectionService {
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
public static List<List<Rectangle2D>> findTextBlockInLines(List<TextPositionSequence> textPositionSequences) {
public LineInformation calculateLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
if (textPositionSequences.isEmpty()) {
return Collections.emptyList();
if (sortedTextPositionSequences.isEmpty()) {
return LineFactory.init().build();
}
final double avgTextPositionHeight = getAvgTextPositionHeight(textPositionSequences);
TextBlockContext context = TextBlockContext.init();
List<TextPositionSequence> sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList();
var previousTextPosition = sortedTextPositionSequence.get(0);
context.textPositionsToMerge.add(previousTextPosition);
for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) {
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
addBlockToLine(context);
startNewLine(currentTextPosition, context);
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
addBlockToLine(context);
startNewBlock(currentTextPosition, context);
} else {
context.textPositionsToMerge.add(currentTextPosition);
}
previousTextPosition = currentTextPosition;
}
addBlockToLine(context);
return context.textBlocksInLines;
return buildLineInformation(sortedTextPositionSequences);
}
/**
 * Convenience accessor: runs the line detection and returns, per line, the bounding boxes of
 * the text blocks separated by horizontal gaps.
 * The {@code mainBodyTextFrame} argument is currently not used.
 */
public List<List<Rectangle2D>> findLinesWithGaps(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {

    LineInformation detectedLines = calculateLineInformation(sortedTextPositionSequences);
    return detectedLines.getBBoxWithGapsByLines();
}
/**
 * Convenience accessor: runs the line detection and returns the sequences grouped per line.
 * The {@code mainBodyTextFrame} argument is currently not used.
 */
public List<List<TextPositionSequence>> orderByLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {

    LineInformation detectedLines = calculateLineInformation(sortedTextPositionSequences);
    return detectedLines.getSequencesByLines();
}
/**
 * Walks the sequences once, comparing each one with its predecessor, and groups them into
 * lines and, within a line, into blocks separated by horizontal gaps.
 * The list must not be empty (the first element is read unconditionally); the parameter name
 * suggests the caller passes it already sorted.
 */
private static LineInformation buildLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {

    final double averageHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
    LineFactory factory = LineFactory.init();

    TextPositionSequence previous = sortedTextPositionSequences.get(0);
    factory.addToCurrentLine(previous);

    List<TextPositionSequence> remaining = sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size());
    for (TextPositionSequence current : remaining) {
        boolean lineBreak = isNewLine(current, previous, averageHeight) || isSplitByOrientation(current, previous);
        if (lineBreak) {
            factory.startNewLine();
        } else if (isXGap(current, previous, averageHeight)) {
            factory.startNewBlock();
        }
        // The current sequence always ends up in whatever line/block is open after the checks above.
        factory.addToCurrentLine(current);
        previous = current;
    }

    factory.addFinalLine();
    return factory.build();
}
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
@ -73,50 +87,116 @@ public class LineDetectionService {
}
private static void startNewBlock(TextPositionSequence currentTextPosition, TextBlockContext context) {
context.textPositionsToMerge = new LinkedList<>();
context.textPositionsToMerge.add(currentTextPosition);
}
private static void addBlockToLine(TextBlockContext context) {
context.blocksInCurrentLine.add(textPositionBBox(context.textPositionsToMerge));
}
private static void startNewLine(TextPositionSequence current, TextBlockContext context) {
context.blocksInCurrentLine = new LinkedList<>();
startNewBlock(current, context);
context.textBlocksInLines.add(context.blocksInCurrentLine);
}
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
}
@Getter
@AllArgsConstructor
private class TextBlockContext {
private class LineFactory {
List<List<Rectangle2D>> textBlocksInLines;
List<Rectangle2D> blocksInCurrentLine;
List<TextPositionSequence> textPositionsToMerge;
List<Rectangle2D> lineBBox;
List<List<Rectangle2D>> bBoxWithGapsByLines;
List<Rectangle2D> bBoxWithGapsInCurrentLine;
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine;
List<TextPositionSequence> currentSequencesWithoutGaps;
List<List<TextPositionSequence>> sequencesByLines;
List<TextPositionSequence> sequencesInCurrentLine;
List<List<Rectangle2D>> xGaps;
List<List<Rectangle2D>> yGaps;
public static TextBlockContext init() {
public static LineFactory init() {
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
initialLinesWithGaps.add(initialBlocksInLine);
return new TextBlockContext(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>());
List<Rectangle2D> lineBBox = new LinkedList<>();
List<List<Rectangle2D>> bBoxWithGapsByLines = new LinkedList<>();
List<Rectangle2D> bBoxWithGapsInCurrentLine = new LinkedList<>();
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines = new LinkedList<>();
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine = new LinkedList<>();
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
List<TextPositionSequence> currentSequencesWithoutGaps = new LinkedList<>();
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
List<List<TextPositionSequence>> sequencesByLines = new LinkedList<>();
List<TextPositionSequence> sequencesInCurrentLine = new LinkedList<>();
sequencesByLines.add(sequencesInCurrentLine);
return new LineFactory(lineBBox,
bBoxWithGapsByLines,
bBoxWithGapsInCurrentLine,
sequencesWithGapsByLines,
sequencesWithGapsInCurrentLine,
currentSequencesWithoutGaps,
sequencesByLines,
sequencesInCurrentLine,
null,
null);
}
/**
 * Stores the x and y gap lists of the given gap information on this factory.
 */
public void addGaps(GapInformation gapInformation) {

    this.yGaps = gapInformation.getYGaps();
    this.xGaps = gapInformation.getXGaps();
}
/**
 * Creates the {@link LineInformation} from the lists collected so far. The lists are handed
 * over as they are, not copied.
 */
public LineInformation build() {

    return new LineInformation(this.lineBBox,
            this.sequencesByLines,
            this.bBoxWithGapsByLines,
            this.sequencesWithGapsByLines);
}
/**
 * Closes the current gap-free block of the current line by recording its bounding box, then
 * opens a new, empty block in the same line.
 */
public void startNewBlock() {

    Rectangle2D finishedBlockBBox = textPositionBBox(currentSequencesWithoutGaps);
    bBoxWithGapsInCurrentLine.add(finishedBlockBBox);

    List<TextPositionSequence> nextBlock = new LinkedList<>();
    currentSequencesWithoutGaps = nextBlock;
    sequencesWithGapsInCurrentLine.add(nextBlock);
}
/**
 * Closes the current line: records the bounding box of the whole line and of its last block,
 * then opens a new line with one empty block in every per-line structure.
 */
public void startNewLine() {

    // Record the finished line and its last block before any of the "current" lists is replaced.
    Rectangle2D finishedLineBBox = textPositionBBox(sequencesInCurrentLine);
    lineBBox.add(finishedLineBBox);
    Rectangle2D finishedBlockBBox = textPositionBBox(currentSequencesWithoutGaps);
    bBoxWithGapsInCurrentLine.add(finishedBlockBBox);

    // Fresh line of plain sequences.
    sequencesInCurrentLine = new LinkedList<>();
    sequencesByLines.add(sequencesInCurrentLine);

    // Fresh line of block bounding boxes.
    bBoxWithGapsInCurrentLine = new LinkedList<>();
    bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);

    // Fresh line of blocks, already containing its first (empty) block.
    currentSequencesWithoutGaps = new LinkedList<>();
    sequencesWithGapsInCurrentLine = new LinkedList<>();
    sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
    sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
}
/**
 * Computes the common bounding box of the rectangles of the given sequences via
 * {@code RectangleTransformations.rectangleBBox}.
 */
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {

    var rectangles = textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList();
    return RectangleTransformations.rectangleBBox(rectangles);
}
/**
 * Appends the sequence to the currently open block and to the currently open line.
 */
public void addToCurrentLine(TextPositionSequence current) {

    currentSequencesWithoutGaps.add(current);
    sequencesInCurrentLine.add(current);
}
/**
 * Records the bounding boxes of the line and block that are still open once all sequences
 * have been added. Unlike {@code startNewLine}, no new line is opened afterwards.
 */
public void addFinalLine() {

    Rectangle2D lastLineBBox = textPositionBBox(sequencesInCurrentLine);
    lineBBox.add(lastLineBBox);

    Rectangle2D lastBlockBBox = textPositionBBox(currentSequencesWithoutGaps);
    bBoxWithGapsInCurrentLine.add(lastBlockBBox);
}
}
}

View File

@ -1,5 +1,25 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.experimental.UtilityClass;
/**
 * Derives the main body text frame of a page from its detected lines.
 */
@UtilityClass
public class MainBodyTextFrameExtractionService {

    // Padding factors, applied relative to the width / height of the unpadded frame.
    private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
    private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;


    /**
     * Collects all line bounding boxes into one frame and pads it.
     * The padding amounts are the frame width times {@code TEXT_FRAME_PAD_WIDTH} and the frame height times
     * {@code TEXT_FRAME_PAD_HEIGHT}; how they are applied is up to {@code RectangleTransformations.pad}.
     *
     * @param lineInformation the line information whose line bounding boxes are combined
     * @return the padded frame
     */
    public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {

        Rectangle2D unpaddedFrame = lineInformation.getLineBBox().stream().collect(RectangleTransformations.collectBBox());
        double horizontalPadding = unpaddedFrame.getWidth() * TEXT_FRAME_PAD_WIDTH;
        double verticalPadding = unpaddedFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT;
        return RectangleTransformations.pad(unpaddedFrame, horizontalPadding, verticalPadding);
    }

}

View File

@ -1,2 +1,24 @@
package com.knecon.fforesight.service.layoutparser.processor.services;public class PageInformationService {
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import lombok.experimental.UtilityClass;
/**
 * Assembles the {@link PageInformation} of a page from its {@link PageContents}.
 */
@UtilityClass
public class PageInformationService {

    /**
     * Runs line detection, main body text frame extraction and gap detection for one page and bundles
     * the results together with the page contents.
     *
     * @param pageContents the contents of the page, providing the sorted text position sequences
     * @return the page contents together with line information, main body text frame and gap information
     */
    public PageInformation build(PageContents pageContents) {

        var sortedSequences = pageContents.getSortedTextPositionSequences();

        LineInformation lines = LineDetectionService.calculateLineInformation(sortedSequences);
        // The text frame is derived from the lines and is needed as input for the gap detection.
        Rectangle2D textFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lines);
        GapInformation gaps = GapDetectionService.findGapsInLines(sortedSequences, textFrame);

        return new PageInformation(pageContents, lines, textFrame, gaps);
    }

}

View File

@ -13,20 +13,19 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.TextPositionsWithPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.experimental.UtilityClass;
@UtilityClass
public class TextPositionSequenceExtractionService {
public class TextPositionSequenceSorter {
public List<TextPositionsWithPage> getSortedTextPositionsWithPages(String filename) throws IOException {
public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
List<TextPositionsWithPage> textPositionSequencesPerPage = new LinkedList<>();
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
PDDocument pdDocument = Loader.loadPDF(inputStream);
@ -36,20 +35,21 @@ public class TextPositionSequenceExtractionService {
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setSortByPosition(true);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(pdDocument);
// var sortedTextPositionSequences = stripper.getTextPositionSequences();
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
.stream()
.sorted(new TextPositionSequenceComparator())
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
textPositionSequencesPerPage.add(new TextPositionsWithPage(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox())));
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
}
pdDocument.close();

View File

@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.Pd
import lombok.SneakyThrows;
class ColumnDetectionServiceTest {
class GapAcrossLinesDetectionServiceTest {
@Test
@SneakyThrows

View File

@ -1,23 +1,64 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.geom.Rectangle2D;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
class InvisibleTableDetectionServiceTest {
/**
 * Visualisation test for {@code InvisibleTableDetectionService.detectTable}: builds a table bounding box
 * from a hard-coded range of text position sequences on the first page, runs the table detection on all
 * sequences inside that box and draws the result into a PDF under /tmp. The test contains no assertions.
 */
@Test
@SneakyThrows
public void detectInvisibleTableTest() {
// NOTE(review): fileName is declared twice in a row with different paths; this looks like both sides of a
// diff shown together - confirm which declaration is the current one.
String fileName = "files/test-two-pages_ocred-2.pdf";
String fileName = "files/new/test-two-pages_ocred-2.pdf";
// Output path: /tmp/<file name of the input>_TABLE.pdf
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
// NOTE(review): pageContents is also declared twice with different element types. The getPageContents()
// calls below match the List<PageInformation> variant - confirm which declaration is the current one.
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
List<PageContents> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
// Not referenced again within this method.
int pageNumber = 1;
// Bounding box over the sequences at indices 45 (inclusive) to 152 (exclusive) of the first page;
// presumably this range covers the table in the test document - TODO confirm.
// Each rectangle is passed through mirrorY before being collected into the bounding box.
Rectangle2D tableBBox = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
.subList(45, 152)
.stream()
.map(TextPositionSequence::getRectangle)
.map(RectangleTransformations::toRectangle2D)
.map(this::mirrorY)
.collect(RectangleTransformations.collectBBox());
// Keep every sequence of the first page whose (mirrored) rectangle lies completely inside the table box.
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
.stream()
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
.toList();
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
// The detection result is flattened into one list for the first entry; the second entry is an empty list
// (presumably one entry per page of the two-page document - TODO confirm).
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
}
/**
 * Normalises a rectangle with negative height.
 * A rectangle whose height is greater than or equal to zero is returned unchanged (same instance).
 * Otherwise a new rectangle is returned that has the same x and width, a y shifted by the (negative)
 * height, and the negated height.
 *
 * @param rectangle2D the rectangle to normalise
 * @return the given rectangle, or a new rectangle with non-negative height
 */
private Rectangle2D mirrorY(Rectangle2D rectangle2D) {

    double height = rectangle2D.getHeight();
    return height >= 0
            ? rectangle2D
            : new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + height, rectangle2D.getWidth(), -height);
}
}

View File

@ -1,7 +1,26 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.server.services;
import static org.junit.jupiter.api.Assertions.*;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import lombok.SneakyThrows;
/**
 * Test skeleton for the main body text frame extraction.
 */
class MainBodyTextFrameExtractionServiceTest {

    /**
     * Loads the sorted text positions of the test document.
     * NOTE(review): the output file name and the loaded pages are not used afterwards and nothing is
     * asserted; the test only checks that loading does not throw.
     */
    @Test
    @SneakyThrows
    public void testMainBodyDetection() {

        String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";

        // Output path: /tmp/<file name of the input>_MAIN_BODY.pdf
        Path outputPath = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf");
        String tmpFileName = outputPath.toString();

        List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
    }

}

View File

@ -1,49 +1,62 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.MainBodyTextFrameExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
class GapDetectionServiceTest {
class PageInformationServiceTest {
/**
 * Disabled visualisation test for the gap detection: extracts the page data of a test document, collects
 * the detected x- and y-gaps per page and draws them into a PDF under /tmp, printing the elapsed time of
 * each phase. The test contains no assertions.
 *
 * NOTE(review): several statements below appear twice in slightly different form (two declarations of
 * filename, a loop over PageContents next to a PageInformation stream, two argument lists for the draw
 * call). This looks like both sides of a diff shown together - confirm which lines are current.
 */
@Test
@Disabled
@SneakyThrows
public void testGapDetection() {
// NOTE(review): duplicate declaration of filename, see method comment.
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String filename = "files/new/test-two-pages_ocred-2.pdf";
// Output path: "/tmp/" + third "/"-separated segment of filename + "_GAPS.pdf"
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
List<GapInformation> gapInformationInLinesPerPage = new LinkedList<>();
// Variant that builds the complete PageInformation (including gap information) per page in one step.
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start gap detection");
start = System.currentTimeMillis();
// NOTE(review): this loop passes PageContents to calculateMainBodyTextFrame - verify against the current
// signature of MainBodyTextFrameExtractionService.
for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
// List<List<TextPositionSequence>> lines = LineDetectionService.orderByLine(textPositionsWithPage.getSortedTextPositionSequences());
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(pageContents);
gapInformationInLinesPerPage.add(GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame));
}
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start draw rectangles");
start = System.currentTimeMillis();
// Per page, the nested gap lists are flattened into one list; the y-gaps are passed first, then the x-gaps.
// NOTE(review): two argument lists follow each other here, see method comment.
PdfDraw.drawRectanglesAndLinesPerPage(filename,
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), tmpFileName);
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(),
tmpFileName);
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
}
/**
 * Disabled visualisation test for the line detection: builds the page information of a test document and
 * draws the gap-separated bounding boxes of every detected line into a PDF under /tmp, printing the
 * elapsed time of each phase. The test contains no assertions.
 */
@Test
@Disabled
@SneakyThrows
public void testLineDetection() {

    String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
    // Output path: "/tmp/" + third "/"-separated segment of filename + "_LINES.pdf".
    // The suffix names what is drawn, so the file cannot be mistaken for the output of testGapDetection.
    var tmpFileName = "/tmp/" + filename.split("/")[2] + "_LINES.pdf";

    System.out.println("start TextPosition extraction");
    long start = System.currentTimeMillis();
    // PageInformationService::build already performs the line detection for every page, so there is no
    // separate detection phase to time afterwards.
    List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
    System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);

    System.out.println("start draw rectangles");
    start = System.currentTimeMillis();
    PdfDraw.drawRectanglesPerPageNumberedByLine(filename,
            pageInformations.stream()
                    .map(PageInformation::getLineInformation)
                    .map(lineInformation -> lineInformation.getBBoxWithGapsByLines())
                    .toList(),
            tmpFileName);
    System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
}

View File

@ -7,7 +7,7 @@ import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;