TAAS-41: add (inactive) experimental services
This commit is contained in:
parent
241a32cb4f
commit
526b1c5ad3
@ -12,12 +12,16 @@ import lombok.experimental.FieldDefaults;
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class Gaps {
|
||||
List<List<Rectangle2D>> xGaps ;
|
||||
List<List<Rectangle2D>> yGaps ;
|
||||
public class GapInformation {
|
||||
|
||||
List<List<Rectangle2D>> xGaps;
|
||||
List<List<Rectangle2D>> yGaps;
|
||||
|
||||
|
||||
public GapInformation() {
|
||||
|
||||
public Gaps() {
|
||||
xGaps = new LinkedList<>();
|
||||
yGaps = new LinkedList<>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class LineInformation {
|
||||
|
||||
List<Rectangle2D> lineBBox;
|
||||
List<List<TextPositionSequence>> sequencesByLines;
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||
|
||||
}
|
||||
|
||||
@ -12,9 +12,10 @@ import lombok.Getter;
|
||||
@Getter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
public class PageInformation {
|
||||
public class PageContents {
|
||||
|
||||
List<TextPositionSequence> sortedTextPositionSequences;
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
public class PageInformation {
|
||||
|
||||
PageContents pageContents;
|
||||
LineInformation lineInformation;
|
||||
Rectangle2D mainBodyTextFrame;
|
||||
GapInformation gapInformation;
|
||||
|
||||
}
|
||||
|
||||
@ -1,149 +1,87 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DividingColumnDetectionService {
|
||||
|
||||
private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6;
|
||||
private static final int MAX_NUMBER_OF_COLUMNS = 4;
|
||||
private static final int MAX_NUMBER_OF_COLUMNS = 200;
|
||||
|
||||
private static final int LINE_COUNT_THRESHOLD = 5;
|
||||
|
||||
|
||||
public List<Rectangle2D> detectColumns(List<TextPositionSequence> textPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
public List<Rectangle2D> detectColumns(PageContents pageContents) {
|
||||
|
||||
if (textPositionSequences.size() < 2) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
|
||||
if (pageContents.getSortedTextPositionSequences().size() < 2) {
|
||||
return List.of(pageContents.getCropBox());
|
||||
}
|
||||
|
||||
List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findTextBlockInLines(textPositionSequences);
|
||||
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox());
|
||||
|
||||
Map<Integer, List<Integer>> linesWithMatchingGapIndices = new HashMap<>();
|
||||
for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) {
|
||||
linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns));
|
||||
}
|
||||
|
||||
int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size());
|
||||
if (optimalNumberOfColumns == 1) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns);
|
||||
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> findConsecutiveLinesWithMatchingGaps(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
||||
public List<Rectangle2D> detectColumnsFromLines(List<List<Rectangle2D>> gaps, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
List<Boolean> booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns);
|
||||
return findConsecutiveTrueIndicesWithMaxLengthRun(booleans);
|
||||
}
|
||||
|
||||
|
||||
private List<Boolean> lineHasMatchingGap(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
||||
|
||||
return linesWithGaps.stream()
|
||||
.map(blocksWithGaps -> IntStream.range(1, numberOfColumns)
|
||||
.allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex))))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> findConsecutiveTrueIndicesWithMaxLengthRun(List<Boolean> booleans) {
|
||||
|
||||
List<Integer> maxConsecutiveTrueIndices = new LinkedList<>();
|
||||
List<Integer> currentConsecutiveTrueIndices = new LinkedList<>();
|
||||
for (int i = 0; i < booleans.size(); i++) {
|
||||
if (!booleans.get(i)) {
|
||||
if (currentConsecutiveTrueIndices.isEmpty()) {
|
||||
continue;
|
||||
List<List<Line2D>> allColumnParts = new ArrayList<>(MAX_NUMBER_OF_COLUMNS);
|
||||
for (int columnIndex = 1; columnIndex < MAX_NUMBER_OF_COLUMNS; columnIndex++) {
|
||||
double x = calculateGapLocation(columnIndex, MAX_NUMBER_OF_COLUMNS, mainBodyTextFrame.getWidth()) + mainBodyTextFrame.getMinX();
|
||||
double currentMinY = mainBodyTextFrame.getMaxY();
|
||||
double currentMaxY = 0;
|
||||
int currentLineCount = 0;
|
||||
List<Line2D> columnParts = new LinkedList<>();
|
||||
allColumnParts.add(columnParts);
|
||||
for (int lineNumber = 0; lineNumber < gaps.size(); lineNumber++) {
|
||||
List<Rectangle2D> textBlocksInLine = gaps.get(lineNumber);
|
||||
if (anyBlockIntersectX(textBlocksInLine, x)) {
|
||||
if (lineNumber == gaps.size() - 1) {
|
||||
currentMaxY = mainBodyTextFrame.getMinY();
|
||||
} else {
|
||||
currentMaxY = gaps.get(lineNumber + 1).get(0).getMinY();
|
||||
}
|
||||
currentLineCount++;
|
||||
} else {
|
||||
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
|
||||
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
|
||||
}
|
||||
currentMinY = gaps.get(lineNumber).get(0).getMaxY();
|
||||
currentMaxY = currentMinY;
|
||||
currentLineCount = 0;
|
||||
}
|
||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
||||
maxConsecutiveTrueIndices = currentConsecutiveTrueIndices;
|
||||
}
|
||||
currentConsecutiveTrueIndices = new LinkedList<>();
|
||||
continue;
|
||||
|
||||
}
|
||||
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
|
||||
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
|
||||
}
|
||||
currentConsecutiveTrueIndices.add(i);
|
||||
}
|
||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
||||
return currentConsecutiveTrueIndices;
|
||||
}
|
||||
return maxConsecutiveTrueIndices;
|
||||
return Stream.concat(Stream.of(mainBodyTextFrame),
|
||||
allColumnParts.stream()
|
||||
.flatMap(columnParts -> columnParts.stream().map(line -> new Rectangle2D.Double(line.getX2(), line.getY2(), 1, Math.abs(line.getY2() - line.getY1()))))
|
||||
.map(r -> (Rectangle2D) r)).toList();
|
||||
}
|
||||
|
||||
|
||||
private static int findOptimalNumberOfColumns(Map<Integer, List<Integer>> linesWithMatchingGapIndices, Integer numberOfLines) {
|
||||
private static boolean anyBlockIntersectX(List<Rectangle2D> textBlocksInLine, double x) {
|
||||
|
||||
return linesWithMatchingGapIndices.entrySet()
|
||||
.stream()
|
||||
.max(comparePercentages(numberOfLines))
|
||||
.filter(entry -> percentageIsAboveThreshold(entry, numberOfLines))
|
||||
.map(Map.Entry::getKey)
|
||||
.orElse(1);
|
||||
return textBlocksInLine.stream().anyMatch(gap -> gap.getMinX() < x && x < gap.getMaxX());
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle2D> buildColumns(Rectangle2D mainBodyTextFrame, List<Rectangle2D> rectanglesToMerge, int optimalColumnCount) {
|
||||
|
||||
if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
|
||||
double maxY = rectanglesToMerge.get(0).getMaxY();
|
||||
double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY();
|
||||
|
||||
List<Rectangle2D> columns = new LinkedList<>();
|
||||
double width = mainBodyTextFrame.getWidth() / optimalColumnCount;
|
||||
double height = maxY - minY;
|
||||
for (int i = 0; i < optimalColumnCount; i++) {
|
||||
columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinY() + i * width, minY, width, height));
|
||||
}
|
||||
return columns;
|
||||
}
|
||||
|
||||
|
||||
private Comparator<Map.Entry<Integer, List<Integer>>> comparePercentages(Integer numberOfLines) {
|
||||
|
||||
return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines));
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle2D> getLinesWithMatchingGaps(List<Integer> linesWithMatchingGapIndices, List<List<Rectangle2D>> linesWithGaps) {
|
||||
|
||||
return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
|
||||
private boolean percentageIsAboveThreshold(Map.Entry<Integer, List<Integer>> entry, Integer numberOfLines) {
|
||||
|
||||
return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) {
|
||||
|
||||
return ((double) numberOfMatchingLines) / ((double) numberOfLines);
|
||||
}
|
||||
|
||||
|
||||
private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) {
|
||||
private double calculateGapLocation(int columnIndex, int numberOfColumns, double pageWidth) {
|
||||
|
||||
return (pageWidth / numberOfColumns) * columnIndex;
|
||||
}
|
||||
|
||||
|
||||
private Boolean noBlocksIntersectX(List<Rectangle2D> blocksWithGaps, double x) {
|
||||
|
||||
return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,7 +4,9 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -12,17 +14,16 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class GapDetectionService {
|
||||
|
||||
private static final double X_GAP_FACTOR = 0.3; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
private static final double Y_GAP_FACTOR = 1;
|
||||
private static final double NEW_LINE_FACTOR = 0.2;
|
||||
|
||||
|
||||
public static Gaps findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
public static GapInformation findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (sortedTextPositionSequences.isEmpty()) {
|
||||
return new Gaps();
|
||||
return new GapInformation();
|
||||
}
|
||||
//assertAllTextPositionsHaveSameDir(textPositionSequences);
|
||||
|
||||
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
||||
|
||||
@ -32,30 +33,29 @@ public class GapDetectionService {
|
||||
var previousTextPosition = sortedTextPositionSequences.get(0);
|
||||
Rectangle2D rectangle = toRectangle2D(previousTextPosition);
|
||||
|
||||
yGapContext.addGapFromTopOfMainBody(rectangle);
|
||||
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
|
||||
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||
|
||||
double yGap = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
|
||||
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
|
||||
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
|
||||
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
|
||||
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
|
||||
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
|
||||
|
||||
if (yGap > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||
|
||||
yGapContext.addGap(mainBodyTextFrame.getMinX(), currentTextPositionBBox.getMaxY(), mainBodyTextFrame.getWidth(), yGap);
|
||||
|
||||
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||
yGapContext.addGap(mainBodyTextFrame.getMinX(),
|
||||
previousTextPositionBBox.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||
}
|
||||
if (yGap > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||
|
||||
xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
|
||||
xGapContext.gapsInCurrentLine = new LinkedList<>();
|
||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||
xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
|
||||
|
||||
|
||||
} else if (xGap <= avgTextPositionHeight * X_GAP_FACTOR) {
|
||||
} else if (xGap > avgTextPositionHeight * X_GAP_FACTOR) {
|
||||
addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
|
||||
}
|
||||
previousTextPosition = currentTextPosition;
|
||||
@ -63,15 +63,19 @@ public class GapDetectionService {
|
||||
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
|
||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||
|
||||
return new Gaps(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
|
||||
return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
||||
|
||||
return RectangleTransformations.toRectangle2D(textPosition.getRectangle());
|
||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||
}
|
||||
|
||||
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||
}
|
||||
|
||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||
|
||||
@ -111,19 +115,9 @@ public class GapDetectionService {
|
||||
}
|
||||
|
||||
|
||||
public void addGapFromTopOfMainBody(Rectangle2D rectangle) {
|
||||
public void addGap(double x1, double y1, double w, double h) {
|
||||
|
||||
gapsInCurrentLine.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
||||
rectangle.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
mainBodyTextFrame.getMaxY() - rectangle.getMaxY()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void addGap(double x, double y, double w, double h) {
|
||||
|
||||
gapsInCurrentLine.add(new Rectangle2D.Double(x, y, w, h));
|
||||
gapsInCurrentLine.add(new Rectangle2D.Double(x1, y1, w, h));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -15,13 +15,14 @@ import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class GapFindingColumnDetectionService implements ColumnDetectionService {
|
||||
public class GapsAcrossLinesService {
|
||||
|
||||
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
|
||||
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
|
||||
private static final double DISTANCE_TO_BORDER_THRESHOLD = 1;
|
||||
|
||||
|
||||
public List<Rectangle2D> detectColumns(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
|
||||
public List<Rectangle2D> detectXGapsAcrossLines(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (gapInformation.getXGaps().size() < 2) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
@ -35,40 +36,41 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
||||
.orElseThrow();
|
||||
|
||||
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
|
||||
gapInformation.getXGaps().get(0).stream().map(Column::new).forEach(columnFactory::addToQueue);
|
||||
gapInformation.getXGaps().get(0).stream().map(GapAcrossLines::new).forEach(columnFactory::addToQueue);
|
||||
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
|
||||
for (var gaps : xGaps.subList(1, xGaps.size())) {
|
||||
|
||||
while (columnFactory.hasColumnsToProcess()) {
|
||||
Column column = columnFactory.getNext();
|
||||
rememberColumnIfValid(columnFactory, column);
|
||||
elongateColumnsAndFilterForWidth(column, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
|
||||
while (columnFactory.hasGapsToProcess()) {
|
||||
GapAcrossLines gapAcrossLines = columnFactory.getNext();
|
||||
rememberColumnIfValid(columnFactory, gapAcrossLines);
|
||||
elongateColumnsAndFilterForWidth(gapAcrossLines, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
|
||||
}
|
||||
columnFactory.addStillInProgressToQueue();
|
||||
columnFactory.addGapsToQueue(gaps);
|
||||
}
|
||||
|
||||
return columnFactory.outputColumns.stream()
|
||||
.filter(column -> columnFactory.outputColumns.stream().filter(column::intersectsX).noneMatch(column1 -> column1.lineCount > column.lineCount))
|
||||
.filter(column -> )
|
||||
.map(Column::getRectangle2D)
|
||||
return columnFactory.outputGaps.stream()
|
||||
.filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
|
||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||
.map(GapAcrossLines::getRectangle2D)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static void rememberColumnIfValid(ColumnFactory columnFactory, Column column) {
|
||||
private static void rememberColumnIfValid(ColumnFactory columnFactory, GapAcrossLines gapAcrossLines) {
|
||||
|
||||
if (column.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
|
||||
columnFactory.outputColumns.add(column);
|
||||
if (gapAcrossLines.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
|
||||
columnFactory.outputGaps.add(gapAcrossLines);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Column> elongateColumnsAndFilterForWidth(Column column, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
|
||||
private static Stream<GapAcrossLines> elongateColumnsAndFilterForWidth(GapAcrossLines gapAcrossLines, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
|
||||
|
||||
return gaps.stream()//
|
||||
.filter(gap -> column.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
|
||||
.map(column::addNewLineAndShrink);
|
||||
.filter(gap -> gapAcrossLines.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
|
||||
.map(gapAcrossLines::addNewLineAndShrink);
|
||||
|
||||
}
|
||||
|
||||
@ -85,13 +87,13 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
private class Column {
|
||||
private class GapAcrossLines {
|
||||
|
||||
Rectangle2D rectangle2D;
|
||||
int lineCount = 1;
|
||||
|
||||
|
||||
public Column(Rectangle2D rectangle2D) {
|
||||
public GapAcrossLines(Rectangle2D rectangle2D) {
|
||||
|
||||
this.rectangle2D = correctRectangle(rectangle2D);
|
||||
}
|
||||
@ -103,9 +105,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(Column column) {
|
||||
public boolean intersectsX(GapAcrossLines gapAcrossLines) {
|
||||
|
||||
return this.intersectsX(column.getRectangle2D());
|
||||
return this.intersectsX(gapAcrossLines.getRectangle2D());
|
||||
}
|
||||
|
||||
|
||||
@ -120,7 +122,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
||||
}
|
||||
|
||||
|
||||
public Column addNewLineAndShrink(Rectangle2D rectangle2D) {
|
||||
public GapAcrossLines addNewLineAndShrink(Rectangle2D rectangle2D) {
|
||||
|
||||
var correctedRectangle = correctRectangle(rectangle2D);
|
||||
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
|
||||
@ -129,7 +131,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
||||
double max_y = this.rectangle2D.getMaxY();
|
||||
double width = max_x - min_x;
|
||||
double height = max_y - min_y;
|
||||
return new Column(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
|
||||
return new GapAcrossLines(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
|
||||
}
|
||||
|
||||
}
|
||||
@ -140,9 +142,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
||||
final double avgHeight;
|
||||
final int lineCount;
|
||||
|
||||
List<Column> outputColumns = new LinkedList<>();
|
||||
Queue<Column> columnQueue = new LinkedList<>();
|
||||
List<Column> columnsToQueue = new LinkedList<>();
|
||||
List<GapAcrossLines> outputGaps = new LinkedList<>();
|
||||
Queue<GapAcrossLines> gapsQueue = new LinkedList<>();
|
||||
List<GapAcrossLines> gapsToQueue = new LinkedList<>();
|
||||
|
||||
|
||||
public static ColumnFactory init(double avgHeight, int lineCount) {
|
||||
@ -151,40 +153,40 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
||||
}
|
||||
|
||||
|
||||
public Column getNext() {
|
||||
public GapAcrossLines getNext() {
|
||||
|
||||
return columnQueue.remove();
|
||||
return gapsQueue.remove();
|
||||
}
|
||||
|
||||
|
||||
public void addToQueue(Column column) {
|
||||
public void addToQueue(GapAcrossLines gapAcrossLines) {
|
||||
|
||||
columnQueue.add(column);
|
||||
gapsQueue.add(gapAcrossLines);
|
||||
}
|
||||
|
||||
|
||||
public void addToQueue(Rectangle2D gap) {
|
||||
|
||||
columnQueue.add(new Column(gap));
|
||||
gapsQueue.add(new GapAcrossLines(gap));
|
||||
}
|
||||
|
||||
|
||||
private boolean hasColumnsToProcess() {
|
||||
private boolean hasGapsToProcess() {
|
||||
|
||||
return columnQueue.peek() != null;
|
||||
return gapsQueue.peek() != null;
|
||||
}
|
||||
|
||||
|
||||
public void setToStillInProgress(Column column) {
|
||||
public void setToStillInProgress(GapAcrossLines gapAcrossLines) {
|
||||
|
||||
columnsToQueue.add(column);
|
||||
gapsToQueue.add(gapAcrossLines);
|
||||
}
|
||||
|
||||
|
||||
private void addStillInProgressToQueue() {
|
||||
|
||||
for (int i = columnsToQueue.size() - 1; i >= 0; i--) {
|
||||
columnQueue.add(columnsToQueue.remove(i));
|
||||
for (int i = gapsToQueue.size() - 1; i >= 0; i--) {
|
||||
gapsQueue.add(gapsToQueue.remove(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,5 +1,63 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class InvisibleTableDetectionService {
|
||||
|
||||
public List<List<Rectangle2D>> detectTable(List<TextPositionSequence> textPositionSequences, Rectangle2D tableBBox) {
|
||||
|
||||
LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences);
|
||||
GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox);
|
||||
List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox);
|
||||
List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList();
|
||||
int colCount = gapsAcrossLines.size();
|
||||
int rowCount = lineInformation.getLineBBox().size();
|
||||
List<List<Rectangle2D>> cells = new LinkedList<>();
|
||||
List<Rectangle2D> cellsInLine = new LinkedList<>();
|
||||
cells.add(cellsInLine);
|
||||
double x1;
|
||||
double y1;
|
||||
double x2;
|
||||
double y2;
|
||||
for (int col = 0; col < colCount + 1; col++) {
|
||||
for (int row = 0; row < rowCount + 1; row++) {
|
||||
if (col == 0) {
|
||||
x1 = tableBBox.getX();
|
||||
} else {
|
||||
x1 = columnXCoords.get(col - 1);
|
||||
}
|
||||
if (row == 0) {
|
||||
y2 = tableBBox.getMaxY();
|
||||
} else {
|
||||
y2 = lineInformation.getLineBBox().get(row - 1).getY();
|
||||
}
|
||||
if (col == colCount) {
|
||||
x2 = tableBBox.getMaxX();
|
||||
} else {
|
||||
x2 = columnXCoords.get(col);
|
||||
}
|
||||
if (row == rowCount) {
|
||||
y1 = tableBBox.getY();
|
||||
} else {
|
||||
y1 = lineInformation.getLineBBox().get(row).getY();
|
||||
}
|
||||
cellsInLine.add(new Rectangle2D.Double(x1, y1, x2 - x1, y2 - y1));
|
||||
}
|
||||
cellsInLine = new LinkedList<>();
|
||||
cells.add(cellsInLine);
|
||||
}
|
||||
|
||||
return cells;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,14 +1,16 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
@ -17,37 +19,49 @@ public class LineDetectionService {
|
||||
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
|
||||
|
||||
public static List<List<Rectangle2D>> findTextBlockInLines(List<TextPositionSequence> textPositionSequences) {
|
||||
public LineInformation calculateLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
|
||||
|
||||
if (textPositionSequences.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
if (sortedTextPositionSequences.isEmpty()) {
|
||||
return LineFactory.init().build();
|
||||
}
|
||||
|
||||
final double avgTextPositionHeight = getAvgTextPositionHeight(textPositionSequences);
|
||||
|
||||
TextBlockContext context = TextBlockContext.init();
|
||||
|
||||
List<TextPositionSequence> sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList();
|
||||
|
||||
var previousTextPosition = sortedTextPositionSequence.get(0);
|
||||
context.textPositionsToMerge.add(previousTextPosition);
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) {
|
||||
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
|
||||
addBlockToLine(context);
|
||||
startNewLine(currentTextPosition, context);
|
||||
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
|
||||
addBlockToLine(context);
|
||||
startNewBlock(currentTextPosition, context);
|
||||
} else {
|
||||
context.textPositionsToMerge.add(currentTextPosition);
|
||||
}
|
||||
previousTextPosition = currentTextPosition;
|
||||
}
|
||||
addBlockToLine(context);
|
||||
return context.textBlocksInLines;
|
||||
return buildLineInformation(sortedTextPositionSequences);
|
||||
}
|
||||
|
||||
|
||||
public List<List<Rectangle2D>> findLinesWithGaps(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
return calculateLineInformation(sortedTextPositionSequences).getBBoxWithGapsByLines();
|
||||
}
|
||||
|
||||
|
||||
public List<List<TextPositionSequence>> orderByLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
return calculateLineInformation(sortedTextPositionSequences).getSequencesByLines();
|
||||
}
|
||||
|
||||
|
||||
private static LineInformation buildLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
|
||||
|
||||
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
||||
|
||||
LineFactory lineFactory = LineFactory.init();
|
||||
|
||||
var previousTextPosition = sortedTextPositionSequences.get(0);
|
||||
lineFactory.addToCurrentLine(previousTextPosition);
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
|
||||
lineFactory.startNewLine();
|
||||
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
|
||||
lineFactory.startNewBlock();
|
||||
}
|
||||
lineFactory.addToCurrentLine(currentTextPosition);
|
||||
previousTextPosition = currentTextPosition;
|
||||
}
|
||||
lineFactory.addFinalLine();
|
||||
return lineFactory.build();
|
||||
}
|
||||
|
||||
|
||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
@ -73,50 +87,116 @@ public class LineDetectionService {
|
||||
}
|
||||
|
||||
|
||||
private static void startNewBlock(TextPositionSequence currentTextPosition, TextBlockContext context) {
|
||||
|
||||
context.textPositionsToMerge = new LinkedList<>();
|
||||
context.textPositionsToMerge.add(currentTextPosition);
|
||||
}
|
||||
|
||||
|
||||
private static void addBlockToLine(TextBlockContext context) {
|
||||
|
||||
context.blocksInCurrentLine.add(textPositionBBox(context.textPositionsToMerge));
|
||||
}
|
||||
|
||||
|
||||
private static void startNewLine(TextPositionSequence current, TextBlockContext context) {
|
||||
|
||||
context.blocksInCurrentLine = new LinkedList<>();
|
||||
startNewBlock(current, context);
|
||||
context.textBlocksInLines.add(context.blocksInCurrentLine);
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
||||
}
|
||||
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
private class TextBlockContext {
|
||||
private class LineFactory {
|
||||
|
||||
List<List<Rectangle2D>> textBlocksInLines;
|
||||
List<Rectangle2D> blocksInCurrentLine;
|
||||
List<TextPositionSequence> textPositionsToMerge;
|
||||
List<Rectangle2D> lineBBox;
|
||||
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||
List<Rectangle2D> bBoxWithGapsInCurrentLine;
|
||||
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine;
|
||||
|
||||
List<TextPositionSequence> currentSequencesWithoutGaps;
|
||||
|
||||
List<List<TextPositionSequence>> sequencesByLines;
|
||||
List<TextPositionSequence> sequencesInCurrentLine;
|
||||
|
||||
List<List<Rectangle2D>> xGaps;
|
||||
List<List<Rectangle2D>> yGaps;
|
||||
|
||||
|
||||
public static TextBlockContext init() {
|
||||
public static LineFactory init() {
|
||||
|
||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
||||
initialLinesWithGaps.add(initialBlocksInLine);
|
||||
return new TextBlockContext(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>());
|
||||
List<Rectangle2D> lineBBox = new LinkedList<>();
|
||||
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines = new LinkedList<>();
|
||||
List<Rectangle2D> bBoxWithGapsInCurrentLine = new LinkedList<>();
|
||||
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
|
||||
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines = new LinkedList<>();
|
||||
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine = new LinkedList<>();
|
||||
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
|
||||
List<TextPositionSequence> currentSequencesWithoutGaps = new LinkedList<>();
|
||||
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||
|
||||
List<List<TextPositionSequence>> sequencesByLines = new LinkedList<>();
|
||||
List<TextPositionSequence> sequencesInCurrentLine = new LinkedList<>();
|
||||
sequencesByLines.add(sequencesInCurrentLine);
|
||||
|
||||
return new LineFactory(lineBBox,
|
||||
bBoxWithGapsByLines,
|
||||
bBoxWithGapsInCurrentLine,
|
||||
sequencesWithGapsByLines,
|
||||
sequencesWithGapsInCurrentLine,
|
||||
currentSequencesWithoutGaps,
|
||||
sequencesByLines,
|
||||
sequencesInCurrentLine,
|
||||
null,
|
||||
null);
|
||||
}
|
||||
|
||||
|
||||
public void addGaps(GapInformation gapInformation) {
|
||||
|
||||
this.xGaps = gapInformation.getXGaps();
|
||||
this.yGaps = gapInformation.getYGaps();
|
||||
}
|
||||
|
||||
|
||||
public LineInformation build() {
|
||||
|
||||
return new LineInformation(lineBBox, sequencesByLines, bBoxWithGapsByLines, sequencesWithGapsByLines);
|
||||
}
|
||||
|
||||
|
||||
public void startNewBlock() {
|
||||
|
||||
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||
currentSequencesWithoutGaps = new LinkedList<>();
|
||||
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||
}
|
||||
|
||||
|
||||
public void startNewLine() {
|
||||
|
||||
lineBBox.add(textPositionBBox(sequencesInCurrentLine));
|
||||
|
||||
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||
bBoxWithGapsInCurrentLine = new LinkedList<>();
|
||||
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
|
||||
|
||||
sequencesWithGapsInCurrentLine = new LinkedList<>();
|
||||
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
|
||||
currentSequencesWithoutGaps = new LinkedList<>();
|
||||
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||
|
||||
sequencesInCurrentLine = new LinkedList<>();
|
||||
sequencesByLines.add(sequencesInCurrentLine);
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
||||
}
|
||||
|
||||
|
||||
public void addToCurrentLine(TextPositionSequence current) {
|
||||
|
||||
sequencesInCurrentLine.add(current);
|
||||
currentSequencesWithoutGaps.add(current);
|
||||
}
|
||||
|
||||
|
||||
public void addFinalLine() {
|
||||
|
||||
lineBBox.add(textPositionBBox(sequencesInCurrentLine));
|
||||
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,25 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class MainBodyTextFrameExtractionService {
|
||||
|
||||
private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
|
||||
private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;
|
||||
|
||||
|
||||
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
|
||||
|
||||
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,2 +1,24 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;public class PageInformationService {
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PageInformationService {
|
||||
|
||||
public PageInformation build(PageContents pageContents) {
|
||||
|
||||
LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedTextPositionSequences());
|
||||
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation);
|
||||
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame);
|
||||
|
||||
return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -13,20 +13,19 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.TextPositionsWithPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextPositionSequenceExtractionService {
|
||||
public class TextPositionSequenceSorter {
|
||||
|
||||
public List<TextPositionsWithPage> getSortedTextPositionsWithPages(String filename) throws IOException {
|
||||
public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
|
||||
|
||||
List<TextPositionsWithPage> textPositionSequencesPerPage = new LinkedList<>();
|
||||
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
@ -36,20 +35,21 @@ public class TextPositionSequenceExtractionService {
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
// var sortedTextPositionSequences = stripper.getTextPositionSequences();
|
||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||
.stream()
|
||||
.sorted(new TextPositionSequenceComparator())
|
||||
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
||||
|
||||
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
||||
|
||||
textPositionSequencesPerPage.add(new TextPositionsWithPage(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox())));
|
||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
|
||||
}
|
||||
|
||||
pdDocument.close();
|
||||
|
||||
@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.Pd
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class ColumnDetectionServiceTest {
|
||||
class GapAcrossLinesDetectionServiceTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
|
||||
@ -1,23 +1,64 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class InvisibleTableDetectionServiceTest {
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void detectInvisibleTableTest() {
|
||||
|
||||
String fileName = "files/test-two-pages_ocred-2.pdf";
|
||||
String fileName = "files/new/test-two-pages_ocred-2.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||
|
||||
List<PageContents> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
int pageNumber = 1;
|
||||
Rectangle2D tableBBox = pageContents.get(0)
|
||||
.getPageContents()
|
||||
.getSortedTextPositionSequences()
|
||||
.subList(45, 152)
|
||||
.stream()
|
||||
.map(TextPositionSequence::getRectangle)
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
.map(this::mirrorY)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
|
||||
.getPageContents()
|
||||
.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
|
||||
.toList();
|
||||
|
||||
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
|
||||
|
||||
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||
|
||||
if (rectangle2D.getHeight() >= 0) {
|
||||
return rectangle2D;
|
||||
}
|
||||
return new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + rectangle2D.getHeight(), rectangle2D.getWidth(), -rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,7 +1,26 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class MainBodyTextFrameExtractionServiceTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMainBodyDetection() {
|
||||
|
||||
String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
||||
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,49 +1,62 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.MainBodyTextFrameExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class GapDetectionServiceTest {
|
||||
class PageInformationServiceTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testGapDetection() {
|
||||
|
||||
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String filename = "files/new/test-two-pages_ocred-2.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
|
||||
List<GapInformation> gapInformationInLinesPerPage = new LinkedList<>();
|
||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start gap detection");
|
||||
start = System.currentTimeMillis();
|
||||
for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
|
||||
// List<List<TextPositionSequence>> lines = LineDetectionService.orderByLine(textPositionsWithPage.getSortedTextPositionSequences());
|
||||
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(pageContents);
|
||||
gapInformationInLinesPerPage.add(GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame));
|
||||
}
|
||||
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesAndLinesPerPage(filename,
|
||||
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), tmpFileName);
|
||||
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||
tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testLineDetection() {
|
||||
|
||||
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start gap detection");
|
||||
start = System.currentTimeMillis();
|
||||
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesPerPageNumberedByLine(filename,
|
||||
pageInformations.stream().map(PageInformation::getLineInformation).map(gaps -> gaps.getBBoxWithGapsByLines().stream().toList()).toList(),
|
||||
tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user