TAAS-41: add (inactive) experimental services
This commit is contained in:
parent
241a32cb4f
commit
526b1c5ad3
@ -12,12 +12,16 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@Getter
|
@Getter
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class Gaps {
|
public class GapInformation {
|
||||||
List<List<Rectangle2D>> xGaps ;
|
|
||||||
List<List<Rectangle2D>> yGaps ;
|
List<List<Rectangle2D>> xGaps;
|
||||||
|
List<List<Rectangle2D>> yGaps;
|
||||||
|
|
||||||
|
|
||||||
|
public GapInformation() {
|
||||||
|
|
||||||
public Gaps() {
|
|
||||||
xGaps = new LinkedList<>();
|
xGaps = new LinkedList<>();
|
||||||
yGaps = new LinkedList<>();
|
yGaps = new LinkedList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,23 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class LineInformation {
|
public class LineInformation {
|
||||||
|
|
||||||
|
List<Rectangle2D> lineBBox;
|
||||||
|
List<List<TextPositionSequence>> sequencesByLines;
|
||||||
|
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||||
|
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -12,9 +12,10 @@ import lombok.Getter;
|
|||||||
@Getter
|
@Getter
|
||||||
@Builder
|
@Builder
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class PageInformation {
|
public class PageContents {
|
||||||
|
|
||||||
List<TextPositionSequence> sortedTextPositionSequences;
|
List<TextPositionSequence> sortedTextPositionSequences;
|
||||||
Rectangle2D cropBox;
|
Rectangle2D cropBox;
|
||||||
|
Rectangle2D mediaBox;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,17 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@AllArgsConstructor
|
||||||
public class PageInformation {
|
public class PageInformation {
|
||||||
|
|
||||||
|
PageContents pageContents;
|
||||||
|
LineInformation lineInformation;
|
||||||
|
Rectangle2D mainBodyTextFrame;
|
||||||
|
GapInformation gapInformation;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,149 +1,87 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collection;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.stream.Stream;
|
||||||
import java.util.stream.IntStream;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class DividingColumnDetectionService {
|
public class DividingColumnDetectionService {
|
||||||
|
|
||||||
private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6;
|
private static final int MAX_NUMBER_OF_COLUMNS = 200;
|
||||||
private static final int MAX_NUMBER_OF_COLUMNS = 4;
|
|
||||||
|
private static final int LINE_COUNT_THRESHOLD = 5;
|
||||||
|
|
||||||
|
|
||||||
public List<Rectangle2D> detectColumns(List<TextPositionSequence> textPositionSequences, Rectangle2D mainBodyTextFrame) {
|
public List<Rectangle2D> detectColumns(PageContents pageContents) {
|
||||||
|
|
||||||
if (textPositionSequences.size() < 2) {
|
|
||||||
return List.of(mainBodyTextFrame);
|
if (pageContents.getSortedTextPositionSequences().size() < 2) {
|
||||||
|
return List.of(pageContents.getCropBox());
|
||||||
}
|
}
|
||||||
|
|
||||||
List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findTextBlockInLines(textPositionSequences);
|
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox());
|
||||||
|
|
||||||
Map<Integer, List<Integer>> linesWithMatchingGapIndices = new HashMap<>();
|
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
|
||||||
for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) {
|
|
||||||
linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns));
|
|
||||||
}
|
|
||||||
|
|
||||||
int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size());
|
|
||||||
if (optimalNumberOfColumns == 1) {
|
|
||||||
return List.of(mainBodyTextFrame);
|
|
||||||
}
|
|
||||||
return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Integer> findConsecutiveLinesWithMatchingGaps(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
public List<Rectangle2D> detectColumnsFromLines(List<List<Rectangle2D>> gaps, Rectangle2D mainBodyTextFrame) {
|
||||||
|
|
||||||
List<Boolean> booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns);
|
List<List<Line2D>> allColumnParts = new ArrayList<>(MAX_NUMBER_OF_COLUMNS);
|
||||||
return findConsecutiveTrueIndicesWithMaxLengthRun(booleans);
|
for (int columnIndex = 1; columnIndex < MAX_NUMBER_OF_COLUMNS; columnIndex++) {
|
||||||
|
double x = calculateGapLocation(columnIndex, MAX_NUMBER_OF_COLUMNS, mainBodyTextFrame.getWidth()) + mainBodyTextFrame.getMinX();
|
||||||
|
double currentMinY = mainBodyTextFrame.getMaxY();
|
||||||
|
double currentMaxY = 0;
|
||||||
|
int currentLineCount = 0;
|
||||||
|
List<Line2D> columnParts = new LinkedList<>();
|
||||||
|
allColumnParts.add(columnParts);
|
||||||
|
for (int lineNumber = 0; lineNumber < gaps.size(); lineNumber++) {
|
||||||
|
List<Rectangle2D> textBlocksInLine = gaps.get(lineNumber);
|
||||||
|
if (anyBlockIntersectX(textBlocksInLine, x)) {
|
||||||
|
if (lineNumber == gaps.size() - 1) {
|
||||||
|
currentMaxY = mainBodyTextFrame.getMinY();
|
||||||
|
} else {
|
||||||
|
currentMaxY = gaps.get(lineNumber + 1).get(0).getMinY();
|
||||||
|
}
|
||||||
|
currentLineCount++;
|
||||||
|
} else {
|
||||||
|
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
|
||||||
|
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
|
||||||
|
}
|
||||||
|
currentMinY = gaps.get(lineNumber).get(0).getMaxY();
|
||||||
|
currentMaxY = currentMinY;
|
||||||
|
currentLineCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
|
||||||
|
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Stream.concat(Stream.of(mainBodyTextFrame),
|
||||||
|
allColumnParts.stream()
|
||||||
|
.flatMap(columnParts -> columnParts.stream().map(line -> new Rectangle2D.Double(line.getX2(), line.getY2(), 1, Math.abs(line.getY2() - line.getY1()))))
|
||||||
|
.map(r -> (Rectangle2D) r)).toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Boolean> lineHasMatchingGap(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
private static boolean anyBlockIntersectX(List<Rectangle2D> textBlocksInLine, double x) {
|
||||||
|
|
||||||
return linesWithGaps.stream()
|
return textBlocksInLine.stream().anyMatch(gap -> gap.getMinX() < x && x < gap.getMaxX());
|
||||||
.map(blocksWithGaps -> IntStream.range(1, numberOfColumns)
|
|
||||||
.allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex))))
|
|
||||||
.toList();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Integer> findConsecutiveTrueIndicesWithMaxLengthRun(List<Boolean> booleans) {
|
private double calculateGapLocation(int columnIndex, int numberOfColumns, double pageWidth) {
|
||||||
|
|
||||||
List<Integer> maxConsecutiveTrueIndices = new LinkedList<>();
|
|
||||||
List<Integer> currentConsecutiveTrueIndices = new LinkedList<>();
|
|
||||||
for (int i = 0; i < booleans.size(); i++) {
|
|
||||||
if (!booleans.get(i)) {
|
|
||||||
if (currentConsecutiveTrueIndices.isEmpty()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
|
||||||
maxConsecutiveTrueIndices = currentConsecutiveTrueIndices;
|
|
||||||
}
|
|
||||||
currentConsecutiveTrueIndices = new LinkedList<>();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
currentConsecutiveTrueIndices.add(i);
|
|
||||||
}
|
|
||||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
|
||||||
return currentConsecutiveTrueIndices;
|
|
||||||
}
|
|
||||||
return maxConsecutiveTrueIndices;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static int findOptimalNumberOfColumns(Map<Integer, List<Integer>> linesWithMatchingGapIndices, Integer numberOfLines) {
|
|
||||||
|
|
||||||
return linesWithMatchingGapIndices.entrySet()
|
|
||||||
.stream()
|
|
||||||
.max(comparePercentages(numberOfLines))
|
|
||||||
.filter(entry -> percentageIsAboveThreshold(entry, numberOfLines))
|
|
||||||
.map(Map.Entry::getKey)
|
|
||||||
.orElse(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Rectangle2D> buildColumns(Rectangle2D mainBodyTextFrame, List<Rectangle2D> rectanglesToMerge, int optimalColumnCount) {
|
|
||||||
|
|
||||||
if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) {
|
|
||||||
return List.of(mainBodyTextFrame);
|
|
||||||
}
|
|
||||||
|
|
||||||
double maxY = rectanglesToMerge.get(0).getMaxY();
|
|
||||||
double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY();
|
|
||||||
|
|
||||||
List<Rectangle2D> columns = new LinkedList<>();
|
|
||||||
double width = mainBodyTextFrame.getWidth() / optimalColumnCount;
|
|
||||||
double height = maxY - minY;
|
|
||||||
for (int i = 0; i < optimalColumnCount; i++) {
|
|
||||||
columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinY() + i * width, minY, width, height));
|
|
||||||
}
|
|
||||||
return columns;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Comparator<Map.Entry<Integer, List<Integer>>> comparePercentages(Integer numberOfLines) {
|
|
||||||
|
|
||||||
return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Rectangle2D> getLinesWithMatchingGaps(List<Integer> linesWithMatchingGapIndices, List<List<Rectangle2D>> linesWithGaps) {
|
|
||||||
|
|
||||||
return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean percentageIsAboveThreshold(Map.Entry<Integer, List<Integer>> entry, Integer numberOfLines) {
|
|
||||||
|
|
||||||
return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) {
|
|
||||||
|
|
||||||
return ((double) numberOfMatchingLines) / ((double) numberOfLines);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) {
|
|
||||||
|
|
||||||
return (pageWidth / numberOfColumns) * columnIndex;
|
return (pageWidth / numberOfColumns) * columnIndex;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Boolean noBlocksIntersectX(List<Rectangle2D> blocksWithGaps, double x) {
|
|
||||||
|
|
||||||
return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,7 +4,9 @@ import java.awt.geom.Rectangle2D;
|
|||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
@ -12,17 +14,16 @@ import lombok.experimental.UtilityClass;
|
|||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class GapDetectionService {
|
public class GapDetectionService {
|
||||||
|
|
||||||
private static final double X_GAP_FACTOR = 0.3; // multiplied with average text height, determines the minimum distance of gaps in lines
|
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||||
private static final double Y_GAP_FACTOR = 1;
|
private static final double Y_GAP_FACTOR = 1;
|
||||||
private static final double NEW_LINE_FACTOR = 0.2;
|
private static final double NEW_LINE_FACTOR = 0.2;
|
||||||
|
|
||||||
|
|
||||||
public static Gaps findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
public static GapInformation findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||||
|
|
||||||
if (sortedTextPositionSequences.isEmpty()) {
|
if (sortedTextPositionSequences.isEmpty()) {
|
||||||
return new Gaps();
|
return new GapInformation();
|
||||||
}
|
}
|
||||||
//assertAllTextPositionsHaveSameDir(textPositionSequences);
|
|
||||||
|
|
||||||
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
||||||
|
|
||||||
@ -32,30 +33,29 @@ public class GapDetectionService {
|
|||||||
var previousTextPosition = sortedTextPositionSequences.get(0);
|
var previousTextPosition = sortedTextPositionSequences.get(0);
|
||||||
Rectangle2D rectangle = toRectangle2D(previousTextPosition);
|
Rectangle2D rectangle = toRectangle2D(previousTextPosition);
|
||||||
|
|
||||||
yGapContext.addGapFromTopOfMainBody(rectangle);
|
|
||||||
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
|
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
|
||||||
|
|
||||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||||
|
|
||||||
double yGap = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
|
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
|
||||||
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
|
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
|
||||||
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
|
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
|
||||||
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
|
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
|
||||||
|
|
||||||
if (yGap > avgTextPositionHeight * Y_GAP_FACTOR) {
|
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||||
|
yGapContext.addGap(mainBodyTextFrame.getMinX(),
|
||||||
yGapContext.addGap(mainBodyTextFrame.getMinX(), currentTextPositionBBox.getMaxY(), mainBodyTextFrame.getWidth(), yGap);
|
previousTextPositionBBox.getMaxY(),
|
||||||
|
mainBodyTextFrame.getWidth(),
|
||||||
|
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||||
}
|
}
|
||||||
if (yGap > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||||
|
|
||||||
xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
|
xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
|
||||||
xGapContext.gapsInCurrentLine = new LinkedList<>();
|
xGapContext.gapsInCurrentLine = new LinkedList<>();
|
||||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||||
xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
|
xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
|
||||||
|
|
||||||
|
} else if (xGap > avgTextPositionHeight * X_GAP_FACTOR) {
|
||||||
} else if (xGap <= avgTextPositionHeight * X_GAP_FACTOR) {
|
|
||||||
addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
|
addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
|
||||||
}
|
}
|
||||||
previousTextPosition = currentTextPosition;
|
previousTextPosition = currentTextPosition;
|
||||||
@ -63,15 +63,19 @@ public class GapDetectionService {
|
|||||||
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
|
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
|
||||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||||
|
|
||||||
return new Gaps(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
|
return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
||||||
|
|
||||||
return RectangleTransformations.toRectangle2D(textPosition.getRectangle());
|
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
|
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||||
|
}
|
||||||
|
|
||||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||||
|
|
||||||
@ -111,19 +115,9 @@ public class GapDetectionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addGapFromTopOfMainBody(Rectangle2D rectangle) {
|
public void addGap(double x1, double y1, double w, double h) {
|
||||||
|
|
||||||
gapsInCurrentLine.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
gapsInCurrentLine.add(new Rectangle2D.Double(x1, y1, w, h));
|
||||||
rectangle.getMaxY(),
|
|
||||||
mainBodyTextFrame.getWidth(),
|
|
||||||
mainBodyTextFrame.getMaxY() - rectangle.getMaxY()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void addGap(double x, double y, double w, double h) {
|
|
||||||
|
|
||||||
gapsInCurrentLine.add(new Rectangle2D.Double(x, y, w, h));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,13 +15,14 @@ import lombok.RequiredArgsConstructor;
|
|||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class GapFindingColumnDetectionService implements ColumnDetectionService {
|
public class GapsAcrossLinesService {
|
||||||
|
|
||||||
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
|
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
|
||||||
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
|
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
|
||||||
|
private static final double DISTANCE_TO_BORDER_THRESHOLD = 1;
|
||||||
|
|
||||||
|
|
||||||
public List<Rectangle2D> detectColumns(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
|
public List<Rectangle2D> detectXGapsAcrossLines(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
|
||||||
|
|
||||||
if (gapInformation.getXGaps().size() < 2) {
|
if (gapInformation.getXGaps().size() < 2) {
|
||||||
return List.of(mainBodyTextFrame);
|
return List.of(mainBodyTextFrame);
|
||||||
@ -35,40 +36,41 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
|||||||
.orElseThrow();
|
.orElseThrow();
|
||||||
|
|
||||||
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
|
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
|
||||||
gapInformation.getXGaps().get(0).stream().map(Column::new).forEach(columnFactory::addToQueue);
|
gapInformation.getXGaps().get(0).stream().map(GapAcrossLines::new).forEach(columnFactory::addToQueue);
|
||||||
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
|
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
|
||||||
for (var gaps : xGaps.subList(1, xGaps.size())) {
|
for (var gaps : xGaps.subList(1, xGaps.size())) {
|
||||||
|
|
||||||
while (columnFactory.hasColumnsToProcess()) {
|
while (columnFactory.hasGapsToProcess()) {
|
||||||
Column column = columnFactory.getNext();
|
GapAcrossLines gapAcrossLines = columnFactory.getNext();
|
||||||
rememberColumnIfValid(columnFactory, column);
|
rememberColumnIfValid(columnFactory, gapAcrossLines);
|
||||||
elongateColumnsAndFilterForWidth(column, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
|
elongateColumnsAndFilterForWidth(gapAcrossLines, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
|
||||||
}
|
}
|
||||||
columnFactory.addStillInProgressToQueue();
|
columnFactory.addStillInProgressToQueue();
|
||||||
columnFactory.addGapsToQueue(gaps);
|
columnFactory.addGapsToQueue(gaps);
|
||||||
}
|
}
|
||||||
|
|
||||||
return columnFactory.outputColumns.stream()
|
return columnFactory.outputGaps.stream()
|
||||||
.filter(column -> columnFactory.outputColumns.stream().filter(column::intersectsX).noneMatch(column1 -> column1.lineCount > column.lineCount))
|
.filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
|
||||||
.filter(column -> )
|
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||||
.map(Column::getRectangle2D)
|
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||||
|
.map(GapAcrossLines::getRectangle2D)
|
||||||
.toList();
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void rememberColumnIfValid(ColumnFactory columnFactory, Column column) {
|
private static void rememberColumnIfValid(ColumnFactory columnFactory, GapAcrossLines gapAcrossLines) {
|
||||||
|
|
||||||
if (column.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
|
if (gapAcrossLines.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
|
||||||
columnFactory.outputColumns.add(column);
|
columnFactory.outputGaps.add(gapAcrossLines);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Stream<Column> elongateColumnsAndFilterForWidth(Column column, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
|
private static Stream<GapAcrossLines> elongateColumnsAndFilterForWidth(GapAcrossLines gapAcrossLines, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
|
||||||
|
|
||||||
return gaps.stream()//
|
return gaps.stream()//
|
||||||
.filter(gap -> column.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
|
.filter(gap -> gapAcrossLines.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
|
||||||
.map(column::addNewLineAndShrink);
|
.map(gapAcrossLines::addNewLineAndShrink);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -85,13 +87,13 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
|||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
private class Column {
|
private class GapAcrossLines {
|
||||||
|
|
||||||
Rectangle2D rectangle2D;
|
Rectangle2D rectangle2D;
|
||||||
int lineCount = 1;
|
int lineCount = 1;
|
||||||
|
|
||||||
|
|
||||||
public Column(Rectangle2D rectangle2D) {
|
public GapAcrossLines(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
this.rectangle2D = correctRectangle(rectangle2D);
|
this.rectangle2D = correctRectangle(rectangle2D);
|
||||||
}
|
}
|
||||||
@ -103,9 +105,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersectsX(Column column) {
|
public boolean intersectsX(GapAcrossLines gapAcrossLines) {
|
||||||
|
|
||||||
return this.intersectsX(column.getRectangle2D());
|
return this.intersectsX(gapAcrossLines.getRectangle2D());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -120,7 +122,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Column addNewLineAndShrink(Rectangle2D rectangle2D) {
|
public GapAcrossLines addNewLineAndShrink(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
var correctedRectangle = correctRectangle(rectangle2D);
|
var correctedRectangle = correctRectangle(rectangle2D);
|
||||||
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
|
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
|
||||||
@ -129,7 +131,7 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
|||||||
double max_y = this.rectangle2D.getMaxY();
|
double max_y = this.rectangle2D.getMaxY();
|
||||||
double width = max_x - min_x;
|
double width = max_x - min_x;
|
||||||
double height = max_y - min_y;
|
double height = max_y - min_y;
|
||||||
return new Column(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
|
return new GapAcrossLines(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -140,9 +142,9 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
|||||||
final double avgHeight;
|
final double avgHeight;
|
||||||
final int lineCount;
|
final int lineCount;
|
||||||
|
|
||||||
List<Column> outputColumns = new LinkedList<>();
|
List<GapAcrossLines> outputGaps = new LinkedList<>();
|
||||||
Queue<Column> columnQueue = new LinkedList<>();
|
Queue<GapAcrossLines> gapsQueue = new LinkedList<>();
|
||||||
List<Column> columnsToQueue = new LinkedList<>();
|
List<GapAcrossLines> gapsToQueue = new LinkedList<>();
|
||||||
|
|
||||||
|
|
||||||
public static ColumnFactory init(double avgHeight, int lineCount) {
|
public static ColumnFactory init(double avgHeight, int lineCount) {
|
||||||
@ -151,40 +153,40 @@ public class GapFindingColumnDetectionService implements ColumnDetectionService
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Column getNext() {
|
public GapAcrossLines getNext() {
|
||||||
|
|
||||||
return columnQueue.remove();
|
return gapsQueue.remove();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addToQueue(Column column) {
|
public void addToQueue(GapAcrossLines gapAcrossLines) {
|
||||||
|
|
||||||
columnQueue.add(column);
|
gapsQueue.add(gapAcrossLines);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addToQueue(Rectangle2D gap) {
|
public void addToQueue(Rectangle2D gap) {
|
||||||
|
|
||||||
columnQueue.add(new Column(gap));
|
gapsQueue.add(new GapAcrossLines(gap));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean hasColumnsToProcess() {
|
private boolean hasGapsToProcess() {
|
||||||
|
|
||||||
return columnQueue.peek() != null;
|
return gapsQueue.peek() != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void setToStillInProgress(Column column) {
|
public void setToStillInProgress(GapAcrossLines gapAcrossLines) {
|
||||||
|
|
||||||
columnsToQueue.add(column);
|
gapsToQueue.add(gapAcrossLines);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addStillInProgressToQueue() {
|
private void addStillInProgressToQueue() {
|
||||||
|
|
||||||
for (int i = columnsToQueue.size() - 1; i >= 0; i--) {
|
for (int i = gapsToQueue.size() - 1; i >= 0; i--) {
|
||||||
columnQueue.add(columnsToQueue.remove(i));
|
gapsQueue.add(gapsToQueue.remove(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,63 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.awt.geom.RectangularShape;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
public class InvisibleTableDetectionService {
|
public class InvisibleTableDetectionService {
|
||||||
|
|
||||||
|
public List<List<Rectangle2D>> detectTable(List<TextPositionSequence> textPositionSequences, Rectangle2D tableBBox) {
|
||||||
|
|
||||||
|
LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences);
|
||||||
|
GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox);
|
||||||
|
List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox);
|
||||||
|
List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList();
|
||||||
|
int colCount = gapsAcrossLines.size();
|
||||||
|
int rowCount = lineInformation.getLineBBox().size();
|
||||||
|
List<List<Rectangle2D>> cells = new LinkedList<>();
|
||||||
|
List<Rectangle2D> cellsInLine = new LinkedList<>();
|
||||||
|
cells.add(cellsInLine);
|
||||||
|
double x1;
|
||||||
|
double y1;
|
||||||
|
double x2;
|
||||||
|
double y2;
|
||||||
|
for (int col = 0; col < colCount + 1; col++) {
|
||||||
|
for (int row = 0; row < rowCount + 1; row++) {
|
||||||
|
if (col == 0) {
|
||||||
|
x1 = tableBBox.getX();
|
||||||
|
} else {
|
||||||
|
x1 = columnXCoords.get(col - 1);
|
||||||
|
}
|
||||||
|
if (row == 0) {
|
||||||
|
y2 = tableBBox.getMaxY();
|
||||||
|
} else {
|
||||||
|
y2 = lineInformation.getLineBBox().get(row - 1).getY();
|
||||||
|
}
|
||||||
|
if (col == colCount) {
|
||||||
|
x2 = tableBBox.getMaxX();
|
||||||
|
} else {
|
||||||
|
x2 = columnXCoords.get(col);
|
||||||
|
}
|
||||||
|
if (row == rowCount) {
|
||||||
|
y1 = tableBBox.getY();
|
||||||
|
} else {
|
||||||
|
y1 = lineInformation.getLineBBox().get(row).getY();
|
||||||
|
}
|
||||||
|
cellsInLine.add(new Rectangle2D.Double(x1, y1, x2 - x1, y2 - y1));
|
||||||
|
}
|
||||||
|
cellsInLine = new LinkedList<>();
|
||||||
|
cells.add(cellsInLine);
|
||||||
|
}
|
||||||
|
|
||||||
|
return cells;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,14 +1,16 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
@ -17,38 +19,50 @@ public class LineDetectionService {
|
|||||||
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||||
|
|
||||||
|
|
||||||
public static List<List<Rectangle2D>> findTextBlockInLines(List<TextPositionSequence> textPositionSequences) {
|
public LineInformation calculateLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
|
||||||
|
|
||||||
if (textPositionSequences.isEmpty()) {
|
if (sortedTextPositionSequences.isEmpty()) {
|
||||||
return Collections.emptyList();
|
return LineFactory.init().build();
|
||||||
}
|
}
|
||||||
|
|
||||||
final double avgTextPositionHeight = getAvgTextPositionHeight(textPositionSequences);
|
return buildLineInformation(sortedTextPositionSequences);
|
||||||
|
}
|
||||||
|
|
||||||
TextBlockContext context = TextBlockContext.init();
|
|
||||||
|
|
||||||
List<TextPositionSequence> sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList();
|
public List<List<Rectangle2D>> findLinesWithGaps(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||||
|
|
||||||
var previousTextPosition = sortedTextPositionSequence.get(0);
|
return calculateLineInformation(sortedTextPositionSequences).getBBoxWithGapsByLines();
|
||||||
context.textPositionsToMerge.add(previousTextPosition);
|
}
|
||||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) {
|
|
||||||
|
|
||||||
|
public List<List<TextPositionSequence>> orderByLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||||
|
|
||||||
|
return calculateLineInformation(sortedTextPositionSequences).getSequencesByLines();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static LineInformation buildLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
|
||||||
|
|
||||||
|
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
||||||
|
|
||||||
|
LineFactory lineFactory = LineFactory.init();
|
||||||
|
|
||||||
|
var previousTextPosition = sortedTextPositionSequences.get(0);
|
||||||
|
lineFactory.addToCurrentLine(previousTextPosition);
|
||||||
|
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||||
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
|
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
|
||||||
addBlockToLine(context);
|
lineFactory.startNewLine();
|
||||||
startNewLine(currentTextPosition, context);
|
|
||||||
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
|
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
|
||||||
addBlockToLine(context);
|
lineFactory.startNewBlock();
|
||||||
startNewBlock(currentTextPosition, context);
|
|
||||||
} else {
|
|
||||||
context.textPositionsToMerge.add(currentTextPosition);
|
|
||||||
}
|
}
|
||||||
|
lineFactory.addToCurrentLine(currentTextPosition);
|
||||||
previousTextPosition = currentTextPosition;
|
previousTextPosition = currentTextPosition;
|
||||||
}
|
}
|
||||||
addBlockToLine(context);
|
lineFactory.addFinalLine();
|
||||||
return context.textBlocksInLines;
|
return lineFactory.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||||
|
|
||||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||||
@ -73,24 +87,94 @@ public class LineDetectionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void startNewBlock(TextPositionSequence currentTextPosition, TextBlockContext context) {
|
@Getter
|
||||||
|
@AllArgsConstructor
|
||||||
|
private class LineFactory {
|
||||||
|
|
||||||
context.textPositionsToMerge = new LinkedList<>();
|
List<Rectangle2D> lineBBox;
|
||||||
context.textPositionsToMerge.add(currentTextPosition);
|
|
||||||
|
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||||
|
List<Rectangle2D> bBoxWithGapsInCurrentLine;
|
||||||
|
|
||||||
|
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||||
|
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine;
|
||||||
|
|
||||||
|
List<TextPositionSequence> currentSequencesWithoutGaps;
|
||||||
|
|
||||||
|
List<List<TextPositionSequence>> sequencesByLines;
|
||||||
|
List<TextPositionSequence> sequencesInCurrentLine;
|
||||||
|
|
||||||
|
List<List<Rectangle2D>> xGaps;
|
||||||
|
List<List<Rectangle2D>> yGaps;
|
||||||
|
|
||||||
|
|
||||||
|
public static LineFactory init() {
|
||||||
|
|
||||||
|
List<Rectangle2D> lineBBox = new LinkedList<>();
|
||||||
|
|
||||||
|
List<List<Rectangle2D>> bBoxWithGapsByLines = new LinkedList<>();
|
||||||
|
List<Rectangle2D> bBoxWithGapsInCurrentLine = new LinkedList<>();
|
||||||
|
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
|
||||||
|
|
||||||
|
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines = new LinkedList<>();
|
||||||
|
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine = new LinkedList<>();
|
||||||
|
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
|
||||||
|
List<TextPositionSequence> currentSequencesWithoutGaps = new LinkedList<>();
|
||||||
|
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||||
|
|
||||||
|
List<List<TextPositionSequence>> sequencesByLines = new LinkedList<>();
|
||||||
|
List<TextPositionSequence> sequencesInCurrentLine = new LinkedList<>();
|
||||||
|
sequencesByLines.add(sequencesInCurrentLine);
|
||||||
|
|
||||||
|
return new LineFactory(lineBBox,
|
||||||
|
bBoxWithGapsByLines,
|
||||||
|
bBoxWithGapsInCurrentLine,
|
||||||
|
sequencesWithGapsByLines,
|
||||||
|
sequencesWithGapsInCurrentLine,
|
||||||
|
currentSequencesWithoutGaps,
|
||||||
|
sequencesByLines,
|
||||||
|
sequencesInCurrentLine,
|
||||||
|
null,
|
||||||
|
null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void addBlockToLine(TextBlockContext context) {
|
public void addGaps(GapInformation gapInformation) {
|
||||||
|
|
||||||
context.blocksInCurrentLine.add(textPositionBBox(context.textPositionsToMerge));
|
this.xGaps = gapInformation.getXGaps();
|
||||||
|
this.yGaps = gapInformation.getYGaps();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void startNewLine(TextPositionSequence current, TextBlockContext context) {
|
public LineInformation build() {
|
||||||
|
|
||||||
context.blocksInCurrentLine = new LinkedList<>();
|
return new LineInformation(lineBBox, sequencesByLines, bBoxWithGapsByLines, sequencesWithGapsByLines);
|
||||||
startNewBlock(current, context);
|
}
|
||||||
context.textBlocksInLines.add(context.blocksInCurrentLine);
|
|
||||||
|
|
||||||
|
public void startNewBlock() {
|
||||||
|
|
||||||
|
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||||
|
currentSequencesWithoutGaps = new LinkedList<>();
|
||||||
|
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void startNewLine() {
|
||||||
|
|
||||||
|
lineBBox.add(textPositionBBox(sequencesInCurrentLine));
|
||||||
|
|
||||||
|
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||||
|
bBoxWithGapsInCurrentLine = new LinkedList<>();
|
||||||
|
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
|
||||||
|
|
||||||
|
sequencesWithGapsInCurrentLine = new LinkedList<>();
|
||||||
|
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
|
||||||
|
currentSequencesWithoutGaps = new LinkedList<>();
|
||||||
|
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||||
|
|
||||||
|
sequencesInCurrentLine = new LinkedList<>();
|
||||||
|
sequencesByLines.add(sequencesInCurrentLine);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -100,23 +184,19 @@ public class LineDetectionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@AllArgsConstructor
|
public void addToCurrentLine(TextPositionSequence current) {
|
||||||
private class TextBlockContext {
|
|
||||||
|
|
||||||
List<List<Rectangle2D>> textBlocksInLines;
|
sequencesInCurrentLine.add(current);
|
||||||
List<Rectangle2D> blocksInCurrentLine;
|
currentSequencesWithoutGaps.add(current);
|
||||||
List<TextPositionSequence> textPositionsToMerge;
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextBlockContext init() {
|
public void addFinalLine() {
|
||||||
|
|
||||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
lineBBox.add(textPositionBBox(sequencesInCurrentLine));
|
||||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||||
initialLinesWithGaps.add(initialBlocksInLine);
|
|
||||||
return new TextBlockContext(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,25 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
public class MainBodyTextFrameExtractionService {
|
public class MainBodyTextFrameExtractionService {
|
||||||
|
|
||||||
|
private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
|
||||||
|
private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;
|
||||||
|
|
||||||
|
|
||||||
|
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
|
||||||
|
|
||||||
|
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream()
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,2 +1,24 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;public class PageInformationService {
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class PageInformationService {
|
||||||
|
|
||||||
|
public PageInformation build(PageContents pageContents) {
|
||||||
|
|
||||||
|
LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedTextPositionSequences());
|
||||||
|
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation);
|
||||||
|
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame);
|
||||||
|
|
||||||
|
return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,20 +13,19 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.TextPositionsWithPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
|
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class TextPositionSequenceExtractionService {
|
public class TextPositionSequenceSorter {
|
||||||
|
|
||||||
public List<TextPositionsWithPage> getSortedTextPositionsWithPages(String filename) throws IOException {
|
public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
|
||||||
|
|
||||||
List<TextPositionsWithPage> textPositionSequencesPerPage = new LinkedList<>();
|
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
||||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||||
|
|
||||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||||
@ -36,20 +35,21 @@ public class TextPositionSequenceExtractionService {
|
|||||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||||
stripper.setPageNumber(pageNumber);
|
stripper.setPageNumber(pageNumber);
|
||||||
|
stripper.setSortByPosition(true);
|
||||||
stripper.setStartPage(pageNumber);
|
stripper.setStartPage(pageNumber);
|
||||||
stripper.setEndPage(pageNumber);
|
stripper.setEndPage(pageNumber);
|
||||||
stripper.setPdpage(pdPage);
|
stripper.setPdpage(pdPage);
|
||||||
stripper.getText(pdDocument);
|
stripper.getText(pdDocument);
|
||||||
|
|
||||||
// var sortedTextPositionSequences = stripper.getTextPositionSequences();
|
|
||||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||||
.stream()
|
.stream()
|
||||||
.sorted(new TextPositionSequenceComparator())
|
|
||||||
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
||||||
|
|
||||||
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
||||||
|
|
||||||
textPositionSequencesPerPage.add(new TextPositionsWithPage(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox())));
|
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||||
|
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||||
|
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
|
||||||
}
|
}
|
||||||
|
|
||||||
pdDocument.close();
|
pdDocument.close();
|
||||||
|
|||||||
@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.Pd
|
|||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
class ColumnDetectionServiceTest {
|
class GapAcrossLinesDetectionServiceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
|
|||||||
@ -1,23 +1,64 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
class InvisibleTableDetectionServiceTest {
|
class InvisibleTableDetectionServiceTest {
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void detectInvisibleTableTest() {
|
public void detectInvisibleTableTest() {
|
||||||
|
|
||||||
String fileName = "files/test-two-pages_ocred-2.pdf";
|
String fileName = "files/new/test-two-pages_ocred-2.pdf";
|
||||||
|
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||||
|
List<PageInformation> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||||
|
|
||||||
List<PageContents> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
int pageNumber = 1;
|
||||||
|
Rectangle2D tableBBox = pageContents.get(0)
|
||||||
|
.getPageContents()
|
||||||
|
.getSortedTextPositionSequences()
|
||||||
|
.subList(45, 152)
|
||||||
|
.stream()
|
||||||
|
.map(TextPositionSequence::getRectangle)
|
||||||
|
.map(RectangleTransformations::toRectangle2D)
|
||||||
|
.map(this::mirrorY)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
|
||||||
|
.getPageContents()
|
||||||
|
.getSortedTextPositionSequences()
|
||||||
|
.stream()
|
||||||
|
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
|
||||||
|
|
||||||
|
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
|
if (rectangle2D.getHeight() >= 0) {
|
||||||
|
return rectangle2D;
|
||||||
|
}
|
||||||
|
return new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + rectangle2D.getHeight(), rectangle2D.getWidth(), -rectangle2D.getHeight());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -1,7 +1,26 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
class MainBodyTextFrameExtractionServiceTest {
|
class MainBodyTextFrameExtractionServiceTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testMainBodyDetection() {
|
||||||
|
|
||||||
|
String fileName = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
|
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
||||||
|
List<PageContents> sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -1,49 +1,62 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.MainBodyTextFrameExtractionService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
class GapDetectionServiceTest {
|
class PageInformationServiceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testGapDetection() {
|
public void testGapDetection() {
|
||||||
|
|
||||||
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
String filename = "files/new/test-two-pages_ocred-2.pdf";
|
||||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||||
System.out.println("start TextPosition extraction");
|
System.out.println("start TextPosition extraction");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
|
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||||
List<GapInformation> gapInformationInLinesPerPage = new LinkedList<>();
|
|
||||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||||
System.out.println("start gap detection");
|
System.out.println("start gap detection");
|
||||||
start = System.currentTimeMillis();
|
start = System.currentTimeMillis();
|
||||||
for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
|
|
||||||
// List<List<TextPositionSequence>> lines = LineDetectionService.orderByLine(textPositionsWithPage.getSortedTextPositionSequences());
|
|
||||||
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(pageContents);
|
|
||||||
gapInformationInLinesPerPage.add(GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame));
|
|
||||||
}
|
|
||||||
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
||||||
System.out.println("start draw rectangles");
|
System.out.println("start draw rectangles");
|
||||||
start = System.currentTimeMillis();
|
start = System.currentTimeMillis();
|
||||||
PdfDraw.drawRectanglesAndLinesPerPage(filename,
|
PdfDraw.drawRectanglesAndLinesPerPage(filename,
|
||||||
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||||
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), tmpFileName);
|
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||||
|
tmpFileName);
|
||||||
|
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
|
@SneakyThrows
|
||||||
|
public void testLineDetection() {
|
||||||
|
|
||||||
|
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
|
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||||
|
System.out.println("start TextPosition extraction");
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||||
|
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||||
|
System.out.println("start gap detection");
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
||||||
|
System.out.println("start draw rectangles");
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
PdfDraw.drawRectanglesPerPageNumberedByLine(filename,
|
||||||
|
pageInformations.stream().map(PageInformation::getLineInformation).map(gaps -> gaps.getBBoxWithGapsByLines().stream().toList()).toList(),
|
||||||
|
tmpFileName);
|
||||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user