RED-8670: add table detection from idp result

* some 'slight' refactoring
This commit is contained in:
Kilian Schuettler 2025-01-09 11:28:08 +01:00
parent 3a700aecd4
commit 36c7bdd317
15 changed files with 979 additions and 422 deletions

View File

@ -23,7 +23,7 @@ dependencies {
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")
api("com.knecon.fforesight:azure-ocr-service-api:0.23.0")
api("com.knecon.fforesight:azure-ocr-service-api:0.25.0")
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")

View File

@ -296,7 +296,7 @@ public class LayoutParsingPipeline {
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber, null);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<TablePageBlock> tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType, classificationDocument.getLayoutDebugLayer());
List<TablePageBlock> tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType);
List<ClassifiedImage> graphics = graphicExtractorService.extractPathElementGraphics(pageContents.getGraphicBBoxes(), pageNumber, cleanRulings);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()).addAll(graphics);

View File

@ -0,0 +1,334 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Getter
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LinkedQuadPointCell {
public static final int MAX_NEIGHBOUR_DISTANCE = 2;
public static final int MAX_ANGLE_DIFFERENCE = 5;
public static final double LINE_INTERSECT_THRESHOLD = 2;
final QuadPoint quadPoint;
final List<AbstractPageBlock> pageBlocks;
final List<LinkedQuadPointCell> rights = new ArrayList<>();
final List<LinkedQuadPointCell> lefts = new ArrayList<>();
final List<LinkedQuadPointCell> aboves = new ArrayList<>();
final List<LinkedQuadPointCell> belows = new ArrayList<>();
@Setter
boolean headerCell;
public LinkedQuadPointCell(QuadPoint quadPoint, List<AbstractPageBlock> pageBlocks) {
this.quadPoint = quadPoint;
this.pageBlocks = pageBlocks;
headerCell = false;
}
public boolean contains(Word word) {
return quadPoint.contains(word.getBBox().getCenterX(), word.getBBox().getCenterY());
}
/**
 * Renders this cell as the string forms of its page blocks, one block per line.
 *
 * @return the {@code toString()} output of every page block, joined with {@code "\n"}
 */
@Override
public String toString() {
    return getPageBlocks().stream()
            .map(AbstractPageBlock::toString)
            .collect(Collectors.joining("\n"));
}
public void addToNeighbours(LinkedQuadPointCell otherCell, double minWidth, double minHeight) {
if (rightNeighbour(otherCell, minHeight)) {
rights.add(otherCell);
}
if (leftNeighbour(otherCell, minHeight)) {
lefts.add(otherCell);
}
if (aboveNeighbour(otherCell, minWidth)) {
aboves.add(otherCell);
}
if (belowNeighbour(otherCell, minWidth)) {
belows.add(otherCell);
}
}
/**
 * Checks whether {@code other} qualifies as a left neighbour of this cell.
 * This cell's left line and the other cell's right line must overlap in y by at least
 * {@code minHeight} and must pass {@link #areLinesSimilar(Line2D, Line2D)}.
 *
 * @param other     the candidate neighbour
 * @param minHeight minimum required overlap along the y axis
 * @return {@code true} if both conditions hold
 */
public boolean leftNeighbour(LinkedQuadPointCell other, double minHeight) {
    Line2D ownLeftEdge = this.quadPoint.getLeftLine();
    Line2D otherRightEdge = other.quadPoint.getRightLine();
    if (!isYIntersectionSignificant(ownLeftEdge, otherRightEdge, minHeight)) {
        return false;
    }
    return areLinesSimilar(ownLeftEdge, otherRightEdge);
}
/**
 * Checks whether {@code other} qualifies as a right neighbour of this cell.
 * The other cell's left line and this cell's right line must overlap in y by at least
 * {@code minHeight} and must pass {@link #areLinesSimilar(Line2D, Line2D)}.
 *
 * @param other     the candidate neighbour
 * @param minHeight minimum required overlap along the y axis
 * @return {@code true} if both conditions hold
 */
public boolean rightNeighbour(LinkedQuadPointCell other, double minHeight) {
    Line2D otherLeftEdge = other.quadPoint.getLeftLine();
    Line2D ownRightEdge = this.quadPoint.getRightLine();
    if (!isYIntersectionSignificant(otherLeftEdge, ownRightEdge, minHeight)) {
        return false;
    }
    return areLinesSimilar(otherLeftEdge, ownRightEdge);
}
/**
 * Checks whether {@code other} qualifies for this cell's "above" neighbour list.
 * The other cell's top line and this cell's bottom line (as named by {@code QuadPoint}) must overlap
 * in x by at least {@code minWidth} and must pass {@link #areLinesSimilar(Line2D, Line2D)}.
 *
 * @param other    the candidate neighbour
 * @param minWidth minimum required overlap along the x axis
 * @return {@code true} if both conditions hold
 */
public boolean aboveNeighbour(LinkedQuadPointCell other, double minWidth) {
    Line2D otherTopEdge = other.quadPoint.getTopLine();
    Line2D ownBottomEdge = this.quadPoint.getBottomLine();
    if (!isXIntersectionSignificant(otherTopEdge, ownBottomEdge, minWidth)) {
        return false;
    }
    return areLinesSimilar(otherTopEdge, ownBottomEdge);
}
/**
 * Checks whether {@code other} qualifies for this cell's "below" neighbour list.
 * This cell's top line and the other cell's bottom line (as named by {@code QuadPoint}) must overlap
 * in x by at least {@code minWidth} and must pass {@link #areLinesSimilar(Line2D, Line2D)}.
 *
 * @param other    the candidate neighbour
 * @param minWidth minimum required overlap along the x axis
 * @return {@code true} if both conditions hold
 */
public boolean belowNeighbour(LinkedQuadPointCell other, double minWidth) {
    Line2D ownTopEdge = this.quadPoint.getTopLine();
    Line2D otherBottomEdge = other.quadPoint.getBottomLine();
    if (!isXIntersectionSignificant(ownTopEdge, otherBottomEdge, minWidth)) {
        return false;
    }
    return areLinesSimilar(ownTopEdge, otherBottomEdge);
}
/**
 * Decides whether two line segments point in nearly the same direction and lie close together.
 * <p>
 * The directions (from P1 to P2) must differ by less than {@code MAX_ANGLE_DIFFERENCE} degrees, and the
 * smallest distance from any endpoint of one segment to the other segment must be below
 * {@code MAX_NEIGHBOUR_DISTANCE}.
 *
 * @param line1 first segment
 * @param line2 second segment
 * @return {@code true} if both the angle and the distance criterion are met
 */
public static boolean areLinesSimilar(Line2D line1, Line2D line2) {
    double firstDirection = Math.atan2(line1.getY2() - line1.getY1(), line1.getX2() - line1.getX1());
    double secondDirection = Math.atan2(line2.getY2() - line2.getY1(), line2.getX2() - line2.getX1());

    // Fold the difference into [0, 180] so that e.g. 359 degrees counts as 1 degree.
    double rawDegrees = Math.toDegrees(Math.abs(firstDirection - secondDirection));
    double foldedDegrees = Math.min(rawDegrees, 360 - rawDegrees);
    if (foldedDegrees >= MAX_ANGLE_DIFFERENCE) {
        return false;
    }

    // Smallest endpoint-to-segment distance over all four endpoint/segment pairings.
    double closestGap = line1.ptSegDist(line2.getP1());
    closestGap = Math.min(closestGap, line1.ptSegDist(line2.getP2()));
    closestGap = Math.min(closestGap, line2.ptSegDist(line1.getP1()));
    closestGap = Math.min(closestGap, line2.ptSegDist(line1.getP2()));
    return closestGap < MAX_NEIGHBOUR_DISTANCE;
}
/**
 * Tests whether the x-ranges covered by two segments overlap by at least {@code minWidth}.
 * The direction of each segment is irrelevant; only its min/max x is used.
 *
 * @param line1    first segment
 * @param line2    second segment
 * @param minWidth minimum required overlap length
 * @return {@code true} if the overlap of the two x-ranges is at least {@code minWidth}
 */
public static boolean isXIntersectionSignificant(Line2D line1, Line2D line2, double minWidth) {
    double overlapStart = Math.max(Math.min(line1.getX1(), line1.getX2()), Math.min(line2.getX1(), line2.getX2()));
    double overlapEnd = Math.min(Math.max(line1.getX1(), line1.getX2()), Math.max(line2.getX1(), line2.getX2()));
    // Negative when the ranges are disjoint.
    double overlap = overlapEnd - overlapStart;
    return overlap >= minWidth;
}
/**
 * Tests whether the y-ranges covered by two segments overlap by at least {@code minHeight}.
 * The direction of each segment is irrelevant; only its min/max y is used.
 *
 * @param line1     first segment
 * @param line2     second segment
 * @param minHeight minimum required overlap length
 * @return {@code true} if the overlap of the two y-ranges is at least {@code minHeight}
 */
public static boolean isYIntersectionSignificant(Line2D line1, Line2D line2, double minHeight) {
    double overlapStart = Math.max(Math.min(line1.getY1(), line1.getY2()), Math.min(line2.getY1(), line2.getY2()));
    double overlapEnd = Math.min(Math.max(line1.getY1(), line1.getY2()), Math.max(line2.getY1(), line2.getY2()));
    // Negative when the ranges are disjoint.
    double overlap = overlapEnd - overlapStart;
    return overlap >= minHeight;
}
public void resetNeighbours() {
rights.clear();
lefts.clear();
aboves.clear();
belows.clear();
}
public boolean needsSplit() {
return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
}
private LinkedQuadPointCell copyCell(Point2D a, Point2D b, Point2D c, Point2D d) {
var cell = new LinkedQuadPointCell(new QuadPoint(a, b, c, d), pageBlocks);
cell.setHeaderCell(headerCell);
return cell;
}
public boolean isTopLeft() {
return lefts.isEmpty() && aboves.isEmpty();
}
/**
 * Splits this cell along the side that has more than one neighbour.
 * <p>
 * Sides are tried in a fixed order: rights (only if there are at least as many rights as lefts), lefts,
 * aboves (only if there are at least as many aboves as belows), belows. The first side with more than
 * one neighbour decides the split; only that one split is performed per call.
 *
 * @param minWidth  minimum width passed on to the x split
 * @param minHeight minimum height passed on to the y split
 * @return the cells produced by the split, or a single-element list holding this cell if no side has
 * more than one neighbour
 */
public Collection<LinkedQuadPointCell> split(double minWidth, double minHeight) {
    int rightCount = rights.size();
    int leftCount = lefts.size();
    if (rightCount > 1 && rightCount >= leftCount) {
        return splitY(rights, minHeight);
    }
    if (leftCount > 1) {
        return splitY(lefts, minHeight);
    }

    int aboveCount = aboves.size();
    int belowCount = belows.size();
    if (aboveCount > 1 && aboveCount >= belowCount) {
        return splitX(aboves, minWidth);
    }
    if (belowCount > 1) {
        return splitX(belows, minWidth);
    }
    return List.of(this);
}
/**
 * Slices this cell into several cells stacked along the y axis, cutting along the top lines of the given neighbours.
 * Every produced cell is created via {@code copyCell}, so it shares this cell's page blocks and header flag.
 *
 * @param neighbours the cells whose top lines are used as cut lines
 * @param minHeight  cut lines closer than this to this cell's top line, or producing a slice shorter than this, are ignored
 * @return the resulting slices; always contains at least one cell
 */
private List<LinkedQuadPointCell> splitY(List<LinkedQuadPointCell> neighbours, double minHeight) {
List<LinkedQuadPointCell> splitCells = new LinkedList<>();
// Candidate cut lines: each neighbour's top line, ordered by ascending mean y.
List<Line2D> ySplitLines = neighbours.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.map(QuadPoint::getTopLine)
.sorted(Comparator.comparing(line -> (line.getY1() + line.getY2()) / 2))
.toList();
Line2D rightLine = quadPoint.getRightLine();
Line2D leftLine = quadPoint.getLeftLine();
Line2D topLine = quadPoint.getTopLine();
// Running lower edge of the next slice; starts at this cell's lower corners and moves to each accepted cut.
Point2D lowerLeft = quadPoint.getLowerLeft();
Point2D lowerRight = quadPoint.getLowerRight();
Point2D topLeft;
Point2D topRight;
for (Line2D neighborLine : ySplitLines) {
// Skip cut lines whose endpoints are (almost) at the same y as this cell's own top line.
if (Math.abs(neighborLine.getY1() - topLine.getY1()) < minHeight || Math.abs(neighborLine.getY2() - topLine.getY2()) < minHeight) {
continue;
}
// Extend the cut line until it meets this cell's left and right edges.
var topLeftOptional = findIntersectionPoint(leftLine, neighborLine);
var lowerRightOptional = findIntersectionPoint(rightLine, neighborLine);
if (topLeftOptional.isEmpty() || lowerRightOptional.isEmpty()) {
continue;
}
topLeft = topLeftOptional.get();
// Despite its name, lowerRightOptional holds the intersection with the right edge, i.e. the slice's top-right corner.
topRight = lowerRightOptional.get();
// Skip cuts that would produce a slice shorter than minHeight on either side.
if (Math.abs(topLeft.getY() - lowerLeft.getY()) < minHeight || Math.abs(topRight.getY() - lowerRight.getY()) < minHeight) {
continue;
}
LinkedQuadPointCell cell = copyCell(topLeft, lowerLeft, lowerRight, topRight);
splitCells.add(cell);
// The accepted cut becomes the lower edge of the next slice.
lowerLeft = topLeft;
lowerRight = topRight;
}
// Final slice: from the last accepted cut (or the original lower edge) up to this cell's top line.
LinkedQuadPointCell cell = copyCell(topLine.getP1(), lowerLeft, lowerRight, topLine.getP2());
splitCells.add(cell);
return splitCells;
}
/**
 * Finds the point where {@code line} meets the infinite extension of {@code lineToExtend}.
 * <p>
 * The intersection must lie within the bounding range of {@code line} (widened by
 * {@code LINE_INTERSECT_THRESHOLD} in every direction), whereas {@code lineToExtend} may be extended as
 * far as needed.
 *
 * @param line         the segment the intersection has to lie on
 * @param lineToExtend the segment treated as an infinite line
 * @return the intersection point, or an empty optional if the lines are parallel or the point falls
 * outside the tolerated range of {@code line}
 */
private Optional<Point2D> findIntersectionPoint(Line2D line, Line2D lineToExtend) {
    double x1 = line.getX1();
    double y1 = line.getY1();
    double x2 = line.getX2();
    double y2 = line.getY2();
    double x3 = lineToExtend.getX1();
    double y3 = lineToExtend.getY1();
    double x4 = lineToExtend.getX2();
    double y4 = lineToExtend.getY2();

    double denom = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4);
    // A zero denominator means the lines are parallel or coincident.
    if (denom == 0) {
        return Optional.empty();
    }

    double crossOfLine = x1 * y2 - y1 * x2;
    double crossOfExtended = x3 * y4 - y3 * x4;
    double intersectX = (crossOfLine * (x3 - x4) - (x1 - x2) * crossOfExtended) / denom;
    double intersectY = (crossOfLine * (y3 - y4) - (y1 - y2) * crossOfExtended) / denom;

    // The point has to fall inside the (threshold-widened) bounding range of the segment.
    boolean withinX = intersectX >= Math.min(x1, x2) - LINE_INTERSECT_THRESHOLD
            && intersectX <= Math.max(x1, x2) + LINE_INTERSECT_THRESHOLD;
    boolean withinY = intersectY >= Math.min(y1, y2) - LINE_INTERSECT_THRESHOLD
            && intersectY <= Math.max(y1, y2) + LINE_INTERSECT_THRESHOLD;
    if (withinX && withinY) {
        return Optional.of(new Point2D.Double(intersectX, intersectY));
    }
    return Optional.empty();
}
/**
 * Slices this cell into several cells side by side along the x axis, cutting along the right lines of the given neighbours.
 * Every produced cell is created via {@code copyCell}, so it shares this cell's page blocks and header flag.
 *
 * @param neighbours the cells whose right lines are used as cut lines
 * @param minWidth   cut lines closer than this to this cell's right line, or producing a slice narrower than this, are ignored
 * @return the resulting slices, or a list holding only this cell if there are no neighbours
 */
private List<LinkedQuadPointCell> splitX(List<LinkedQuadPointCell> neighbours, double minWidth) {
// Candidate cut lines: each neighbour's right line, ordered by ascending mean x.
List<Line2D> xSplitLines = neighbours.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.map(QuadPoint::getRightLine)
.sorted(Comparator.comparing(line -> (line.getX1() + line.getX2()) / 2))
.toList();
if (xSplitLines.isEmpty()) {
return List.of(this);
}
List<LinkedQuadPointCell> splitCells = new LinkedList<>();
Line2D topLine = quadPoint.getTopLine();
Line2D bottomLine = quadPoint.getBottomLine();
Line2D rightLine = quadPoint.getRightLine();
// Running left edge of the next slice; starts at this cell's left corners and moves to each accepted cut.
Point2D topLeft = quadPoint.getTopLeft();
Point2D lowerLeft = quadPoint.getLowerLeft();
Point2D topRight;
Point2D lowerRight;
for (Line2D neighborLine : xSplitLines) {
// Skip cut lines whose endpoints are (almost) at the same x as this cell's own right line.
if (Math.abs(rightLine.getX1() - neighborLine.getX1()) < minWidth || Math.abs(rightLine.getX2() - neighborLine.getX2()) < minWidth) {
continue;
}
// Extend the cut line until it meets this cell's top and bottom edges.
var topRightOptional = findIntersectionPoint(topLine, neighborLine);
var lowerRightOptional = findIntersectionPoint(bottomLine, neighborLine);
if (topRightOptional.isEmpty() || lowerRightOptional.isEmpty()) {
continue;
}
topRight = topRightOptional.get();
lowerRight = lowerRightOptional.get();
// Skip cuts that would produce a slice narrower than minWidth on either side.
if (Math.abs(topRight.getX() - topLeft.getX()) < minWidth || Math.abs(lowerRight.getX() - lowerLeft.getX()) < minWidth) {
continue;
}
LinkedQuadPointCell cell = copyCell(lowerLeft, topLeft, topRight, lowerRight);
// The accepted cut becomes the left edge of the next slice.
topLeft = topRight;
lowerLeft = lowerRight;
splitCells.add(cell);
}
// Final slice: from the last accepted cut (or the original left edge) to this cell's right line.
LinkedQuadPointCell cell = copyCell(lowerLeft, topLeft, rightLine.getP1(), rightLine.getP2());
splitCells.add(cell);
return splitCells;
}
}

View File

@ -0,0 +1,298 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.stream.Stream;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPointData;
public final class QuadPoint {
/*
B _____ C
| |
A|_____|D
*/
private final Point2D a;
private final Point2D b;
private final Point2D c;
private final Point2D d;
private Line2D left;
private Line2D right;
private Line2D top;
private Line2D bottom;
// This constructor assumes, the points form a convex polygon, I will omit the assertion for performance reasons.
public QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
List<Point2D> points = new ArrayList<>(4);
points.add(a);
points.add(b);
points.add(c);
points.add(d);
points.sort(Comparator.comparingDouble(Point2D::getX).thenComparing(Point2D::getY));
if (points.get(0).getY() >= points.get(1).getY()) {
this.a = points.get(0);
this.b = points.get(1);
} else {
this.a = points.get(1);
this.b = points.get(0);
}
if (points.get(2).getY() < points.get(3).getY()) {
this.c = points.get(2);
this.d = points.get(3);
} else {
this.c = points.get(3);
this.d = points.get(2);
}
}
/**
 * Creates a quad whose corners are the four corners of the given rectangle.
 *
 * @param rectangle2D the rectangle to convert
 * @return a quad built from (x, y), (x, maxY), (maxX, maxY) and (maxX, y)
 */
public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {
    double minX = rectangle2D.getX();
    double minY = rectangle2D.getY();
    double maxX = rectangle2D.getMaxX();
    double maxY = rectangle2D.getMaxY();
    return new QuadPoint(new Point2D.Double(minX, minY),
            new Point2D.Double(minX, maxY),
            new Point2D.Double(maxX, maxY),
            new Point2D.Double(maxX, minY));
}
/**
 * Computes the axis-aligned bounding box of the four corner points.
 *
 * @return the smallest axis-aligned rectangle containing all four corners
 */
public Rectangle2D getBounds2D() {
    double minX = Math.min(Math.min(a.getX(), b.getX()), Math.min(c.getX(), d.getX()));
    double minY = Math.min(Math.min(a.getY(), b.getY()), Math.min(c.getY(), d.getY()));
    double maxX = Math.max(Math.max(a.getX(), b.getX()), Math.max(c.getX(), d.getX()));
    double maxY = Math.max(Math.max(a.getY(), b.getY()), Math.max(c.getY(), d.getY()));
    double width = maxX - minX;
    double height = maxY - minY;
    return new Rectangle2D.Double(minX, minY, width, height);
}
/**
 * Creates a quad from a {@code QuadPointData}, reading its first eight values as four consecutive
 * (x, y) pairs.
 *
 * @param data the source data; {@code data.values()} must provide at least eight entries
 * @return the quad built from the four points
 */
public static QuadPoint fromData(QuadPointData data) {
    var values = data.values();
    Point2D first = new Point2D.Double(values[0], values[1]);
    Point2D second = new Point2D.Double(values[2], values[3]);
    Point2D third = new Point2D.Double(values[4], values[5]);
    Point2D fourth = new Point2D.Double(values[6], values[7]);
    return new QuadPoint(first, second, third, fourth);
}
public Stream<Line2D> asLines() {
return Stream.of(new Line2D.Double(a(), b()), new Line2D.Double(b(), c()), new Line2D.Double(c(), d()), new Line2D.Double(d(), a()));
}
public QuadPoint getTransformed(AffineTransform at) {
return new QuadPoint(at.transform(a, null), at.transform(b, null), at.transform(c, null), at.transform(d, null));
}
/**
 * Tests whether the point (x, y) lies inside this quad.
 * The quad is split into the triangles (a, b, c) and (a, c, d); the point is contained if either triangle contains it.
 * This relies on the quad being convex and its corners being stored in the order established by the constructor.
 *
 * @param x x coordinate of the point to test
 * @param y y coordinate of the point to test
 * @return {@code true} if one of the two triangles contains the point
 */
public boolean contains(double x, double y) {
return triangleContains(a, b, c, x, y) || triangleContains(a, c, d, x, y);
}
/**
 * Checks whether the triangle (a, b, c) contains the point (x, y).
 * <p>
 * The point is converted to barycentric coordinates using Cramer's rule; it is inside the triangle if
 * both computed weights are non-negative and their sum does not exceed one.
 * See https://en.wikipedia.org/wiki/Barycentric_coordinate_system#Barycentric_coordinates_on_triangles
 */
private boolean triangleContains(Point2D a, Point2D b, Point2D c, double x, double y) {
    double bcY = b.getY() - c.getY();
    double cbX = c.getX() - b.getX();
    double acX = a.getX() - c.getX();
    double caY = c.getY() - a.getY();
    double offsetX = x - c.getX();
    double offsetY = y - c.getY();

    // Determinant of the system (twice the signed area of the triangle).
    double denominator = (bcY * acX + cbX * (a.getY() - c.getY()));
    double invertedDenominator = 1.0 / denominator;

    double alpha = (bcY * offsetX + cbX * offsetY) * invertedDenominator;
    double beta = (caY * offsetX + acX * offsetY) * invertedDenominator;
    return alpha >= 0 && beta >= 0 && alpha + beta <= 1;
}
public boolean contains(Point2D p) {
return contains(p.getX(), p.getY());
}
public boolean contains(Rectangle2D r) {
double x = r.getX();
double y = r.getY();
double maxY = r.getMaxY();
double maxX = r.getMaxX();
Point2D p1 = new Point2D.Double(x, y);
Point2D p2 = new Point2D.Double(x, maxY);
Point2D p3 = new Point2D.Double(maxX, maxY);
Point2D p4 = new Point2D.Double(maxX, y);
return contains(p1) && contains(p2) && contains(p3) && contains(p4);
}
public double getCenterX() {
return (a.getX() + b.getX() + c.getX() + d.getX()) / 4;
}
public double getCenterY() {
return (a.getY() + b.getY() + c.getY() + d.getY()) / 4;
}
public Point2D getCenter() {
return new Point2D.Double(getCenterX(), getCenterY());
}
public boolean intersects(Line2D line) {
return contains(line.getP1()) || contains(line.getP2()) || asLines().anyMatch(qLine -> qLine.intersectsLine(line));
}
public Line2D getRightLine() {
if (right == null) {
right = new Line2D.Double(getLowerRight(), getTopRight());
}
return right;
}
public Line2D getLeftLine() {
if (left == null) {
left = new Line2D.Double(getLowerLeft(), getTopLeft());
}
return left;
}
public Line2D getBottomLine() {
if (bottom == null) {
bottom = new Line2D.Double(getLowerLeft(), getLowerRight());
}
return bottom;
}
public Line2D getTopLine() {
if (top == null) {
top = new Line2D.Double(getTopLeft(), getTopRight());
}
return top;
}
/**
 * Returns corner {@code a}, which this class exposes as the top-left corner.
 * NOTE(review): the diagram at the top of this class draws A at the lower-left and B at the upper-left,
 * whereas this accessor returns a as "top-left" — confirm which y-axis orientation the naming assumes.
 */
public Point2D getTopLeft() {
return a;
}
public Point2D getTopRight() {
return d;
}
public Point2D getLowerRight() {
return c;
}
public Point2D getLowerLeft() {
return b;
}
@Override
public String toString() {
return String.format("A:(%.2f, %.2f) | B:(%.2f, %.2f) | C:(%.2f, %.2f) | D:(%.2f, %.2f)",
a().getX(),
a().getY(),
b().getX(),
b().getY(),
c().getX(),
c().getY(),
d().getX(),
d().getY());
}
public double getAngle() {
return calculateAngle(a, d);
}
private static double calculateAngle(Point2D a, Point2D d) {
double deltaY = d.getY() - a.getY();
double deltaX = d.getX() - a.getX();
return Math.atan2(deltaY, deltaX);
}
public Point2D a() {return a;}
public Point2D b() {return b;}
public Point2D c() {return c;}
public Point2D d() {return d;}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (obj == null || obj.getClass() != this.getClass()) {
return false;
}
var that = (QuadPoint) obj;
return Objects.equals(this.a, that.a) && Objects.equals(this.b, that.b) && Objects.equals(this.c, that.c) && Objects.equals(this.d, that.d);
}
@Override
public int hashCode() {
return Objects.hash(a, b, c, d);
}
}

View File

@ -8,6 +8,8 @@ import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
@ -29,7 +31,9 @@ import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageContentExtractor {
@ -76,10 +80,26 @@ public class PageContentExtractor {
/**
 * Starts one extraction thread per page-number batch and returns without waiting for them.
 * An additional thread is started whose only job is to wait for all extraction threads via
 * {@code awaitFinished}.
 */
@SneakyThrows
public void startAsync() {
    int batchCount = pageNumberBatches.size();
    List<Thread> extractionThreads = new ArrayList<>(batchCount);
    for (List<Integer> pageNumberBatch : pageNumberBatches) {
        Thread worker = new Thread(() -> extractPages(pageNumberBatch));
        extractionThreads.add(worker);
        worker.start();
    }
    new Thread(() -> awaitFinished(extractionThreads)).start();
}
/**
 * Blocks until every given extraction thread has terminated, then logs completion.
 * <p>
 * If the waiting thread is interrupted, the interrupt status is restored and a warning is logged
 * instead of letting the {@link InterruptedException} escape as an uncaught exception.
 *
 * @param extractionThreads the threads to wait for
 */
private static void awaitFinished(List<Thread> extractionThreads) {
    try {
        for (Thread extractionThread : extractionThreads) {
            extractionThread.join();
        }
    } catch (InterruptedException e) {
        // Restore the flag so code further up the stack can still observe the interruption.
        Thread.currentThread().interrupt();
        log.warn("Interrupted while waiting for page content extraction threads to finish!", e);
        return;
    }
    log.info("Page content extraction threads finished!");
}
@ -150,10 +170,12 @@ public class PageContentExtractor {
}
public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException {
public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException, TimeoutException {
finishedLookup[pageNumber - 1].await();
return pageContents[pageNumber - 1];
if (finishedLookup[pageNumber - 1].await(1, TimeUnit.MINUTES)) {
return pageContents[pageNumber - 1];
}
throw new TimeoutException("A timeout has occurred during page content extraction!");
}

View File

@ -29,7 +29,7 @@ public class AreaSweepGridifier {
*
* @return TablePageBlock Structure as a rows of cells matrix
*/
public List<List<Cell>> gridify(Set<Cell> cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) {
public List<List<Cell>> gridify(Collection<Cell> cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) {
if (cells.isEmpty()) {
return new ArrayList<>();

View File

@ -0,0 +1,257 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQuadPointCell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class QuadPointGridifier {
public static final int MAX_SPLITTING_ITERATIONS = 10;
Set<LinkedQuadPointCell> cells;
AffineTransform pageToPdfTransform;
AffineTransform pdfToPageTransform;
double minCellHeight;
double minCellWidth;
@SneakyThrows
QuadPointGridifier(Collection<LinkedQuadPointCell> cells, AffineTransform pdfToPageTransform) {
this.cells = new HashSet<>(cells);
this.pageToPdfTransform = pdfToPageTransform.createInverse();
this.pdfToPageTransform = pdfToPageTransform;
this.minCellHeight = cells.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.flatMap(this::verticalLines)
.mapToDouble(QuadPointGridifier::length)
.min().orElse(0) * 0.75;
this.minCellWidth = cells.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.flatMap(this::horizontalLines)
.mapToDouble(QuadPointGridifier::length)
.min().orElse(0) * 0.75;
}
public Stream<Line2D> horizontalLines(QuadPoint quadPoint) {
return Stream.of(quadPoint.getBottomLine(), quadPoint.getTopLine());
}
public Stream<Line2D> verticalLines(QuadPoint quadPoint) {
return Stream.of(quadPoint.getRightLine(), quadPoint.getLeftLine());
}
/**
 * Computes the Euclidean length of a line segment.
 *
 * @param line the segment to measure
 * @return the distance between the segment's two endpoints
 */
public static double length(Line2D line) {
    double deltaX = line.getX2() - line.getX1();
    double deltaY = line.getY2() - line.getY1();
    return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
}
/**
 * Calculates the grid structure of the table. For spanning rows and columns, multiple cells with the
 * same values are inserted.
 * <p>
 * As long as any cell has more than one neighbour in some direction, such cells are split according to
 * their neighbours and all neighbour links are recomputed. This is repeated until no cell needs a split
 * or {@code MAX_SPLITTING_ITERATIONS} rounds have been performed. The rows are then derived from the
 * linked neighbour structure, starting at the top-left cell.
 *
 * @return the table structure as a matrix of rows of cells
 */
public List<List<Cell>> gridify() {

    List<LinkedQuadPointCell> current = new ArrayList<>(cells);
    computeNeighbours(current);

    for (int round = 0; round < MAX_SPLITTING_ITERATIONS; round++) {
        if (current.stream().noneMatch(LinkedQuadPointCell::needsSplit)) {
            break;
        }
        List<LinkedQuadPointCell> next = new LinkedList<>();
        for (LinkedQuadPointCell candidate : current) {
            if (!candidate.needsSplit()) {
                next.add(candidate);
                continue;
            }
            next.addAll(candidate.split(minCellWidth, minCellHeight));
        }
        computeNeighbours(next);
        current = next;
    }

    return buildStructure(current);
}
/**
 * Turns the linked cells into a matrix of {@code Cell}s.
 * If the rows built from the neighbour links are not rectangular, the result is instead computed by {@code AreaSweepGridifier} from the original (unsplit) cells of this gridifier.
 * Otherwise rows and columns in which no cell has text blocks are removed.
 *
 * @param cells the linked cells after splitting
 * @return the rows of cells; an empty list if {@code cells} is empty
 */
private List<List<Cell>> buildStructure(List<LinkedQuadPointCell> cells) {
if (cells.isEmpty()) {
return Collections.emptyList();
}
List<List<LinkedQuadPointCell>> rows = buildRows(cells);
List<List<Cell>> cellRows = mapToCells(rows);
if (isNotRectangular(rows)) {
// The page number is taken from any word of any page block; 0 is logged if there are no words.
log.error("Non rectangular table on page {}",
cells.stream()
.map(LinkedQuadPointCell::getPageBlocks)
.flatMap(List::stream)
.map(AbstractPageBlock::getWords)
.flatMap(Collection::stream)
.map(Word::getPage)
.findAny().orElse(0));
// Sometimes this algorithm fails to produce a rectangular table. This happens when the lines are so tilted that a cell is eventually skipped for being too small, leading to non-rectangular rows.
// In that case the area sweep algorithm is used as a fallback. Note that it runs on this.cells (the original input), not on the split cells passed in.
return AreaSweepGridifier.gridify(this.cells.stream()
.map(this::toCell)
.toList(), pageToPdfTransform, minCellWidth, minCellHeight);
}
cellRows = removeEmptyRows(cellRows);
cellRows = removeEmptyCols(cellRows);
return cellRows;
}
private List<List<Cell>> mapToCells(List<List<LinkedQuadPointCell>> rows) {
return rows.stream()
.map(row -> row.stream()
.map(this::toCell)
.toList())
.toList();
}
private Cell toCell(LinkedQuadPointCell qpCell) {
Cell cell = Cell.fromPageCoordinates(qpCell.getQuadPoint().getBounds2D(), pageToPdfTransform);
cell.setTextBlocks(qpCell.getPageBlocks());
cell.setHeaderCell(qpCell.isHeaderCell());
return cell;
}
/**
 * Reports whether the rows fail to form a rectangular matrix.
 *
 * @param rows the rows to check
 * @return {@code true} if there are no rows at all or if any row's size differs from the first row's
 */
private boolean isNotRectangular(List<List<LinkedQuadPointCell>> rows) {
    if (rows.isEmpty()) {
        return true;
    }
    int expectedWidth = rows.get(0).size();
    for (List<LinkedQuadPointCell> row : rows) {
        if (row.size() != expectedWidth) {
            return true;
        }
    }
    return false;
}
/**
 * Builds the rows of the table by walking the neighbour links.
 * <p>
 * Starts at the top-left cell (a cell without left and above neighbours), builds its row, then follows
 * the first "below" link of each row's first cell until there is none left.
 * <p>
 * If no top-left cell exists, an empty list is returned instead of failing with an
 * {@link IndexOutOfBoundsException}; {@code isNotRectangular} treats an empty row list as
 * non-rectangular, so {@code buildStructure} then uses its area-sweep fallback. If several candidates
 * exist, the first one is used and an error is logged.
 *
 * @param cells the linked cells with computed neighbours
 * @return the rows in top-to-bottom link order, or an empty list if no top-left cell was found
 */
private List<List<LinkedQuadPointCell>> buildRows(List<LinkedQuadPointCell> cells) {
    List<LinkedQuadPointCell> topLeftCandidates = cells.stream()
            .filter(LinkedQuadPointCell::isTopLeft)
            .toList();

    if (topLeftCandidates.isEmpty()) {
        log.error("No top-left cell found!");
        return new ArrayList<>();
    }
    if (topLeftCandidates.size() > 1) {
        log.error("More than one top-left cell found!");
    }

    var cell = topLeftCandidates.get(0);
    List<List<LinkedQuadPointCell>> rows = new ArrayList<>();
    rows.add(buildRow(cell));
    while (!cell.getBelows().isEmpty()) {
        cell = cell.getBelows().get(0);
        rows.add(buildRow(cell));
    }
    return rows;
}
/**
 * Collects a row by starting at the given cell and repeatedly following the first "right" link.
 *
 * @param cell the first cell of the row
 * @return the cells of the row in link order, beginning with {@code cell}
 */
private static List<LinkedQuadPointCell> buildRow(LinkedQuadPointCell cell) {
    List<LinkedQuadPointCell> row = new ArrayList<>();
    LinkedQuadPointCell current = cell;
    row.add(current);
    for (List<LinkedQuadPointCell> rights = current.getRights(); !rights.isEmpty(); rights = current.getRights()) {
        current = rights.get(0);
        row.add(current);
    }
    return row;
}
/**
 * Recomputes the neighbour links of every cell in the list against all other cells in the list.
 * Each cell's existing links are cleared before its new links are collected.
 *
 * @param cells the cells to link with each other
 */
private void computeNeighbours(List<LinkedQuadPointCell> cells) {
    cells.forEach(cell -> {
        cell.resetNeighbours();
        computeNeighbours(cell, cells);
    });
}

/**
 * Offers every cell of {@code otherCells}, except {@code cell} itself, to {@code cell} as a potential
 * neighbour, in list order.
 *
 * @param cell       the cell whose neighbour lists are filled
 * @param otherCells the candidate neighbours
 */
private void computeNeighbours(LinkedQuadPointCell cell, List<LinkedQuadPointCell> otherCells) {
    otherCells.stream()
            .filter(otherCell -> !cell.equals(otherCell))
            .forEachOrdered(otherCell -> cell.addToNeighbours(otherCell, minCellWidth, minCellHeight));
}
/**
 * Transposes a rectangular table so that rows become columns.
 * <p>
 * The number of columns is taken from the first row; every row must have at least that many entries.
 * An empty table yields an empty result instead of failing on the missing first row.
 *
 * @param table the rows of the table
 * @param <T>   the element type
 * @return a new list of columns, each holding the elements of that column in row order
 */
static <T> List<List<T>> transpose(List<List<T>> table) {
    if (table.isEmpty()) {
        return new ArrayList<>();
    }
    final int columnCount = table.get(0).size();
    List<List<T>> columns = new ArrayList<>(columnCount);
    for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) {
        List<T> column = new ArrayList<>(table.size());
        for (List<T> row : table) {
            column.add(row.get(columnIndex));
        }
        columns.add(column);
    }
    return columns;
}
/**
 * Removes every column in which no cell has text blocks.
 * Implemented by transposing the table, removing empty rows, and transposing back.
 *
 * @param rowsOfCells the table as rows of cells
 * @return the table without empty columns; the input itself if it has no rows
 */
private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {
    if (rowsOfCells.isEmpty()) {
        return rowsOfCells;
    }
    List<List<Cell>> nonEmptyColumns = removeEmptyRows(transpose(rowsOfCells));
    return transpose(nonEmptyColumns);
}
/**
 * Keeps only the rows that contain at least one cell with text blocks.
 *
 * @param rowsOfCells the table as rows of cells
 * @return a new list holding the non-empty rows in their original order
 */
private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {
    List<List<Cell>> keptRows = new ArrayList<>();
    for (List<Cell> row : rowsOfCells) {
        for (Cell cell : row) {
            if (!cell.getTextBlocks().isEmpty()) {
                keptRows.add(row);
                break;
            }
        }
    }
    return keptRows;
}
}

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
@ -28,6 +27,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQuadPointCell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -37,7 +38,6 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTran
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators;
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
@ -66,13 +66,12 @@ public class TableExtractionService {
List<Word> words,
PageInformation pageInformation,
List<Table> idpTables,
LayoutParsingType layoutParsingType,
LayoutDebugLayer layoutDebugLayer) {
LayoutParsingType layoutParsingType) {
AffineTransform pdfToPageTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation);
List<TablePageBlock> tablePageBlocks;
if (idpTables == null || idpTables.isEmpty()) {
tablePageBlocks = extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType, layoutDebugLayer, pageInformation);
tablePageBlocks = extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType);
} else {
tablePageBlocks = buildTableFromIdpResult(idpTables, words, pdfToPageTransform, layoutParsingType);
}
@ -83,9 +82,7 @@ public class TableExtractionService {
private List<TablePageBlock> extractTables(List<Cell> emptyCells,
List<Word> words,
AffineTransform pdfToPageTransform,
LayoutParsingType layoutParsingType,
LayoutDebugLayer layoutDebugLayer,
PageInformation pageInformation) {
LayoutParsingType layoutParsingType) {
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
emptyCells.sort(CELL_SIZE_COMPARATOR);
@ -111,15 +108,6 @@ public class TableExtractionService {
if (containedCells.isEmpty()) {
continue;
}
// if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf),
// the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column.
// That's why we compute the missing Cells from the spreadsheet area and fill them in.
Set<Cell> missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform);
layoutDebugLayer.addCellVisualizations(missingCells, pageInformation.number(), Color.RED);
layoutDebugLayer.addCellVisualizations(List.of(new Cell(area, pdfToPageTransform)), pageInformation.number(), Color.BLUE);
containedCells.addAll(missingCells);
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
for (Cell cell : containedCells) {
@ -151,14 +139,10 @@ public class TableExtractionService {
}
/**
 * Removes from {@code words} every word that the given table block returns from {@code getWords()}.
 * The table's words are copied into a {@link HashSet} before being passed to {@code removeAll}.
 *
 * @param words          list the table's words are removed from; modified in place
 * @param tablePageBlock table whose words are removed from {@code words}
 */
private static void removeWordsFromCells(List<Word> words, TablePageBlock tablePageBlock) {

    words.removeAll(new HashSet<>(tablePageBlock.getWords()));
}
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables, List<Word> words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) {
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables,
List<Word> words,
AffineTransform pdfToPageTransform,
LayoutParsingType layoutParsingType) {
if (idpTables == null || idpTables.isEmpty()) {
return Collections.emptyList();
@ -171,30 +155,38 @@ public class TableExtractionService {
continue;
}
List<Cell> cells = new ArrayList<>(idpTable.cells().size());
List<LinkedQuadPointCell> qpCells = new ArrayList<>(idpTable.cells().size());
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
for (TableCell idpCell : idpTable.cells()) {
Cell cell = new Cell(idpCell, pdfToPageTransform);
if (idpCell.kind().equals(TableCellType.ROW_HEADER) || idpCell.kind().equals(TableCellType.COLUMN_HEADER)) {
cell.setHeaderCell(true);
}
cells.add(cell);
Function<Point2D, Boolean> contains = p -> idpCell.textRegion().region().bbox().get().contains(p);
Function<Rectangle2D, Boolean> containsRect = r -> idpCell.textRegion().region().bbox().get().contains(r);
BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect);
cell.setTextBlocks(blocksWithTheirWords.blocks);
BlocksWithTheirWords blocksWithTheirWords = sortWordsIntoQuadPoint(words, layoutParsingType, idpCell, tables);
wordsInTable.addAll(blocksWithTheirWords.words());
LinkedQuadPointCell qpCell = new LinkedQuadPointCell(QuadPoint.fromData(idpCell.textRegion().region().bbox()).getTransformed(pdfToPageTransform),
blocksWithTheirWords.blocks);
if (idpCell.kind().equals(TableCellType.ROW_HEADER) || idpCell.kind().equals(TableCellType.COLUMN_HEADER)) {
qpCell.setHeaderCell(true);
}
qpCells.add(qpCell);
}
TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
List<List<Cell>> gridCells = calculator.gridify();
TablePageBlock tablePageBlock = new TablePageBlock(null, gridCells);
QuadPointGridifier calculator = new QuadPointGridifier(qpCells, pdfToPageTransform);
List<List<Cell>> rows = calculator.gridify();
TablePageBlock tablePageBlock = new TablePageBlock(null, rows);
addTableIfValid(words, tablePageBlock, tables, wordsInTable);
}
return tables;
}
/**
 * Builds two containment tests (one for points, one for rectangles) against the shape obtained from
 * {@code idpCell.textRegion().region().bbox().get()} and hands them to {@code sortBlocksIntoCell}.
 * The shape is looked up again on every invocation of a test.
 * NOTE(review): the bbox is used here without applying a transform, while the caller builds its QuadPoint
 * from the same bbox with pdfToPageTransform applied — confirm the words are in the bbox's coordinate space.
 *
 * @param words             candidate words, passed through to {@code sortBlocksIntoCell}
 * @param layoutParsingType parsing type, passed through to {@code sortBlocksIntoCell}
 * @param idpCell           IDP table cell whose text region defines the containment tests
 * @param tables            tables found so far, passed through to {@code sortBlocksIntoCell}
 * @return whatever {@code sortBlocksIntoCell} returns for the given arguments
 */
private BlocksWithTheirWords sortWordsIntoQuadPoint(List<Word> words, LayoutParsingType layoutParsingType, TableCell idpCell, List<TablePageBlock> tables) {

    Function<Point2D, Boolean> cellContainsPoint = point -> {
        return idpCell.textRegion().region().bbox().get().contains(point);
    };
    Function<Rectangle2D, Boolean> cellContainsRect = rect -> {
        return idpCell.textRegion().region().bbox().get().contains(rect);
    };
    return sortBlocksIntoCell(layoutParsingType, words, tables, cellContainsPoint, cellContainsRect);
}
private static void addTableIfValid(List<Word> words, TablePageBlock tablePageBlock, List<TablePageBlock> tables, Set<Word> wordsInTable) {
if (tablePageBlock.getRowCount() > MAX_ROWS_OR_COLS || tablePageBlock.getColCount() == 0 || tablePageBlock.getColCount() > MAX_ROWS_OR_COLS) {

View File

@ -7,6 +7,7 @@ import java.util.LinkedList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
@ -14,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import lombok.Getter;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -26,6 +28,8 @@ public class TableFromCellsExtractor {
@Setter
private final List<Cell> originCells;
private final AffineTransform pdfToPageTransform;
private final double minCellWidth;
private final double minCellHeight;
public TableFromCellsExtractor(List<Cell> originCells, AffineTransform pdfToPageTransform) {
@ -33,15 +37,18 @@ public class TableFromCellsExtractor {
classification = PageBlockType.TABLE;
this.originCells = originCells;
this.pdfToPageTransform = pdfToPageTransform;
this.minCellHeight = originCells.stream()
.mapToDouble(BoundingBox::getHeight).average().orElse(0);
this.minCellWidth = originCells.stream()
.mapToDouble(BoundingBox::getWidth).average().orElse(0);
}
@SneakyThrows
public TablePageBlock extract() {
computeRows(originCells);
rows = AreaSweepGridifier.gridify(originCells, pdfToPageTransform.createInverse(), minCellWidth, minCellHeight);
computeHeaders();
return new TablePageBlock(null, rows);
}
@ -119,15 +126,4 @@ public class TableFromCellsExtractor {
}
/**
 * Assigns {@code rows} the grid produced by a {@link TableGridStructureCalculator} for the given cells.
 * For an empty cell list {@code rows} is left untouched.
 *
 * @param cells cells to arrange into rows
 */
private void computeRows(List<Cell> cells) {

    if (!cells.isEmpty()) {
        rows = new TableGridStructureCalculator(cells, pdfToPageTransform).gridify();
    }
}
}

View File

@ -1,353 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TableGridStructureCalculator {
// Multiplied with the minimum cell height/width to obtain a tolerance: cells may be at most that far apart in one dimension and must overlap by at least that much in the other dimension to be considered neighbours.
private static final double DISTANCE_FACTOR = 0.5;
Set<Cell> cells;
AffineTransform pageToPdfTransform;
double minCellHeight;
double minCellWidth;
@SneakyThrows
TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {
this.cells = new HashSet<>(cells);
this.pageToPdfTransform = pdfToPageTransform.createInverse();
this.minCellHeight = cells.stream()
.mapToDouble(cell -> cell.getBBox().getHeight())
.min().orElse(0);
this.minCellWidth = cells.stream()
.mapToDouble(cell -> cell.getBBox().getWidth())
.min().orElse(0);
}
/**
 * Calculates the grid structure of the table. For spanning rows and columns, multiple cells with the same values are inserted.
 * <p>
 * If the cells overlap significantly, the area sweep implementation is used and empty rows and columns are removed from its result.
 * Otherwise every cell is linked to its neighbours; as long as any cell has more than one neighbour in some direction,
 * such cells are split according to their neighbours and the neighbours are recomputed.
 * Once no cell needs a split, the rows are built from the linked neighbour structure, starting with the top left cell.
 *
 * @return the table structure as a matrix of cells, organised as a list of rows
 */
public List<List<Cell>> gridify() {

    if (cellsHaveLargeOverlaps()) {
        // If cells overlap significantly, the split loop below would keep splitting them infinitely, so the simpler area sweep implementation is used instead.
        List<List<Cell>> sweptRows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
        return removeEmptyCols(removeEmptyRows(sweptRows));
    }

    List<LinkedCell> current = cells.stream()
            .map(cell -> new LinkedCell(cell))
            .collect(Collectors.toList());
    computeNeighbours(current);

    while (current.stream().anyMatch(LinkedCell::needsSplit)) {
        List<LinkedCell> next = new LinkedList<>();
        for (LinkedCell candidate : current) {
            // cells that need no split are carried over unchanged
            next.addAll(candidate.needsSplit() ? candidate.split() : List.of(candidate));
        }
        // the split pieces are new objects, so all neighbour links have to be rebuilt
        computeNeighbours(next);
        current = next;
    }

    return buildStructure(current);
}
/**
 * Checks whether any two distinct cells overlap by more than {@code DISTANCE_FACTOR} times the minimum
 * cell width horizontally and by more than {@code DISTANCE_FACTOR} times the minimum cell height vertically.
 *
 * @return {@code true} if at least one such pair of cells exists
 */
private boolean cellsHaveLargeOverlaps() {

    double widthThreshold = minCellWidth * DISTANCE_FACTOR;
    double heightThreshold = minCellHeight * DISTANCE_FACTOR;

    return cells.stream()
            .anyMatch(first -> cells.stream()
                    .anyMatch(second -> !first.equals(second) //
                            && first.horizontalOverlap(second) > widthThreshold //
                            && first.verticalOverlap(second) > heightThreshold));
}
/**
 * Turns the linked cells into a matrix of rows and strips rows and columns without any text blocks.
 *
 * @param cells linked cells with their neighbours already computed
 * @return an empty list for no cells, otherwise the rows with empty rows and columns removed
 * @throws AssertionError if the built rows do not all have the same number of cells
 */
private List<List<Cell>> buildStructure(List<LinkedCell> cells) {

    if (cells.isEmpty()) {
        return Collections.emptyList();
    }

    List<List<Cell>> grid = buildRows(cells);
    if (isNotRectangular(grid)) {
        throw new AssertionError();
    }

    return removeEmptyCols(removeEmptyRows(grid));
}
/**
 * Checks whether the rows fail to form a rectangular matrix.
 *
 * @param rows rows of cells
 * @return {@code true} if there are no rows at all or if any row's size differs from the first row's size
 */
private boolean isNotRectangular(List<List<Cell>> rows) {

    if (rows.isEmpty()) {
        return true;
    }

    int expectedSize = rows.get(0).size();
    for (List<Cell> row : rows) {
        if (row.size() != expectedSize) {
            return true;
        }
    }
    return false;
}
/**
 * Builds the rows of the table by walking the linked cells: starting at the top left cell, each row is
 * collected by following the first right neighbour, and the next row starts at the first neighbour below.
 *
 * @param cells linked cells with their neighbours already computed
 * @return the rows of cells, top to bottom
 * @throws IllegalStateException if no cell qualifies as top left cell (no left and no above neighbours)
 * @throws AssertionError        if the resulting rows do not all have the same number of cells
 */
private List<List<Cell>> buildRows(List<LinkedCell> cells) {

    List<LinkedCell> topLeftCandidates = cells.stream()
            .filter(LinkedCell::isTopLeft)
            .toList();

    assert topLeftCandidates.size() == 1 : "Expected exactly one top left cell, but found " + topLeftCandidates.size();
    // Assertions are usually disabled at runtime; fail with context instead of an IndexOutOfBoundsException from get(0).
    if (topLeftCandidates.isEmpty()) {
        throw new IllegalStateException("Cannot build table rows: none of the " + cells.size() + " cells is a top left cell");
    }

    LinkedCell rowStart = topLeftCandidates.get(0);
    List<List<Cell>> rows = new ArrayList<>();
    rows.add(buildRow(rowStart));
    while (!rowStart.belows.isEmpty()) {
        rowStart = rowStart.belows.get(0);
        rows.add(buildRow(rowStart));
    }

    if (isNotRectangular(rows)) {
        throw new AssertionError("Table rows are not rectangular, row sizes: " + rows.stream()
                .map(List::size)
                .toList());
    }
    return rows;
}
/**
 * Collects one row of cells by starting at the given cell and repeatedly moving to the first right neighbour
 * until a cell without right neighbours is reached.
 *
 * @param cell leftmost linked cell of the row
 * @return the original cells of the row, left to right
 */
private static List<Cell> buildRow(LinkedCell cell) {

    List<Cell> row = new ArrayList<>();
    for (LinkedCell cursor = cell; ; cursor = cursor.rights.get(0)) {
        row.add(cursor.originalCell);
        if (cursor.rights.isEmpty()) {
            return row;
        }
    }
}
/**
 * Recomputes the neighbour links of every given cell: each cell's links are cleared and then rebuilt
 * against the full list of cells.
 *
 * @param cells linked cells whose neighbour lists are rebuilt in place
 */
private void computeNeighbours(List<LinkedCell> cells) {

    cells.forEach(linkedCell -> {
        linkedCell.resetNeighbours();
        computeNeighbours(linkedCell, cells);
    });
}
/**
 * Adds every neighbouring cell from {@code otherCells} to the matching neighbour list of {@code cell}.
 * <p>
 * A cell is a horizontal neighbour if its horizontal distance is at most {@code DISTANCE_FACTOR} times the minimum
 * cell width and its vertical overlap is at least {@code DISTANCE_FACTOR} times the minimum cell height; it is then
 * added to the rights or lefts depending on the bounding-box centre x coordinates. Otherwise the same test with
 * the dimensions swapped decides whether it is added to the belows or aboves, depending on the centre y coordinates.
 *
 * @param cell       cell whose neighbour lists are filled
 * @param otherCells candidate neighbours; the cell itself is skipped
 */
private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {

    Cell self = cell.originalCell;
    double widthTolerance = minCellWidth * DISTANCE_FACTOR;
    double heightTolerance = minCellHeight * DISTANCE_FACTOR;

    for (LinkedCell candidate : otherCells) {
        if (cell.equals(candidate)) {
            continue;
        }
        Cell other = candidate.originalCell;

        boolean sideBySide = self.horizontalDistance(other) <= widthTolerance && self.verticalOverlap(other) >= heightTolerance;
        if (sideBySide) {
            boolean otherIsRight = self.getBBox().getCenterX() <= other.getBBox().getCenterX();
            (otherIsRight ? cell.rights : cell.lefts).add(candidate);
            continue;
        }

        boolean stacked = self.verticalDistance(other) <= heightTolerance && self.horizontalOverlap(other) >= widthTolerance;
        if (stacked) {
            boolean otherIsBelow = self.getBBox().getCenterY() <= other.getBBox().getCenterY();
            (otherIsBelow ? cell.belows : cell.aboves).add(candidate);
        }
    }
}
/**
 * Transposes a rectangular table, i.e. turns its rows into columns.
 * The number of columns is taken from the first row; every row must have at least that many entries.
 *
 * @param table rows of the table
 * @param <T>   element type
 * @return a new list holding the columns of {@code table}; a new empty list if {@code table} is empty
 * @throws IndexOutOfBoundsException if a row is shorter than the first row
 */
static <T> List<List<T>> transpose(List<List<T>> table) {

    // Without rows there is no first row to read the column count from.
    if (table.isEmpty()) {
        return new ArrayList<>();
    }

    final int columnCount = table.get(0).size();
    List<List<T>> columns = new ArrayList<>(columnCount);
    for (int i = 0; i < columnCount; i++) {
        List<T> column = new ArrayList<>(table.size());
        for (List<T> row : table) {
            column.add(row.get(i));
        }
        columns.add(column);
    }
    return columns;
}
/**
 * Removes all columns in which no cell has any text blocks.
 * This is done by transposing the table, filtering the columns like rows, and transposing back.
 *
 * @param rowsOfCells rectangular table as a list of rows
 * @return {@code rowsOfCells} itself if it is empty, otherwise a new table without the empty columns;
 *         a new empty list if every column was empty
 */
private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {

    if (rowsOfCells.isEmpty()) {
        return rowsOfCells;
    }

    List<List<Cell>> nonEmptyCols = removeEmptyRows(transpose(rowsOfCells));
    if (nonEmptyCols.isEmpty()) {
        // Every column was dropped, so there is no first row from which a column count could be derived; return an empty table directly.
        return new ArrayList<>();
    }
    return transpose(nonEmptyCols);
}
/**
 * Keeps only those rows in which at least one cell has text blocks.
 *
 * @param rowsOfCells table as a list of rows
 * @return a new list with the rows that contain at least one cell with text blocks, in their original order
 */
private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {

    List<List<Cell>> keptRows = new ArrayList<>();
    for (List<Cell> row : rowsOfCells) {
        boolean hasText = false;
        for (Cell cell : row) {
            if (!cell.getTextBlocks().isEmpty()) {
                hasText = true;
                break;
            }
        }
        if (hasText) {
            keptRows.add(row);
        }
    }
    return keptRows;
}
/**
 * A {@link Cell} together with the lists of its neighbouring cells to the right, left, above and below.
 * A linked cell with more than one neighbour in any direction can be split into several smaller linked cells
 * that share the header flag and text blocks of the original cell.
 */
class LinkedCell {

    private final Cell originalCell;
    private final List<LinkedCell> rights = new LinkedList<>();
    private final List<LinkedCell> lefts = new LinkedList<>();
    private final List<LinkedCell> aboves = new LinkedList<>();
    private final List<LinkedCell> belows = new LinkedList<>();


    LinkedCell(Cell cell) {

        this.originalCell = cell;
    }


    /**
     * @return {@code true} if there is more than one neighbour in at least one direction
     */
    public boolean needsSplit() {

        int largestHorizontal = Math.max(rights.size(), lefts.size());
        int largestVertical = Math.max(aboves.size(), belows.size());
        return Math.max(largestHorizontal, largestVertical) > 1;
    }


    /**
     * @return {@code true} if the cell has neither left nor above neighbours
     */
    public boolean isTopLeft() {

        if (!lefts.isEmpty()) {
            return false;
        }
        return aboves.isEmpty();
    }


    @Override
    public String toString() {

        return originalCell.toString();
    }


    /**
     * Splits this cell along the edges of its neighbours. Horizontal neighbours are checked first (the side with
     * more neighbours wins, rights on a tie), then vertical neighbours (aboves on a tie).
     *
     * @return the pieces of this cell, or a list containing only this cell if no direction has more than one neighbour
     */
    public Collection<LinkedCell> split() {

        int rightCount = rights.size();
        int leftCount = lefts.size();
        int aboveCount = aboves.size();
        int belowCount = belows.size();

        if (rightCount > 1 && rightCount >= leftCount) {
            return splitY(rights);
        }
        if (leftCount > 1) {
            return splitY(lefts);
        }
        if (aboveCount > 1 && aboveCount >= belowCount) {
            return splitX(aboves);
        }
        if (belowCount > 1) {
            return splitX(belows);
        }
        return List.of(this);
    }


    /**
     * Cuts this cell into pieces stacked along the y axis, one piece per neighbour. The cuts are placed at the
     * neighbours' max y values in ascending order, each limited to this cell's own max y.
     */
    private List<LinkedCell> splitY(List<LinkedCell> neighbours) {

        double[] cutPositions = neighbours.stream()
                .mapToDouble(neighbour -> neighbour.originalCell.getMaxY())
                .sorted()
                .toArray();

        double leftEdge = originalCell.getBBox().getX();
        double rightEdge = originalCell.getBBox().getMaxX();
        double lowerLimit = originalCell.getBBox().getMaxY();

        List<LinkedCell> pieces = new LinkedList<>();
        Point2D pieceTopLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
        for (double cutPosition : cutPositions) {
            double y = Math.min(cutPosition, lowerLimit);
            pieces.add(new LinkedCell(copyCell(pieceTopLeft, new Point2D.Double(rightEdge, y))));
            // the next piece starts where this one ended
            pieceTopLeft = new Point2D.Double(leftEdge, y);
        }
        return pieces;
    }


    /**
     * Cuts this cell into pieces placed side by side along the x axis, one piece per neighbour. The cuts are placed
     * at the neighbours' max x values in ascending order, each limited to this cell's own max x.
     */
    private List<LinkedCell> splitX(List<LinkedCell> neighbours) {

        double[] cutPositions = neighbours.stream()
                .mapToDouble(neighbour -> neighbour.originalCell.getMaxX())
                .sorted()
                .toArray();

        double topEdge = originalCell.getBBox().getY();
        double bottomEdge = originalCell.getBBox().getMaxY();
        double rightLimit = originalCell.getBBox().getMaxX();

        List<LinkedCell> pieces = new LinkedList<>();
        Point2D pieceTopLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
        for (double cutPosition : cutPositions) {
            double x = Math.min(cutPosition, rightLimit);
            pieces.add(new LinkedCell(copyCell(pieceTopLeft, new Point2D.Double(x, bottomEdge))));
            // the next piece starts where this one ended
            pieceTopLeft = new Point2D.Double(x, topEdge);
        }
        return pieces;
    }


    /**
     * Creates a cell for the given corners via {@code Cell.fromPageCoordinates} and copies the header flag and
     * the text blocks of the original cell onto it.
     */
    private Cell copyCell(Point2D topLeft, Point2D bottomRight) {

        Cell piece = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
        piece.setHeaderCell(originalCell.isHeaderCell());
        piece.setTextBlocks(originalCell.getTextBlocks());
        return piece;
    }


    /**
     * Clears the neighbour lists of all four directions.
     */
    public void resetNeighbours() {

        rights.clear();
        lefts.clear();
        aboves.clear();
        belows.clear();
    }

}
}

View File

@ -54,11 +54,8 @@ public class LayoutGridService {
layers.add(layoutGrid);
if (document.layoutDebugLayer().isActive()) {
layers.add(document.layoutDebugLayer());
}
viewerDocumentService.addLayerGroups(originFile, destinationFile, layers, layoutParserVersion, layoutParsingTypeName, outline);
}

View File

@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;

View File

@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Downloads/2021-2048323.pdf";
String filePath = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf";
runForFile(filePath);
}
@ -46,8 +46,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEndWithIdpResult() {
String filePath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/document.pdf";
String idpResultPath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/idpResult.json";
String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/viewerDocument.pdf";
String idpResultPath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/idpResult.json";
runForFile(filePath, idpResultPath);
}

View File

@ -134,6 +134,14 @@ public class PDFTronViewerDocumentService {
}
/**
 * Convenience overload that delegates to the six-argument {@code addLayerGroups}, passing {@code null}
 * for the fourth and fifth argument and a newly created {@link Outline} as the sixth.
 *
 * @param originFile      forwarded unchanged as first argument
 * @param destinationFile forwarded unchanged as second argument
 * @param layerGroups     forwarded unchanged as third argument
 */
@SneakyThrows
@Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations")
public void addLayerGroups(File originFile, File destinationFile, List<LayerGroup> layerGroups) {

    Outline freshOutline = new Outline();
    addLayerGroups(originFile, destinationFile, layerGroups, null, null, freshOutline);
}
private static Set<String> mapMarkedContentNames(List<LayerGroup> layerGroups) {
return layerGroups.stream()

View File

@ -26,11 +26,16 @@ public class ViewerDocVersioningUtility {
pdfDoc.getDocInfo().setAuthor(AUTHOR);
pdfDoc.getDocInfo().setKeywords(CUSTOM_DICT + ":" + currentVersion);
Obj versionInfo = pdfDoc.getSDFDoc().createIndirectDict();
versionInfo.putString("LayoutParserVersion", layoutParserVersion);
versionInfo.putString("LayoutParsingType", layoutParsingType);
pdfDoc.getRoot().put("KneconVersionInfo", versionInfo);
if (layoutParserVersion != null || layoutParsingType != null) {
Obj versionInfo = pdfDoc.getSDFDoc().createIndirectDict();
if (layoutParserVersion != null) {
versionInfo.putString("LayoutParserVersion", layoutParserVersion);
}
if (layoutParsingType != null) {
versionInfo.putString("LayoutParsingType", layoutParsingType);
}
pdfDoc.getRoot().put("KneconVersionInfo", versionInfo);
}
}