diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index 4b50384..999de33 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -23,7 +23,7 @@ dependencies { } implementation("com.iqser.red.commons:storage-commons:2.50.0") - api("com.knecon.fforesight:azure-ocr-service-api:0.23.0") + api("com.knecon.fforesight:azure-ocr-service-api:0.25.0") implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 57440d7..1e83430 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -296,7 +296,7 @@ public class LayoutParsingPipeline { classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber, null); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - List tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType, classificationDocument.getLayoutDebugLayer()); + List tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType); List graphics = graphicExtractorService.extractPathElementGraphics(pageContents.getGraphicBBoxes(), pageNumber, cleanRulings); pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()).addAll(graphics); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/LinkedQuadPointCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/LinkedQuadPointCell.java new file mode 100644 index 0000000..aba632c --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/LinkedQuadPointCell.java @@ -0,0 +1,334 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.table; + +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Getter +@FieldDefaults(level = AccessLevel.PRIVATE) +public class LinkedQuadPointCell { + + public static final int MAX_NEIGHBOUR_DISTANCE = 2; + public static final int MAX_ANGLE_DIFFERENCE = 5; + public static final double LINE_INTERSECT_THRESHOLD = 2; + + final QuadPoint quadPoint; + final List pageBlocks; + final List rights = new ArrayList<>(); + final List lefts = new ArrayList<>(); + final List aboves = new ArrayList<>(); + final List belows = new ArrayList<>(); + + @Setter + boolean headerCell; + + + public LinkedQuadPointCell(QuadPoint quadPoint, List pageBlocks) { + + this.quadPoint = quadPoint; + this.pageBlocks = pageBlocks; + headerCell = false; + } + + + public boolean contains(Word word) { + + return quadPoint.contains(word.getBBox().getCenterX(), word.getBBox().getCenterY()); + } + + + public String toString() { + + return getPageBlocks().stream() + .map(AbstractPageBlock::toString) + .collect(Collectors.joining("\n")); + } + + + public void addToNeighbours(LinkedQuadPointCell otherCell, double minWidth, double minHeight) { + + if (rightNeighbour(otherCell, minHeight)) { + rights.add(otherCell); + } + if (leftNeighbour(otherCell, minHeight)) { + lefts.add(otherCell); + } + if (aboveNeighbour(otherCell, minWidth)) { + aboves.add(otherCell); + } + if (belowNeighbour(otherCell, minWidth)) { + belows.add(otherCell); + } + } + + + public boolean leftNeighbour(LinkedQuadPointCell other, double minHeight) { + + Line2D right = this.quadPoint.getLeftLine(); + Line2D left = other.quadPoint.getRightLine(); + return isYIntersectionSignificant(right, left, minHeight) && areLinesSimilar(right, left); + } + + + public boolean rightNeighbour(LinkedQuadPointCell other, double minHeight) { + + Line2D right = other.quadPoint.getLeftLine(); + Line2D left = this.quadPoint.getRightLine(); + return isYIntersectionSignificant(right, left, minHeight) && areLinesSimilar(right, left); + } + + + public boolean aboveNeighbour(LinkedQuadPointCell other, double minWidth) { + + Line2D top = other.quadPoint.getTopLine(); + Line2D bottom = this.quadPoint.getBottomLine(); + return isXIntersectionSignificant(top, bottom, minWidth) && areLinesSimilar(top, bottom); + } + + + public boolean belowNeighbour(LinkedQuadPointCell other, double minWidth) { + + Line2D top = this.quadPoint.getTopLine(); + Line2D bottom = other.quadPoint.getBottomLine(); + return isXIntersectionSignificant(top, bottom, minWidth) && areLinesSimilar(top, bottom); + } + + + public static boolean areLinesSimilar(Line2D line1, Line2D line2) { + + double angle1 = Math.atan2(line1.getY2() - line1.getY1(), line1.getX2() - line1.getX1()); + double angle2 = Math.atan2(line2.getY2() - line2.getY1(), line2.getX2() - line2.getX1()); + + double angleDifference = Math.toDegrees(Math.abs(angle1 - angle2)); + + angleDifference = Math.min(angleDifference, 360 - angleDifference); + + if (angleDifference >= MAX_ANGLE_DIFFERENCE) { + return false; + } + + double distance1 = line1.ptSegDist(line2.getP1()); + double distance2 = line1.ptSegDist(line2.getP2()); + double distance3 = line2.ptSegDist(line1.getP1()); + double distance4 = line2.ptSegDist(line1.getP2()); + + double minDistance = Math.min(Math.min(distance1, distance2), Math.min(distance3, distance4)); + + return minDistance < MAX_NEIGHBOUR_DISTANCE; + } + + + public static boolean isXIntersectionSignificant(Line2D line1, Line2D line2, double minWidth) { + + double start1 = Math.min(line1.getX1(), line1.getX2()); + double end1 = Math.max(line1.getX1(), line1.getX2()); + double start2 = Math.min(line2.getX1(), line2.getX2()); + double end2 = Math.max(line2.getX1(), line2.getX2()); + double intersectionStart = Math.max(start1, start2); + double intersectionEnd = Math.min(end1, end2); + return intersectionEnd - intersectionStart >= minWidth; + } + + + public static boolean isYIntersectionSignificant(Line2D line1, Line2D line2, double minHeight) { + + double start1 = Math.min(line1.getY1(), line1.getY2()); + double end1 = Math.max(line1.getY1(), line1.getY2()); + double start2 = Math.min(line2.getY1(), line2.getY2()); + double end2 = Math.max(line2.getY1(), line2.getY2()); + double intersectionStart = Math.max(start1, start2); + double intersectionEnd = Math.min(end1, end2); + return intersectionEnd - intersectionStart >= minHeight; + } + + + public void resetNeighbours() { + + rights.clear(); + lefts.clear(); + aboves.clear(); + belows.clear(); + } + + + public boolean needsSplit() { + + return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1; + } + + + private LinkedQuadPointCell copyCell(Point2D a, Point2D b, Point2D c, Point2D d) { + + var cell = new LinkedQuadPointCell(new QuadPoint(a, b, c, d), pageBlocks); + cell.setHeaderCell(headerCell); + return cell; + } + + + public boolean isTopLeft() { + + return lefts.isEmpty() && aboves.isEmpty(); + } + + + public Collection split(double minWidth, double minHeight) { + + List newCells; + if (rights.size() > 1 && rights.size() >= lefts.size()) { + newCells = splitY(rights, minHeight); + return newCells; + } + if (lefts.size() > 1) { + newCells = splitY(lefts, minHeight); + return newCells; + } + if (aboves.size() > 1 && aboves.size() >= belows.size()) { + newCells = splitX(aboves, minWidth); + return newCells; + } + if (belows.size() > 1) { + newCells = splitX(belows, minWidth); + return newCells; + } + return List.of(this); + } + + + private List splitY(List neighbours, double minHeight) { + + List splitCells = new LinkedList<>(); + List ySplitLines = neighbours.stream() + .map(LinkedQuadPointCell::getQuadPoint) + .map(QuadPoint::getTopLine) + .sorted(Comparator.comparing(line -> (line.getY1() + line.getY2()) / 2)) + .toList(); + Line2D rightLine = quadPoint.getRightLine(); + Line2D leftLine = quadPoint.getLeftLine(); + Line2D topLine = quadPoint.getTopLine(); + Point2D lowerLeft = quadPoint.getLowerLeft(); + Point2D lowerRight = quadPoint.getLowerRight(); + Point2D topLeft; + Point2D topRight; + for (Line2D neighborLine : ySplitLines) { + if (Math.abs(neighborLine.getY1() - topLine.getY1()) < minHeight || Math.abs(neighborLine.getY2() - topLine.getY2()) < minHeight) { + continue; + } + var topLeftOptional = findIntersectionPoint(leftLine, neighborLine); + var lowerRightOptional = findIntersectionPoint(rightLine, neighborLine); + if (topLeftOptional.isEmpty() || lowerRightOptional.isEmpty()) { + continue; + } + topLeft = topLeftOptional.get(); + topRight = lowerRightOptional.get(); + if (Math.abs(topLeft.getY() - lowerLeft.getY()) < minHeight || Math.abs(topRight.getY() - lowerRight.getY()) < minHeight) { + continue; + } + LinkedQuadPointCell cell = copyCell(topLeft, lowerLeft, lowerRight, topRight); + splitCells.add(cell); + lowerLeft = topLeft; + lowerRight = topRight; + } + LinkedQuadPointCell cell = copyCell(topLine.getP1(), lowerLeft, lowerRight, topLine.getP2()); + splitCells.add(cell); + return splitCells; + } + + + /* + Finds the intersection point of the line and the extended line. Where the intersectionPoint must lie within the range of the line, but the extendedLine may be extended as far as needed. + */ + private Optional findIntersectionPoint(Line2D line, Line2D lineToExtend) { + + double x1 = line.getX1(); + double y1 = line.getY1(); + double x2 = line.getX2(); + double y2 = line.getY2(); + + double x3 = lineToExtend.getX1(); + double y3 = lineToExtend.getY1(); + double x4 = lineToExtend.getX2(); + double y4 = lineToExtend.getY2(); + + double denom = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4); + + // If denominator is 0, lines are parallel or coincident + if (denom == 0) { + return Optional.empty(); + } + + double intersectX = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denom; + double intersectY = ((x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)) / denom; + + Point2D intersection = new Point2D.Double(intersectX, intersectY); + + // Check if the intersection point lies within the bounds of the line segment + if (intersection.getX() >= Math.min(x1 - LINE_INTERSECT_THRESHOLD, x2 - LINE_INTERSECT_THRESHOLD) && intersection.getX() <= Math.max(x1 + LINE_INTERSECT_THRESHOLD, + x2 + LINE_INTERSECT_THRESHOLD)// + && intersection.getY() >= Math.min(y1 - LINE_INTERSECT_THRESHOLD, y2 - LINE_INTERSECT_THRESHOLD) && intersection.getY() <= Math.max(y1 + LINE_INTERSECT_THRESHOLD, + y2 + LINE_INTERSECT_THRESHOLD)) { + return Optional.of(intersection); + } + + return Optional.empty(); + } + + + private List splitX(List neighbours, double minWidth) { + + List xSplitLines = neighbours.stream() + .map(LinkedQuadPointCell::getQuadPoint) + .map(QuadPoint::getRightLine) + .sorted(Comparator.comparing(line -> (line.getX1() + line.getX2()) / 2)) + .toList(); + if (xSplitLines.isEmpty()) { + return List.of(this); + } + List splitCells = new LinkedList<>(); + Line2D topLine = quadPoint.getTopLine(); + Line2D bottomLine = quadPoint.getBottomLine(); + Line2D rightLine = quadPoint.getRightLine(); + Point2D topLeft = quadPoint.getTopLeft(); + Point2D lowerLeft = quadPoint.getLowerLeft(); + Point2D topRight; + Point2D lowerRight; + for (Line2D neighborLine : xSplitLines) { + if (Math.abs(rightLine.getX1() - neighborLine.getX1()) < minWidth || Math.abs(rightLine.getX2() - neighborLine.getX2()) < minWidth) { + continue; + } + var topRightOptional = findIntersectionPoint(topLine, neighborLine); + var lowerRightOptional = findIntersectionPoint(bottomLine, neighborLine); + if (topRightOptional.isEmpty() || lowerRightOptional.isEmpty()) { + continue; + } + topRight = topRightOptional.get(); + lowerRight = lowerRightOptional.get(); + if (Math.abs(topRight.getX() - topLeft.getX()) < minWidth || Math.abs(lowerRight.getX() - lowerLeft.getX()) < minWidth) { + continue; + } + LinkedQuadPointCell cell = copyCell(lowerLeft, topLeft, topRight, lowerRight); + topLeft = topRight; + lowerLeft = lowerRight; + splitCells.add(cell); + } + LinkedQuadPointCell cell = copyCell(lowerLeft, topLeft, rightLine.getP1(), rightLine.getP2()); + splitCells.add(cell); + return splitCells; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/QuadPoint.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/QuadPoint.java new file mode 100644 index 0000000..6191a85 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/QuadPoint.java @@ -0,0 +1,298 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.table; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.ocr.v1.api.model.QuadPointData; + +public final class QuadPoint { + + /* + B _____ C + | | + A|_____|D + */ + + private final Point2D a; + private final Point2D b; + private final Point2D c; + private final Point2D d; + + private Line2D left; + private Line2D right; + private Line2D top; + private Line2D bottom; + + + // This constructor assumes, the points form a convex polygon, I will omit the assertion for performance reasons. + public QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) { + + List points = new ArrayList<>(4); + points.add(a); + points.add(b); + points.add(c); + points.add(d); + points.sort(Comparator.comparingDouble(Point2D::getX).thenComparing(Point2D::getY)); + if (points.get(0).getY() >= points.get(1).getY()) { + this.a = points.get(0); + this.b = points.get(1); + } else { + this.a = points.get(1); + this.b = points.get(0); + } + + if (points.get(2).getY() < points.get(3).getY()) { + this.c = points.get(2); + this.d = points.get(3); + } else { + this.c = points.get(3); + this.d = points.get(2); + } + } + + + public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) { + + var lowerLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()); + var upperLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()); + var upperRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()); + var lowerRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()); + + return new QuadPoint(lowerLeft, upperLeft, upperRight, lowerRight); + } + + + public Rectangle2D getBounds2D() { + + double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX()); + double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY()); + double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX()); + double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY()); + + return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY); + } + + + public static QuadPoint fromData(QuadPointData data) { + + return new QuadPoint(new Point2D.Double(data.values()[0], data.values()[1]), + new Point2D.Double(data.values()[2], data.values()[3]), + new Point2D.Double(data.values()[4], data.values()[5]), + new Point2D.Double(data.values()[6], data.values()[7])); + + } + + + public Stream asLines() { + + return Stream.of(new Line2D.Double(a(), b()), new Line2D.Double(b(), c()), new Line2D.Double(c(), d()), new Line2D.Double(d(), a())); + + } + + + public QuadPoint getTransformed(AffineTransform at) { + + return new QuadPoint(at.transform(a, null), at.transform(b, null), at.transform(c, null), at.transform(d, null)); + } + + + public boolean contains(double x, double y) { + // split into two triangles, test if either contains the point, assumes the QuadPoint is convex and created correctly. More specifically, the points must be in the correct order. + return triangleContains(a, b, c, x, y) || triangleContains(a, c, d, x, y); + } + + + /* + checks if a triangle contains a point by converting the point to barycentric coordinates using cramer's rule and then checking if the linear combination is within the bounds of the triangle. + https://en.wikipedia.org/wiki/Barycentric_coordinate_system#Barycentric_coordinates_on_triangles + */ + private boolean triangleContains(Point2D a, Point2D b, Point2D c, double x, double y) { + + // area of the triangle + double denominator = ((b.getY() - c.getY()) * (a.getX() - c.getX()) + (c.getX() - b.getX()) * (a.getY() - c.getY())); + double invertedDenominator = 1.0 / denominator; + double alpha = ((b.getY() - c.getY()) * (x - c.getX()) + (c.getX() - b.getX()) * (y - c.getY())) * invertedDenominator; + double beta = ((c.getY() - a.getY()) * (x - c.getX()) + (a.getX() - c.getX()) * (y - c.getY())) * invertedDenominator; + + return alpha >= 0 && beta >= 0 && alpha + beta <= 1; + } + + + public boolean contains(Point2D p) { + + return contains(p.getX(), p.getY()); + } + + + public boolean contains(Rectangle2D r) { + + double x = r.getX(); + double y = r.getY(); + double maxY = r.getMaxY(); + double maxX = r.getMaxX(); + + Point2D p1 = new Point2D.Double(x, y); + Point2D p2 = new Point2D.Double(x, maxY); + Point2D p3 = new Point2D.Double(maxX, maxY); + Point2D p4 = new Point2D.Double(maxX, y); + + return contains(p1) && contains(p2) && contains(p3) && contains(p4); + } + + + public double getCenterX() { + + return (a.getX() + b.getX() + c.getX() + d.getX()) / 4; + } + + + public double getCenterY() { + + return (a.getY() + b.getY() + c.getY() + d.getY()) / 4; + } + + + public Point2D getCenter() { + + return new Point2D.Double(getCenterX(), getCenterY()); + } + + + public boolean intersects(Line2D line) { + + return contains(line.getP1()) || contains(line.getP2()) || asLines().anyMatch(qLine -> qLine.intersectsLine(line)); + } + + + public Line2D getRightLine() { + + if (right == null) { + right = new Line2D.Double(getLowerRight(), getTopRight()); + } + + return right; + } + + + public Line2D getLeftLine() { + + if (left == null) { + left = new Line2D.Double(getLowerLeft(), getTopLeft()); + } + return left; + } + + + public Line2D getBottomLine() { + + if (bottom == null) { + bottom = new Line2D.Double(getLowerLeft(), getLowerRight()); + } + return bottom; + } + + + public Line2D getTopLine() { + + if (top == null) { + top = new Line2D.Double(getTopLeft(), getTopRight()); + } + return top; + } + + + public Point2D getTopLeft() { + + return a; + + } + + + public Point2D getTopRight() { + + return d; + } + + + public Point2D getLowerRight() { + + return c; + } + + + public Point2D getLowerLeft() { + + return b; + } + + + @Override + public String toString() { + + return String.format("A:(%.2f, %.2f) | B:(%.2f, %.2f) | C:(%.2f, %.2f) | D:(%.2f, %.2f)", + a().getX(), + a().getY(), + b().getX(), + b().getY(), + c().getX(), + c().getY(), + d().getX(), + d().getY()); + } + + + public double getAngle() { + + return calculateAngle(a, d); + } + + + private static double calculateAngle(Point2D a, Point2D d) { + + double deltaY = d.getY() - a.getY(); + double deltaX = d.getX() - a.getX(); + return Math.atan2(deltaY, deltaX); + } + + + public Point2D a() {return a;} + + + public Point2D b() {return b;} + + + public Point2D c() {return c;} + + + public Point2D d() {return d;} + + + @Override + public boolean equals(Object obj) { + + if (obj == this) { + return true; + } + if (obj == null || obj.getClass() != this.getClass()) { + return false; + } + var that = (QuadPoint) obj; + return Objects.equals(this.a, that.a) && Objects.equals(this.b, that.b) && Objects.equals(this.c, that.c) && Objects.equals(this.d, that.d); + } + + + @Override + public int hashCode() { + + return Objects.hash(a, b, c, d); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java index 3b18dd4..79dbacd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java @@ -8,6 +8,8 @@ import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; import org.apache.pdfbox.Loader; @@ -29,7 +31,9 @@ import lombok.AccessLevel; import lombok.Getter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +@Slf4j @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class PageContentExtractor { @@ -76,10 +80,26 @@ public class PageContentExtractor { @SneakyThrows public void startAsync() { + List extractionThreads = new ArrayList<>(pageNumberBatches.size()); for (List pageNumberBatch : pageNumberBatches) { Thread thread = new Thread(() -> extractPages(pageNumberBatch)); thread.start(); + extractionThreads.add(thread); } + Thread finisher = new Thread(() -> { + awaitFinished(extractionThreads); + }); + finisher.start(); + } + + + @SneakyThrows + private static void awaitFinished(List extractionThreads) { + + for (Thread extractionThread : extractionThreads) { + extractionThread.join(); + } + log.info("Page content extraction threads finished!"); } @@ -150,10 +170,12 @@ public class PageContentExtractor { } - public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException { + public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException, TimeoutException { - finishedLookup[pageNumber - 1].await(); - return pageContents[pageNumber - 1]; + if (finishedLookup[pageNumber - 1].await(1, TimeUnit.MINUTES)) { + return pageContents[pageNumber - 1]; + } + throw new TimeoutException("A timeout has occurred during page content extraction!"); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java index 5029e6d..28a65ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java @@ -29,7 +29,7 @@ public class AreaSweepGridifier { * * @return TablePageBlock Structure as a rows of cells matrix */ - public List> gridify(Set cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) { + public List> gridify(Collection cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) { if (cells.isEmpty()) { return new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/QuadPointGridifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/QuadPointGridifier.java new file mode 100644 index 0000000..ca8711d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/QuadPointGridifier.java @@ -0,0 +1,257 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Line2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQuadPointCell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class QuadPointGridifier { + + public static final int MAX_SPLITTING_ITERATIONS = 10; + Set cells; + AffineTransform pageToPdfTransform; + AffineTransform pdfToPageTransform; + double minCellHeight; + double minCellWidth; + + + @SneakyThrows + QuadPointGridifier(Collection cells, AffineTransform pdfToPageTransform) { + + this.cells = new HashSet<>(cells); + this.pageToPdfTransform = pdfToPageTransform.createInverse(); + this.pdfToPageTransform = pdfToPageTransform; + this.minCellHeight = cells.stream() + .map(LinkedQuadPointCell::getQuadPoint) + .flatMap(this::verticalLines) + .mapToDouble(QuadPointGridifier::length) + .min().orElse(0) * 0.75; + this.minCellWidth = cells.stream() + .map(LinkedQuadPointCell::getQuadPoint) + .flatMap(this::horizontalLines) + .mapToDouble(QuadPointGridifier::length) + .min().orElse(0) * 0.75; + } + + + public Stream horizontalLines(QuadPoint quadPoint) { + + return Stream.of(quadPoint.getBottomLine(), quadPoint.getTopLine()); + } + + + public Stream verticalLines(QuadPoint quadPoint) { + + return Stream.of(quadPoint.getRightLine(), quadPoint.getLeftLine()); + } + + + public static double length(Line2D line) { + + double xAbs = Math.abs(line.getX1() - line.getX2()); + double yAbs = Math.abs(line.getY1() - line.getY2()); + return Math.sqrt(xAbs * xAbs + yAbs * yAbs); + } + + + /** + * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. + * Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors. + * This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell. + * + * @return TablePageBlock Structure as a rows of cells matrix + */ + public List> gridify() { + + var linkedCells = cells.stream() + .toList(); + + computeNeighbours(linkedCells); + int numberOfSplits = 0; + while (linkedCells.stream() + .anyMatch(LinkedQuadPointCell::needsSplit) && numberOfSplits < MAX_SPLITTING_ITERATIONS) { + + List newCells = new LinkedList<>(); + for (LinkedQuadPointCell linkedCell : linkedCells) { + if (linkedCell.needsSplit()) { + newCells.addAll(linkedCell.split(minCellWidth, minCellHeight)); + } else { + newCells.add(linkedCell); + } + } + computeNeighbours(newCells); + linkedCells = newCells; + numberOfSplits++; + } + + return buildStructure(linkedCells); + } + + + private List> buildStructure(List cells) { + + if (cells.isEmpty()) { + return Collections.emptyList(); + } + List> rows = buildRows(cells); + List> cellRows = mapToCells(rows); + if (isNotRectangular(rows)) { + log.error("Non rectangular table on page {}", + cells.stream() + .map(LinkedQuadPointCell::getPageBlocks) + .flatMap(List::stream) + .map(AbstractPageBlock::getWords) + .flatMap(Collection::stream) + .map(Word::getPage) + .findAny().orElse(0)); + // sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows. + // Then we use the area sweep algorithm as a fallback. + return AreaSweepGridifier.gridify(this.cells.stream() + .map(this::toCell) + .toList(), pageToPdfTransform, minCellWidth, minCellHeight); + } + cellRows = removeEmptyRows(cellRows); + cellRows = removeEmptyCols(cellRows); + return cellRows; + } + + + private List> mapToCells(List> rows) { + + return rows.stream() + .map(row -> row.stream() + .map(this::toCell) + .toList()) + .toList(); + } + + + private Cell toCell(LinkedQuadPointCell qpCell) { + + Cell cell = Cell.fromPageCoordinates(qpCell.getQuadPoint().getBounds2D(), pageToPdfTransform); + cell.setTextBlocks(qpCell.getPageBlocks()); + cell.setHeaderCell(qpCell.isHeaderCell()); + return cell; + } + + + private boolean isNotRectangular(List> rows) { + + if (rows.isEmpty()) { + return true; + } + int n = rows.get(0).size(); + return rows.stream() + .anyMatch(row -> row.size() != n); + } + + + private List> buildRows(List cells) { + + List topLeftCandidates = cells.stream() + .filter(LinkedQuadPointCell::isTopLeft) + .toList(); + + if (topLeftCandidates.size() != 1) { + log.error("More than one top-left cell found!"); + } + var cell = topLeftCandidates.get(0); + + List> rows = new ArrayList<>(); + rows.add(buildRow(cell)); + while (!cell.getBelows().isEmpty()) { + cell = cell.getBelows().get(0); + rows.add(buildRow(cell)); + } + return rows; + } + + + private static List buildRow(LinkedQuadPointCell cell) { + + List currentRow = new ArrayList<>(); + LinkedQuadPointCell nextCell = cell; + currentRow.add(cell); + while (!nextCell.getRights().isEmpty()) { + nextCell = nextCell.getRights().get(0); + currentRow.add(nextCell); + } + return currentRow; + } + + + private void computeNeighbours(List cells) { + + for (LinkedQuadPointCell cell : cells) { + cell.resetNeighbours(); + computeNeighbours(cell, cells); + } + + } + + + private void computeNeighbours(LinkedQuadPointCell cell, List otherCells) { + + for (LinkedQuadPointCell otherCell : otherCells) { + if (cell.equals(otherCell)) { + continue; + } + cell.addToNeighbours(otherCell, minCellWidth, minCellHeight); + } + + } + + + static List> transpose(List> table) { + + List> ret = new ArrayList>(); + final int N = table.get(0).size(); + for (int i = 0; i < N; i++) { + List col = new ArrayList(); + for (List row : table) { + col.add(row.get(i)); + } + ret.add(col); + } + return ret; + } + + + private List> removeEmptyCols(List> rowsOfCells) { + + if (rowsOfCells.isEmpty()) { + return rowsOfCells; + } + + var colsOfCells = transpose(rowsOfCells); + colsOfCells = removeEmptyRows(colsOfCells); + return transpose(colsOfCells); + } + + + private List> removeEmptyRows(List> rowsOfCells) { + + return rowsOfCells.stream() + .filter(row -> row.stream() + .anyMatch(cell -> !cell.getTextBlocks().isEmpty())) + .collect(Collectors.toList()); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java index 4e94e4a..33fcb0f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables; import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR; import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR; -import java.awt.Color; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; @@ -28,6 +27,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQuadPointCell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -37,7 +38,6 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTran import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators; import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import com.knecon.fforesight.service.ocr.v1.api.model.Table; import com.knecon.fforesight.service.ocr.v1.api.model.TableCell; import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType; @@ -66,13 +66,12 @@ public class TableExtractionService { List words, PageInformation pageInformation, List idpTables, - LayoutParsingType layoutParsingType, - LayoutDebugLayer layoutDebugLayer) { + LayoutParsingType layoutParsingType) { AffineTransform pdfToPageTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation); List tablePageBlocks; if (idpTables == null || idpTables.isEmpty()) { - tablePageBlocks = extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType, layoutDebugLayer, pageInformation); + tablePageBlocks = extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType); } else { tablePageBlocks = buildTableFromIdpResult(idpTables, words, pdfToPageTransform, layoutParsingType); } @@ -83,9 +82,7 @@ public class TableExtractionService { private List extractTables(List emptyCells, List words, AffineTransform pdfToPageTransform, - LayoutParsingType layoutParsingType, - LayoutDebugLayer layoutDebugLayer, - PageInformation pageInformation) { + LayoutParsingType layoutParsingType) { // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them emptyCells.sort(CELL_SIZE_COMPARATOR); @@ -111,15 +108,6 @@ public class TableExtractionService { if (containedCells.isEmpty()) { continue; } - // if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf), - // the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column. - // That's why we compute the missing Cells from the spreadsheet area and fill them in. - Set missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform); - - layoutDebugLayer.addCellVisualizations(missingCells, pageInformation.number(), Color.RED); - layoutDebugLayer.addCellVisualizations(List.of(new Cell(area, pdfToPageTransform)), pageInformation.number(), Color.BLUE); - - containedCells.addAll(missingCells); Set wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words for (Cell cell : containedCells) { @@ -151,14 +139,10 @@ public class TableExtractionService { } - private static void removeWordsFromCells(List words, TablePageBlock tablePageBlock) { - - Set wordsFromCells = new HashSet<>(tablePageBlock.getWords()); - words.removeAll(wordsFromCells); - } - - - private List buildTableFromIdpResult(List
idpTables, List words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) { + private List buildTableFromIdpResult(List
idpTables, + List words, + AffineTransform pdfToPageTransform, + LayoutParsingType layoutParsingType) { if (idpTables == null || idpTables.isEmpty()) { return Collections.emptyList(); @@ -171,30 +155,38 @@ public class TableExtractionService { continue; } - List cells = new ArrayList<>(idpTable.cells().size()); + List qpCells = new ArrayList<>(idpTable.cells().size()); Set wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words for (TableCell idpCell : idpTable.cells()) { - Cell cell = new Cell(idpCell, pdfToPageTransform); - if (idpCell.kind().equals(TableCellType.ROW_HEADER) || idpCell.kind().equals(TableCellType.COLUMN_HEADER)) { - cell.setHeaderCell(true); - } - cells.add(cell); - Function contains = p -> idpCell.textRegion().region().bbox().get().contains(p); - Function containsRect = r -> idpCell.textRegion().region().bbox().get().contains(r); - BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect); - cell.setTextBlocks(blocksWithTheirWords.blocks); + BlocksWithTheirWords blocksWithTheirWords = sortWordsIntoQuadPoint(words, layoutParsingType, idpCell, tables); wordsInTable.addAll(blocksWithTheirWords.words()); + + LinkedQuadPointCell qpCell = new LinkedQuadPointCell(QuadPoint.fromData(idpCell.textRegion().region().bbox()).getTransformed(pdfToPageTransform), + blocksWithTheirWords.blocks); + if (idpCell.kind().equals(TableCellType.ROW_HEADER) || idpCell.kind().equals(TableCellType.COLUMN_HEADER)) { + qpCell.setHeaderCell(true); + } + + qpCells.add(qpCell); } - TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform); - List> gridCells = calculator.gridify(); - TablePageBlock tablePageBlock = new TablePageBlock(null, gridCells); + QuadPointGridifier calculator = new QuadPointGridifier(qpCells, pdfToPageTransform); + List> rows = calculator.gridify(); + TablePageBlock tablePageBlock = new TablePageBlock(null, rows); addTableIfValid(words, tablePageBlock, tables, wordsInTable); } return tables; } + private BlocksWithTheirWords sortWordsIntoQuadPoint(List words, LayoutParsingType layoutParsingType, TableCell idpCell, List tables) { + + Function contains = p -> idpCell.textRegion().region().bbox().get().contains(p); + Function containsRect = r -> idpCell.textRegion().region().bbox().get().contains(r); + return sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect); + } + + private static void addTableIfValid(List words, TablePageBlock tablePageBlock, List tables, Set wordsInTable) { if (tablePageBlock.getRowCount() > MAX_ROWS_OR_COLS || tablePageBlock.getColCount() == 0 || tablePageBlock.getColCount() > MAX_ROWS_OR_COLS) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java index fa2ad62..bf8d4ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java @@ -7,6 +7,7 @@ import java.util.LinkedList; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; @@ -14,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import lombok.Getter; import lombok.Setter; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j @@ -26,6 +28,8 @@ public class TableFromCellsExtractor { @Setter private final List originCells; private final AffineTransform pdfToPageTransform; + private final double minCellWidth; + private final double minCellHeight; public TableFromCellsExtractor(List originCells, AffineTransform pdfToPageTransform) { @@ -33,15 +37,18 @@ public class TableFromCellsExtractor { classification = PageBlockType.TABLE; this.originCells = originCells; this.pdfToPageTransform = pdfToPageTransform; + this.minCellHeight = originCells.stream() + .mapToDouble(BoundingBox::getHeight).average().orElse(0); + this.minCellWidth = originCells.stream() + .mapToDouble(BoundingBox::getWidth).average().orElse(0); } + @SneakyThrows public TablePageBlock extract() { - computeRows(originCells); - + rows = AreaSweepGridifier.gridify(originCells, pdfToPageTransform.createInverse(), minCellWidth, minCellHeight); computeHeaders(); - return new TablePageBlock(null, rows); } @@ -119,15 +126,4 @@ public class TableFromCellsExtractor { } - - private void computeRows(List cells) { - - if (cells.isEmpty()) { - return; - } - - TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform); - rows = calculator.gridify(); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java deleted file mode 100644 index b1c21d2..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java +++ /dev/null @@ -1,353 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.tables; - -import java.awt.geom.AffineTransform; -import java.awt.geom.Point2D; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -public class TableGridStructureCalculator { - - // multiplied with minimum cell height/width, Cells may be at most this apart in one dimension, and must overlap at least that much in the other dimension to be considered neighbours - private static final double DISTANCE_FACTOR = 0.5; - Set cells; - AffineTransform pageToPdfTransform; - double minCellHeight; - double minCellWidth; - - - @SneakyThrows - TableGridStructureCalculator(Collection cells, AffineTransform pdfToPageTransform) { - - this.cells = new HashSet<>(cells); - this.pageToPdfTransform = pdfToPageTransform.createInverse(); - this.minCellHeight = cells.stream() - .mapToDouble(cell -> cell.getBBox().getHeight()) - .min().orElse(0); - this.minCellWidth = cells.stream() - .mapToDouble(cell -> cell.getBBox().getWidth()) - .min().orElse(0); - } - - - /** - * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. - * Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors. - * This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell. - * - * @return TablePageBlock Structure as a rows of cells matrix - */ - public List> gridify() { - - if (cellsHaveLargeOverlaps()) { - // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation. - List> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight); - rows = removeEmptyRows(rows); - rows = removeEmptyCols(rows); - return rows; - } - - var linkedCells = cells.stream() - .map(LinkedCell::new) - .collect(Collectors.toList()); - - computeNeighbours(linkedCells); - - while (linkedCells.stream() - .anyMatch(LinkedCell::needsSplit)) { - - List newCells = new LinkedList<>(); - for (LinkedCell linkedCell : linkedCells) { - if (linkedCell.needsSplit()) { - newCells.addAll(linkedCell.split()); - } else { - newCells.add(linkedCell); - } - } - computeNeighbours(newCells); - linkedCells = newCells; - } - return buildStructure(linkedCells); - } - - - private boolean cellsHaveLargeOverlaps() { - - for (Cell cell1 : cells) { - for (Cell cell2 : cells) { - if (cell1.equals(cell2)) { - continue; - } - if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR // - && cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) { - return true; - } - } - } - return false; - } - - - private List> buildStructure(List cells) { - - if (cells.isEmpty()) { - return Collections.emptyList(); - } - List> rows = buildRows(cells); - if (isNotRectangular(rows)) { - throw new AssertionError(); - } - rows = removeEmptyRows(rows); - rows = removeEmptyCols(rows); - return rows; - } - - - private boolean isNotRectangular(List> rows) { - - if (rows.isEmpty()) { - return true; - } - int n = rows.get(0).size(); - return rows.stream() - .anyMatch(row -> row.size() != n); - } - - - private List> buildRows(List cells) { - - List topLeftCandidates = cells.stream() - .filter(LinkedCell::isTopLeft) - .toList(); - - assert topLeftCandidates.size() == 1; - var cell = topLeftCandidates.get(0); - - List> rows = new ArrayList<>(); - rows.add(buildRow(cell)); - while (!cell.belows.isEmpty()) { - cell = cell.belows.get(0); - rows.add(buildRow(cell)); - } - if (isNotRectangular(rows)) { - throw new AssertionError(); - } - return rows; - } - - - private static List buildRow(LinkedCell cell) { - - List currentRow = new ArrayList<>(); - LinkedCell nextCell = cell; - currentRow.add(cell.originalCell); - while (!nextCell.rights.isEmpty()) { - nextCell = nextCell.rights.get(0); - currentRow.add(nextCell.originalCell); - } - return currentRow; - } - - - private void computeNeighbours(List cells) { - - for (LinkedCell cell : cells) { - cell.resetNeighbours(); - computeNeighbours(cell, cells); - } - - } - - - private void computeNeighbours(LinkedCell cell, List otherCells) { - - for (LinkedCell otherCell : otherCells) { - if (cell.equals(otherCell)) { - continue; - } - if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR - && cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) { - if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) { - cell.rights.add(otherCell); - } else { - cell.lefts.add(otherCell); - } - } else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR - && cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) { - if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) { - cell.belows.add(otherCell); - } else { - cell.aboves.add(otherCell); - } - } - } - - } - - - static List> transpose(List> table) { - - List> ret = new ArrayList>(); - final int N = table.get(0).size(); - for (int i = 0; i < N; i++) { - List col = new ArrayList(); - for (List row : table) { - col.add(row.get(i)); - } - ret.add(col); - } - return ret; - } - - - private List> removeEmptyCols(List> rowsOfCells) { - - if (rowsOfCells.isEmpty()) { - return rowsOfCells; - } - - var colsOfCells = transpose(rowsOfCells); - colsOfCells = removeEmptyRows(colsOfCells); - return transpose(colsOfCells); - } - - - private List> removeEmptyRows(List> rowsOfCells) { - - return rowsOfCells.stream() - .filter(row -> row.stream() - .anyMatch(cell -> !cell.getTextBlocks().isEmpty())) - .collect(Collectors.toList()); - } - - - class LinkedCell { - - private final Cell originalCell; - private final List rights; - private final List lefts; - private final List aboves; - private final List belows; - - - LinkedCell(Cell cell) { - - this.originalCell = cell; - this.rights = new LinkedList<>(); - this.lefts = new LinkedList<>(); - this.aboves = new LinkedList<>(); - this.belows = new LinkedList<>(); - } - - - public boolean needsSplit() { - - return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1; - } - - - public boolean isTopLeft() { - - return lefts.isEmpty() && aboves.isEmpty(); - } - - - public String toString() { - - return originalCell.toString(); - } - - - public Collection split() { - - if (rights.size() > 1 && rights.size() >= lefts.size()) { - return splitY(rights); - } - if (lefts.size() > 1) { - return splitY(lefts); - } - if (aboves.size() > 1 && aboves.size() >= belows.size()) { - return splitX(aboves); - } - if (belows.size() > 1) { - return splitX(belows); - } - return List.of(this); - } - - - private List splitY(List neighbours) { - - List splitCells = new LinkedList<>(); - List ySplit = neighbours.stream() - .map(right -> right.originalCell.getMaxY()) - .sorted() - .toList(); - Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); - double maxX = originalCell.getBBox().getMaxX(); - double x = originalCell.getBBox().getX(); - double maxY = originalCell.getBBox().getMaxY(); - for (Double neighborY : ySplit) { - double y = Math.min(neighborY, maxY); - Point2D bottomRight = new Point2D.Double(maxX, y); - Cell cell = copyCell(topLeft, bottomRight); - splitCells.add(new LinkedCell(cell)); - topLeft = new Point2D.Double(x, y); - } - return splitCells; - } - - - private List splitX(List neighbours) { - - List splitCells = new LinkedList<>(); - List xSplit = neighbours.stream() - .map(right -> right.originalCell.getMaxX()) - .sorted() - .toList(); - Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); - double maxY = originalCell.getBBox().getMaxY(); - double y = originalCell.getBBox().getY(); - double maxX = originalCell.getBBox().getMaxX(); - for (Double neighborX : xSplit) { - double x = Math.min(neighborX, maxX); - Point2D bottomRight = new Point2D.Double(x, maxY); - Cell cell = copyCell(topLeft, bottomRight); - splitCells.add(new LinkedCell(cell)); - topLeft = new Point2D.Double(x, y); - } - return splitCells; - } - - - private Cell copyCell(Point2D topLeft, Point2D bottomRight) { - - Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform); - cell.setHeaderCell(originalCell.isHeaderCell()); - cell.setTextBlocks(originalCell.getTextBlocks()); - return cell; - } - - - public void resetNeighbours() { - - rights.clear(); - lefts.clear(); - aboves.clear(); - belows.clear(); - } - - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index 4617101..e1208a4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -54,11 +54,8 @@ public class LayoutGridService { layers.add(layoutGrid); if (document.layoutDebugLayer().isActive()) { layers.add(document.layoutDebugLayer()); - } - viewerDocumentService.addLayerGroups(originFile, destinationFile, layers, layoutParserVersion, layoutParsingTypeName, outline); - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 5866717..43e893b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 8152a60..3432ff0 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Downloads/2021-2048323.pdf"; + String filePath = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf"; runForFile(filePath); } @@ -46,8 +46,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test public void testLayoutParserEndToEndWithIdpResult() { - String filePath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/document.pdf"; - String idpResultPath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/idpResult.json"; + String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/viewerDocument.pdf"; + String idpResultPath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/idpResult.json"; runForFile(filePath, idpResultPath); } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java index 0a9f6b9..97660e9 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/PDFTronViewerDocumentService.java @@ -134,6 +134,14 @@ public class PDFTronViewerDocumentService { } + @SneakyThrows + @Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations") + public void addLayerGroups(File originFile, File destinationFile, List layerGroups) { + + addLayerGroups(originFile, destinationFile, layerGroups, null, null, new Outline()); + } + + private static Set mapMarkedContentNames(List layerGroups) { return layerGroups.stream() diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtility.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtility.java index 230aa7a..e2f871e 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtility.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/service/ViewerDocVersioningUtility.java @@ -26,11 +26,16 @@ public class ViewerDocVersioningUtility { pdfDoc.getDocInfo().setAuthor(AUTHOR); pdfDoc.getDocInfo().setKeywords(CUSTOM_DICT + ":" + currentVersion); - - Obj versionInfo = pdfDoc.getSDFDoc().createIndirectDict(); - versionInfo.putString("LayoutParserVersion", layoutParserVersion); - versionInfo.putString("LayoutParsingType", layoutParsingType); - pdfDoc.getRoot().put("KneconVersionInfo", versionInfo); + if (layoutParserVersion != null || layoutParsingType != null) { + Obj versionInfo = pdfDoc.getSDFDoc().createIndirectDict(); + if (layoutParserVersion != null) { + versionInfo.putString("LayoutParserVersion", layoutParserVersion); + } + if (layoutParsingType != null) { + versionInfo.putString("LayoutParsingType", layoutParsingType); + } + pdfDoc.getRoot().put("KneconVersionInfo", versionInfo); + } }