diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index 16a468e..d113dfa 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -6,12 +6,14 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import lombok.AllArgsConstructor; import lombok.Data; +import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; @Data @AllArgsConstructor @NoArgsConstructor -public abstract class AbstractPageBlock { +@EqualsAndHashCode(callSuper = true) +public abstract class AbstractPageBlock extends Rectangle { @JsonIgnore protected float minX; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java index 3036d94..3d726d8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/TableCell.java @@ -84,14 +84,16 @@ public class TableCell implements GenericSemanticNode { private TextBlock buildTextBlock() { - return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + return streamAllSubNodes().filter(SemanticNode::isLeaf) + .map(SemanticNode::getLeafTextBlock) + .collect(new TextBlockCollector()); } @Override public String toString() { - return treeId + ": " + NodeType.TABLE_CELL + ": " + this.buildTextBlock().buildSummary(); + return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 7dfce70..2f0de29 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -36,6 +37,12 @@ public class Cell extends Rectangle { } + public Cell(Rectangle2D r) { + + super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight()); + } + + public void addTextBlock(TextPageBlock textBlock) { textBlocks.add(textBlock); @@ -76,14 +83,4 @@ public class Cell extends Rectangle { return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE; } - public boolean nearlyIntersects(Cell other) { - - if (this.getHeight() <= 0 || other.getHeight() <= 0) { - return false; - } - double x0 = this.getX() + 2; - double y0 = this.getY() + 2; - return (other.x + other.width > x0 && other.y + other.height > y0 && other.x < x0 + this.getWidth() - 2 && other.y < y0 + this.getHeight() - 2); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index 6acf5e1..7586258 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -20,7 +20,8 @@ import lombok.extern.slf4j.Slf4j; @SuppressWarnings("all") public class Ruling extends Line2D.Float { - private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2; + public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2; + public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2; public Ruling(Point2D p1, Point2D p2) { @@ -110,8 +111,8 @@ public class Ruling extends Line2D.Float { }); for (Ruling h : horizontals) { - sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); - sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h)); + sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h)); + sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h)); } for (Ruling v : verticals) { @@ -151,7 +152,7 @@ public class Ruling extends Line2D.Float { if (i == null) { continue; } - rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)}); + rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)}); } catch (UnsupportedOperationException e) { log.info("Some line are oblique, ignoring..."); continue; @@ -267,7 +268,7 @@ public class Ruling extends Line2D.Float { } - public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) { + public boolean nearlyIntersects(Ruling another) { if (this.intersectsLine(another)) { return true; @@ -276,9 +277,9 @@ public class Ruling extends Line2D.Float { boolean rv = false; if (this.perpendicularTo(another)) { - rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another); + rv = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT).intersectsLine(another); } else { - rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount)); + rv = this.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT).intersectsLine(another.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT)); } return rv; @@ -319,8 +320,8 @@ public class Ruling extends Line2D.Float { public Point2D intersectionPoint(Ruling other) { - Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT); - Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT); + Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); + Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT); Ruling horizontal, vertical; if (!this_l.intersectsLine(other_l)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 8e91dae..020dca6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -11,6 +12,7 @@ import java.util.TreeMap; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.Getter; import lombok.Setter; @@ -19,6 +21,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class TablePageBlock extends AbstractPageBlock { + public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98; private final TreeMap cellTreeMap = new TreeMap<>(); private final int rotation; @@ -93,7 +96,7 @@ public class TablePageBlock extends AbstractPageBlock { /** * Detect header cells (either first row or first column): - * Column is marked as header if cell text is bold and row cell text is not bold. + * Column is marked as header if originalCell text is bold and row originalCell text is not bold. * Defaults to row. */ private void computeHeaders() { @@ -101,7 +104,7 @@ public class TablePageBlock extends AbstractPageBlock { if (rows == null) { rows = computeRows(); } - // A bold cell is a header cell as long as every cell to the left/top is bold, too + // A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too // we move from left to right and top to bottom for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { List rowCells = rows.get(rowIndex); @@ -257,15 +260,19 @@ public class TablePageBlock extends AbstractPageBlock { for (Float x : sortedUniqueX) { if (prevY != null && prevX != null) { - var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); + var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); - if (cell.hasMinimumSize()) { + if (cellFromGridStructure.hasMinimumSize()) { cells.stream() - .filter(cell::nearlyIntersects) - .forEach(intersectingCell -> cell.getTextBlocks().addAll(intersectingCell.getTextBlocks())); + .map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell))) + .filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0) + .filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD) + .max(Comparator.comparing(CellWithIntersection::intersectedArea)) + .map(CellWithIntersection::originalCell) + .ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks())); - row.add(cell); + row.add(cellFromGridStructure); } } prevX = x; @@ -405,4 +412,9 @@ public class TablePageBlock extends AbstractPageBlock { return sb.toString(); } + + record CellWithIntersection(Cell originalCell, double intersectedArea) { + + } + } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index 2a18dc0..c51c90b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -1,21 +1,21 @@ package com.knecon.fforesight.service.layoutparser.processor.services; -import java.awt.geom.Line2D; +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR; + import java.awt.geom.Point2D; import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; -import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -25,26 +25,145 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RulingCleaningService { - private static final float THRESHOLD_Y = 6; - private static final float THRESHOLD_X = 2; + private static final float THRESHOLD_X_VERTICAL = 1; + private static final float THRESHOLD_Y_VERTICAL = 2; + private static final float THRESHOLD_X_HORIZONTAL = 2; + private static final float THRESHOLD_Y_HORIZONTAL = 3; public CleanRulings getCleanRulings(List tableCells, List rulings) { + Rulings verticalAndHorizontalRulingLines; + if (!rulings.isEmpty()) { - snapPoints(rulings); + verticalAndHorizontalRulingLines = extractVerticalAndHorizontalRulingLines(rulings); + } else { + verticalAndHorizontalRulingLines = getRulingsFromParsedCells(tableCells); } + verticalAndHorizontalRulingLines.verticalLines.sort(X_FIRST_RULING_COMPARATOR); + verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR); + verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines); + + return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build(); + } + + + private Rulings cleanRulings(Rulings rulings) { + + List> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream() + .map(RulingCleaningService::getOverlapRectangle) + .distinct() + .toList()); + List cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream() + .map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList))) + .toList(); + + List> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream() + .map(RulingCleaningService::getOverlapRectangle) + .distinct() + .toList()); + + List cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream() + .map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList))) + .collect(Collectors.toList()); + + return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings); + } + + + private List> groupOverlappingRectangles(List rectangles) { + + UnionFind unionFind = new UnionFind<>(); + for (int i = 0; i < rectangles.size(); i++) { + for (int j = i + 1; j < rectangles.size(); j++) { + Rectangle rectangle1 = rectangles.get(i); + Rectangle rectangle2 = rectangles.get(j); + + // we can stop early when we are too far off because of x-y-sorting + if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) { + break; + } + + if (rectangle1.intersects(rectangle2)) { + unionFind.union(rectangle1, rectangle2); + } + } + } + + Map> groups = new HashMap<>(); + for (Rectangle rectangle : rectangles) { + Rectangle root = unionFind.find(rectangle); + groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle); + } + return new ArrayList<>(groups.values()); + } + + + private static Rectangle getOverlapRectangle(Ruling ruling) { + + float top; + float left; + float w; + float h; + + if (ruling.x1 < ruling.x2) { + left = ruling.x1; + w = ruling.x2 - ruling.x1; + } else { + left = ruling.x2; + w = ruling.x1 - ruling.x2; + } + if (ruling.y1 < ruling.y2) { + top = ruling.y1; + h = ruling.y2 - ruling.y1; + } else { + top = ruling.y2; + h = ruling.y1 - ruling.y2; + } + + if (ruling.horizontal()) { + return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL); + } else { + return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL); + } + } + + + public static Ruling getXCenteredRuling(Rectangle rectangle) { + + float x = (float) rectangle.getCenterX(); + float y1 = rectangle.getTop(); + float y2 = rectangle.getBottom(); + + Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL); + Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL); + + return new Ruling(point1, point2); + } + + + public static Ruling getYCenteredRuling(Rectangle rectangle) { + + float x1 = rectangle.getLeft(); + float x2 = rectangle.getRight(); + float y = (float) rectangle.getCenterY(); + + Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y); + Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y); + + return new Ruling(point1, point2); + } + + + private Rulings extractVerticalAndHorizontalRulingLines(List rulings) { + List vrs = new ArrayList<>(); for (Ruling vr : rulings) { if (vr.vertical()) { vrs.add(vr); } } - if (vrs.isEmpty()) { - vrs.addAll(extractVerticalRulings(tableCells)); - } - List verticalRulingLines = collapseOrientedRulings(vrs); List hrs = new ArrayList<>(); for (Ruling hr : rulings) { @@ -52,98 +171,26 @@ public class RulingCleaningService { hrs.add(hr); } } - if (hrs.isEmpty()) { - hrs.addAll(extractHorizontalRulings(tableCells)); - } - List horizontalRulingLines = collapseOrientedRulings(hrs); - - return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build(); + return new Rulings(vrs, hrs); } - public void snapPoints(List rulings) { + private Rulings getRulingsFromParsedCells(List tableCells) { - // collect points and keep a Line -> p1,p2 map - Map linesToPoints = new HashMap<>(); - List points = new ArrayList<>(); - for (Line2D.Float r : rulings) { - Point2D p1 = r.getP1(); - Point2D p2 = r.getP2(); - linesToPoints.put(r, new Point2D[]{p1, p2}); - points.add(p1); - points.add(p2); - } - - // snap by X - points.sort(Comparator.comparingDouble(Point2D::getX)); - - List> groupedPoints = new ArrayList<>(); - groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0)))); - - for (Point2D p : points.subList(1, points.size() - 1)) { - List last = groupedPoints.get(groupedPoints.size() - 1); - if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD_X) { - groupedPoints.get(groupedPoints.size() - 1).add(p); - } else { - groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); - } - } - - for (List group : groupedPoints) { - float avgLoc = 0; - for (Point2D p : group) { - avgLoc += p.getX(); - } - avgLoc /= group.size(); - for (Point2D p : group) { - p.setLocation(avgLoc, p.getY()); - } - } - // --- - - // snap by Y - points.sort(Comparator.comparingDouble(Point2D::getY)); - - groupedPoints = new ArrayList<>(); - groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0)))); - - for (Point2D p : points.subList(1, points.size() - 1)) { - List last = groupedPoints.get(groupedPoints.size() - 1); - if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD_Y) { - groupedPoints.get(groupedPoints.size() - 1).add(p); - } else { - groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); - } - } - - for (List group : groupedPoints) { - float avgLoc = 0; - for (Point2D p : group) { - avgLoc += p.getY(); - } - avgLoc /= group.size(); - for (Point2D p : group) { - p.setLocation(p.getX(), avgLoc); - } - } - // --- - - // finally, modify lines - for (Map.Entry ltp : linesToPoints.entrySet()) { - Point2D[] p = ltp.getValue(); - ltp.getKey().setLine(p[0], p[1]); - } + List vrs = extractVerticalRulingsFromParsedCells(tableCells); + List hrs = extractHorizontalRulingsFromParsedCells(tableCells); + return new Rulings(vrs, hrs); } - private Collection extractVerticalRulings(List cvParsedTableCells) { + private List extractVerticalRulingsFromParsedCells(List tableCells) { List vrs = new ArrayList<>(); - if (cvParsedTableCells != null) { - for (TableCells cvParsedTableCell : cvParsedTableCells) { - Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1()); - Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1()); + if (tableCells != null) { + for (TableCells tableCell : tableCells) { + Ruling leftLine = createRuling(tableCell.getX0(), tableCell.getX0(), tableCell.getY0(), tableCell.getY1()); + Ruling rightLine = createRuling(tableCell.getX1(), tableCell.getX1(), tableCell.getY0(), tableCell.getY1()); vrs.add(leftLine); vrs.add(rightLine); } @@ -152,19 +199,18 @@ public class RulingCleaningService { } - private Collection extractHorizontalRulings(List cvParsedTableCells) { + private List extractHorizontalRulingsFromParsedCells(List tableCells) { List hrs = new ArrayList<>(); - if (cvParsedTableCells != null) { - for (TableCells cvParsedTableCell : cvParsedTableCells) { - Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1()); - Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0()); + if (tableCells != null) { + for (TableCells tableCell : tableCells) { + Ruling topLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY1(), tableCell.getY1()); + Ruling baseLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY0(), tableCell.getY0()); hrs.add(topLine); hrs.add(baseLine); } } - return hrs; } @@ -190,46 +236,8 @@ public class RulingCleaningService { } - private List collapseOrientedRulings(List lines) { + private record Rulings(List verticalLines, List horizontalLines) { - int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1; - return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT); - } - - - private List collapseOrientedRulings(List lines, int expandAmount) { - - ArrayList rv = new ArrayList<>(); - lines.sort((a, b) -> { - final float diff = a.getPosition() - b.getPosition(); - return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f); - }); - - for (Ruling next_line : lines) { - Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1); - // if current line colinear with next, and are "close enough": expand current line - if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) { - final float lastStart = last.getStart(); - final float lastEnd = last.getEnd(); - - final boolean lastFlipped = lastStart > lastEnd; - final boolean nextFlipped = next_line.getStart() > next_line.getEnd(); - - boolean differentDirections = nextFlipped != lastFlipped; - float nextS = differentDirections ? next_line.getEnd() : next_line.getStart(); - float nextE = differentDirections ? next_line.getStart() : next_line.getEnd(); - - final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart); - final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd); - last.setStartEnd(newStart, newEnd); - assert !last.oblique(); - } else if (next_line.length() == 0) { - continue; - } else { - rv.add(next_line); - } - } - return rv; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 8dd639d..2827153 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -1,13 +1,13 @@ package com.knecon.fforesight.service.layoutparser.processor.services; -import java.awt.geom.Point2D; +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR; +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR; + import java.util.ArrayList; -import java.util.Comparator; -import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; +import java.util.stream.Collectors; import org.springframework.stereotype.Service; @@ -20,66 +20,15 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; +import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder; @Service public class TableExtractionService { - private static final int MAX_TABLE_OUTER_POINT_TOLERANCE = 10; private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1; - private static final float SPREADSHEET_AREA_TOLERANCE = 0.001f; - - private static final Comparator X_FIRST_POINT_COMPARATOR = (point1, point2) -> { - - int rv = 0; - float point1X = DoubleComparisons.round(point1.getX(), 2); - float point1Y = DoubleComparisons.round(point1.getY(), 2); - float point2X = DoubleComparisons.round(point2.getX(), 2); - float point2Y = DoubleComparisons.round(point2.getY(), 2); - - if (point1X > point2X) { - rv = 1; - } else if (point1X < point2X) { - rv = -1; - } else if (point1Y > point2Y) { - rv = 1; - } else if (point1Y < point2Y) { - rv = -1; - } - return rv; - }; - private static final Comparator Y_FIRST_POINT_COMPARATOR = (point1, point2) -> { - - int rv = 0; - float point1X = DoubleComparisons.round(point1.getX(), 2); - float point1Y = DoubleComparisons.round(point1.getY(), 2); - float point2X = DoubleComparisons.round(point2.getX(), 2); - float point2Y = DoubleComparisons.round(point2.getY(), 2); - - if (point1Y > point2Y) { - rv = 1; - } else if (point1Y < point2Y) { - rv = -1; - } else if (point1X > point2X) { - rv = 1; - } else if (point1X < point2X) { - rv = -1; - } - return rv; - }; - - private static final Comparator CELL_SIZE_COMPARATOR = (cell1, cell2) -> { - - Double cell1Size = cell1.getHeight() * cell1.getWidth(); - Double cell2Size = cell2.getHeight() * cell2.getWidth(); - return cell1Size.compareTo(cell2Size); - }; - - private static final Comparator RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> { - - Double rect1Size = rect1.getHeight() * rect1.getWidth(); - Double rect2Size = rect2.getHeight() * rect2.getWidth(); - return rect1Size.compareTo(rect2Size); - }; + private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2; + private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7; /** @@ -115,7 +64,7 @@ public class TableExtractionService { cells = new ArrayList<>(new HashSet<>(cells)); DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); - List spreadsheetAreas = findSpreadsheetsFromCells(cells); + List spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); // sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first // this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR); @@ -132,10 +81,10 @@ public class TableExtractionService { var containedCellsWithText = containedCells.stream() .filter(cell -> !cell.getTextBlocks().isEmpty()) - .count(); + .toList(); // verify if table would contain fewer cells with text than the threshold allows - if (containedCellsWithText >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT) { + if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) { tables.add(new TablePageBlock(containedCells, area, page.getRotation())); cells.removeAll(containedCells); } @@ -164,6 +113,21 @@ public class TableExtractionService { } + private boolean checkIfTableCellsAreUniform(List containedCells) { + + if(containedCells.size() <= 2) { + return true; + } + + Map> cellsGroupedByRoundedWidth = containedCells.stream() + .map(Rectangle::getWidth) + .map(size -> Math.round(size / 10.0) * 10) + .collect(Collectors.groupingBy(Long::longValue)); + + return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD; + } + + private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) { double x = textBlock.getPdfMinX(); @@ -175,225 +139,19 @@ public class TableExtractionService { } double x0 = cell.getX(); double y0 = cell.getY(); - return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2); + return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE + && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE + && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE + && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE); } - private List findCells(List horizontalRulingLines, List verticalRulingLines) { - - // Fix for 211.pdf - for (Ruling r : horizontalRulingLines) { - if (r.getX2() < r.getX1()) { - double a = r.getX2(); - r.x2 = (float) r.getX1(); - r.x1 = (float) a; - } - } - - List cellsFound = new ArrayList<>(); - Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); - List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); - intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR); - - for (int i = 0; i < intersectionPointsList.size(); i++) { - Point2D topLeft = intersectionPointsList.get(i); - Ruling[] hv = intersectionPoints.get(topLeft); - - // CrossingPointsDirectlyBelow( topLeft ); - List xPoints = new ArrayList<>(); - // CrossingPointsDirectlyToTheRight( topLeft ); - List yPoints = new ArrayList<>(); - - for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) { - if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) { - xPoints.add(p); - } - if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) { - yPoints.add(p); - } - } - outer: - for (Point2D xPoint : xPoints) { - // is there a vertical edge b/w topLeft and xPoint? - if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) { - continue; - } - for (Point2D yPoint : yPoints) { - // is there a horizontal edge b/w topLeft and yPoint ? - if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) { - continue; - } - Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); - if (intersectionPoints.containsKey(btmRight) - && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) - && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { - cellsFound.add(new Cell(topLeft, btmRight)); - break outer; - } - } - } - } - - // TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid - // that aren't connected with an horizontal ruler? - // see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207 - - return cellsFound; - } - - - private List findSpreadsheetsFromCells(List cells) { - // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon - List rectangles = new ArrayList<>(); - Set pointSet = new HashSet<>(); - Map edgesH = new HashMap<>(); - Map edgesV = new HashMap<>(); - - for (Rectangle cell : cells) { - for (Point2D pt : cell.getPoints()) { - if (pointSet.contains(pt)) { // shared vertex, remove it - pointSet.remove(pt); - } else { - pointSet.add(pt); - } - } - } - - // X first sort - List pointsSortX = new ArrayList<>(pointSet); - pointsSortX.sort(X_FIRST_POINT_COMPARATOR); - // Y first sort - List pointsSortY = new ArrayList<>(pointSet); - pointsSortY.sort(Y_FIRST_POINT_COMPARATOR); - - int i = 0; - while (i < pointSet.size()) { - float currY = (float) pointsSortY.get(i).getY(); - while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) { - edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1)); - edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i)); - i += 2; - } - } - - i = 0; - while (i < pointSet.size()) { - float currX = (float) pointsSortX.get(i).getX(); - while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) { - edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1)); - edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i)); - i += 2; - } - } - - // Get all the polygons - List> polygons = new ArrayList<>(); - Point2D nextVertex; - while (!edgesH.isEmpty()) { - ArrayList polygon = new ArrayList<>(); - Point2D first = edgesH.keySet() - .iterator().next(); - polygon.add(new PolygonVertex(first, Direction.HORIZONTAL)); - edgesH.remove(first); - - while (true) { - PolygonVertex curr = polygon.get(polygon.size() - 1); - PolygonVertex lastAddedVertex; - if (curr.direction == Direction.HORIZONTAL) { - nextVertex = edgesV.get(curr.point); - edgesV.remove(curr.point); - lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL); - } else { - nextVertex = edgesH.get(curr.point); - edgesH.remove(curr.point); - lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL); - } - polygon.add(lastAddedVertex); - - if (lastAddedVertex.equals(polygon.get(0))) { - // closed polygon - polygon.remove(polygon.size() - 1); - break; - } - } - - for (PolygonVertex vertex : polygon) { - edgesH.remove(vertex.point); - edgesV.remove(vertex.point); - } - polygons.add(polygon); - } - - // calculate grid-aligned minimum area rectangles for each found polygon - for (List poly : polygons) { - float top = Float.MAX_VALUE; - float left = Float.MAX_VALUE; - float bottom = Float.MIN_VALUE; - float right = Float.MIN_VALUE; - for (PolygonVertex pt : poly) { - top = (float) Math.min(top, pt.point.getY()); - left = (float) Math.min(left, pt.point.getX()); - bottom = (float) Math.max(bottom, pt.point.getY()); - right = (float) Math.max(right, pt.point.getX()); - } - - // do not add polygons with too many outer points as they are unlikely to be tables - if (poly.size() <= MAX_TABLE_OUTER_POINT_TOLERANCE) { - rectangles.add(new Rectangle(top - SPREADSHEET_AREA_TOLERANCE, - left - SPREADSHEET_AREA_TOLERANCE, - right - left + 2 * SPREADSHEET_AREA_TOLERANCE, - bottom - top + 2 * SPREADSHEET_AREA_TOLERANCE)); - } - } - - return rectangles; - } - - - private enum Direction { - HORIZONTAL, - VERTICAL - } - - static class PolygonVertex { - - Point2D point; - Direction direction; - - - PolygonVertex(Point2D point, Direction direction) { - - this.direction = direction; - this.point = point; - } - - - @Override - public boolean equals(Object other) { - - if (this == other) { - return true; - } - if (!(other instanceof PolygonVertex)) { - return false; - } - return this.point.equals(((PolygonVertex) other).point); - } - - - @Override - public int hashCode() { - - return this.point.hashCode(); - } - - - @Override - public String toString() { - - return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString()); - } + public static List findCells(List horizontalRulingLines, List verticalRulingLines) { + return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines) + .stream() + .map(Cell::new) + .collect(Collectors.toList()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 6bde310..20440eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -105,8 +105,8 @@ public class DocumentGraphFactory { .build(); page.getMainBody().add(imageNode); - List tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode); - imageNode.setTreeId(tocId); + List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode); + imageNode.setTreeId(treeId); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index a2ebcc0..92ff832 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -188,14 +188,33 @@ public class LayoutGridService { @SneakyThrows private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) { - Point2D.Float upperLeftCorner = switch (page.getRotation()) { - case 90 -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMinY()); - case 180 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMinY()); - case 270 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMaxY()); - default -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMaxY()); - }; + // translates text, such that its right edge is a bit to the left of the drawn box + float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4); + + Point2D upperLeftCorner; + Point2D translationVector; + switch (page.getRotation()) { + case 90 -> { + upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY()); + translationVector = new Point2D.Double(FONT_SIZE, -translationAmount); + } + case 180 -> { + upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY()); + translationVector = new Point2D.Double(translationAmount, FONT_SIZE); + } + case 270 -> { + upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY()); + translationVector = new Point2D.Double(-FONT_SIZE, translationAmount); + } + default -> { + upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY()); + translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE); + } + } + + upperLeftCorner = add(upperLeftCorner, translationVector); + var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts(); - upperLeftCorner.setLocation(upperLeftCorner.getX() - ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), upperLeftCorner.getY() - FONT_SIZE); placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT)); } @@ -317,4 +336,10 @@ public class LayoutGridService { .add(new ColoredRectangle(textBBox, color, LINE_WIDTH))); } + + private Point2D add(Point2D a, Point2D b) { + + return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY()); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DrawingOptions.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DrawingOptions.java new file mode 100644 index 0000000..341e127 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DrawingOptions.java @@ -0,0 +1,28 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.awt.Color; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@Builder +@AllArgsConstructor +@Getter +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class DrawingOptions { + + boolean stroke; + @Builder.Default + Color strokeColor = Color.BLACK; + @Builder.Default + float strokeWidth = 1f; + + boolean fill; + @Builder.Default + Color fillColor = Color.BLACK; + +} + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java new file mode 100644 index 0000000..c21b516 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java @@ -0,0 +1,88 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.awt.geom.Point2D; +import java.util.Comparator; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; + +public class GeometricComparators { + + private static final int COMPARATOR_ROUNDING = 2; + + public static final Comparator X_FIRST_POINT_COMPARATOR = (point1, point2) -> { + + int rv = 0; + float point1X = DoubleComparisons.round(point1.getX(), COMPARATOR_ROUNDING); + float point1Y = DoubleComparisons.round(point1.getY(), COMPARATOR_ROUNDING); + float point2X = DoubleComparisons.round(point2.getX(), COMPARATOR_ROUNDING); + float point2Y = DoubleComparisons.round(point2.getY(), COMPARATOR_ROUNDING); + + if (point1X > point2X) { + rv = 1; + } else if (point1X < point2X) { + rv = -1; + } else if (point1Y > point2Y) { + rv = 1; + } else if (point1Y < point2Y) { + rv = -1; + } + return rv; + }; + + public static final Comparator Y_FIRST_POINT_COMPARATOR = (point1, point2) -> { + + int rv = 0; + float point1X = DoubleComparisons.round(point1.getX(), COMPARATOR_ROUNDING); + float point1Y = DoubleComparisons.round(point1.getY(), COMPARATOR_ROUNDING); + float point2X = DoubleComparisons.round(point2.getX(), COMPARATOR_ROUNDING); + float point2Y = DoubleComparisons.round(point2.getY(), COMPARATOR_ROUNDING); + + if (point1Y > point2Y) { + rv = 1; + } else if (point1Y < point2Y) { + rv = -1; + } else if (point1X > point2X) { + rv = 1; + } else if (point1X < point2X) { + rv = -1; + } + return rv; + }; + + public static final Comparator CELL_SIZE_COMPARATOR = (cell1, cell2) -> { + + Double cell1Size = cell1.getHeight() * cell1.getWidth(); + Double cell2Size = cell2.getHeight() * cell2.getWidth(); + return cell1Size.compareTo(cell2Size); + }; + + public static final Comparator RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> { + + Double rect1Size = rect1.getHeight() * rect1.getWidth(); + Double rect2Size = rect2.getHeight() * rect2.getWidth(); + return rect1Size.compareTo(rect2Size); + }; + + public static final Comparator X_FIRST_RULING_COMPARATOR = (ruling1, ruling2) -> { + + int rv = 0; + float point1X = DoubleComparisons.round(Math.min(ruling1.getLeft(), ruling1.getRight()), COMPARATOR_ROUNDING); + float point1Y = DoubleComparisons.round(Math.min(ruling1.getTop(), ruling1.getBottom()), COMPARATOR_ROUNDING); + float point2X = DoubleComparisons.round(Math.min(ruling2.getLeft(), ruling2.getRight()), COMPARATOR_ROUNDING); + float point2Y = DoubleComparisons.round(Math.min(ruling2.getTop(), ruling2.getBottom()), COMPARATOR_ROUNDING); + + if (point1X > point2X) { + rv = 1; + } else if (point1X < point2X) { + rv = -1; + } else if (point1Y > point2Y) { + rv = 1; + } else if (point1Y < point2Y) { + rv = -1; + } + return rv; + }; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java index b950340..bf64c12 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java @@ -21,11 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; -import lombok.AccessLevel; -import lombok.Builder; -import lombok.Getter; import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; @@ -41,20 +37,20 @@ public class PdfVisualisationUtility { public void drawNode(PDDocument document, DocumentTree.Entry entry) { - Options options = buildStandardOptionsForNodes(entry); + DrawingOptions options = buildStandardOptionsForNodes(entry); drawBBoxAndLabelAndNumberOnPage(document, entry, options); } - public void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) { + public void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) { textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options)); } - public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) { + public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) { drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options); @@ -62,7 +58,7 @@ public class PdfVisualisationUtility { @SneakyThrows - public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options) { + public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options) { var pdPage = document.getPage(pageNumber - 1); var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); @@ -80,14 +76,14 @@ public class PdfVisualisationUtility { @SneakyThrows - public void drawRectangle2DList(PDDocument document, int pageNumber, List rectCollection, Options options) { + public void drawRectangle2DList(PDDocument document, int pageNumber, List rectCollection, DrawingOptions options) { var pdPage = document.getPage(pageNumber - 1); drawRectangle2DList(document, rectCollection, options, pdPage); } - private void drawRectangle2DList(PDDocument document, List rectCollection, Options options, PDPage pdPage) throws IOException { + private void drawRectangle2DList(PDDocument document, List rectCollection, DrawingOptions options, PDPage pdPage) throws IOException { var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); @@ -110,9 +106,9 @@ public class PdfVisualisationUtility { } - private Options buildStandardOptionsForNodes(DocumentTree.Entry entry) { + private DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) { - return Options.builder().stroke(true).strokeColor(switch (entry.getType()) { + return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) { case DOCUMENT -> Color.LIGHT_GRAY; case HEADER, FOOTER -> Color.GREEN; case PARAGRAPH -> Color.BLUE; @@ -125,7 +121,7 @@ public class PdfVisualisationUtility { } - private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) { + private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) { Map rectanglesPerPage = entry.getNode().getBBox(); rectanglesPerPage.forEach((page, rectangle2D) -> { @@ -152,7 +148,7 @@ public class PdfVisualisationUtility { @SneakyThrows - public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List line2DS, Options options) { + public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List line2DS, DrawingOptions options) { var pdPage = pdDocument.getPage(pageNumber - 1); var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true); @@ -176,21 +172,4 @@ public class PdfVisualisationUtility { contentStream.close(); } - - @Builder - @Getter - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - public static class Options { - - boolean fill; - boolean stroke; - @Builder.Default - Color strokeColor = Color.BLACK; - @Builder.Default - float strokeWidth = 1f; - @Builder.Default - Color fillColor = Color.BLACK; - - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 1a49607..70e9460 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import static java.lang.String.format; +import java.awt.geom.Area; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; import java.util.Collections; @@ -37,15 +38,28 @@ public class RectangleTransformations { } + public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) { + + Area a1 = new Area(r1); + Area a2 = new Area(r2); + a1.intersect(a2); + Rectangle2D intersection = a1.getBounds2D(); + return intersection.getWidth() * intersection.getHeight(); + } + + public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); } + + public static Collector collectBBox() { return new Rectangle2DBBoxCollector(); } + public static PDRectangle toPDRectangleBBox(List rectangles) { Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles); @@ -70,6 +84,7 @@ public class RectangleTransformations { return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); } + public static Rectangle2D rectangleBBox(List rectangles) { return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); @@ -84,6 +99,7 @@ public class RectangleTransformations { -redactionLogRectangle.getHeight()); } + public static Rectangle2D toRectangle2D(PDRectangle rectangle) { return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java new file mode 100644 index 0000000..3f47b40 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java @@ -0,0 +1,77 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR; + +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; + +public class RectangularIntersectionFinder { + + public static List find(List horizontalRulingLines, List verticalRulingLines) { + + // Fix for 211.pdf + for (Ruling r : horizontalRulingLines) { + if (r.getX2() < r.getX1()) { + double a = r.getX2(); + r.x2 = (float) r.getX1(); + r.x1 = (float) a; + } + } + + List foundRectangles = new ArrayList<>(); + Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); + List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); + intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR); + + for (int i = 0; i < intersectionPointsList.size(); i++) { + Point2D topLeft = intersectionPointsList.get(i); + Ruling[] hv = intersectionPoints.get(topLeft); + + // CrossingPointsDirectlyBelow( topLeft ); + List xPoints = new ArrayList<>(); + // CrossingPointsDirectlyToTheRight( topLeft ); + List yPoints = new ArrayList<>(); + + for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) { + if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) { + xPoints.add(p); + } + if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) { + yPoints.add(p); + } + } + outer: + for (Point2D xPoint : xPoints) { + // is there a vertical edge b/w topLeft and xPoint? + if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) { + continue; + } + for (Point2D yPoint : yPoints) { + // is there a horizontal edge b/w topLeft and yPoint ? + if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) { + continue; + } + Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); + if (intersectionPoints.containsKey(btmRight) + && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) + && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { + foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY())); + break outer; + } + } + } + } + + // TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid + // that aren't connected with an horizontal ruler? + // see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207 + + return foundRectangles; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java new file mode 100644 index 0000000..660ef3f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/SpreadsheetFinder.java @@ -0,0 +1,172 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_POINT_COMPARATOR; +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; + +public class SpreadsheetFinder { + + private static final int MAX_OUTER_POINT_TOLERANCE = 10; + private static final float AREA_TOLERANCE = 0.001f; + + + public static List findSpreadsheetsFromCells(List cells) { + // via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon + List rectangles = new ArrayList<>(); + Set pointSet = new HashSet<>(); + Map edgesH = new HashMap<>(); + Map edgesV = new HashMap<>(); + + for (Rectangle cell : cells) { + for (Point2D pt : cell.getPoints()) { + if (pointSet.contains(pt)) { // shared vertex, remove it + pointSet.remove(pt); + } else { + pointSet.add(pt); + } + } + } + + // X first sort + List pointsSortX = new ArrayList<>(pointSet); + pointsSortX.sort(X_FIRST_POINT_COMPARATOR); + // Y first sort + List pointsSortY = new ArrayList<>(pointSet); + pointsSortY.sort(Y_FIRST_POINT_COMPARATOR); + + int i = 0; + while (i < pointSet.size()) { + float currY = (float) pointsSortY.get(i).getY(); + while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) { + edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1)); + edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i)); + i += 2; + } + } + + i = 0; + while (i < pointSet.size()) { + float currX = (float) pointsSortX.get(i).getX(); + while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) { + edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1)); + edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i)); + i += 2; + } + } + + // Get all the polygons + List> polygons = new ArrayList<>(); + Point2D nextVertex; + while (!edgesH.isEmpty()) { + ArrayList polygon = new ArrayList<>(); + Point2D first = edgesH.keySet() + .iterator().next(); + polygon.add(new PolygonVertex(first, Direction.HORIZONTAL)); + edgesH.remove(first); + + while (true) { + PolygonVertex curr = polygon.get(polygon.size() - 1); + PolygonVertex lastAddedVertex; + if (curr.direction == Direction.HORIZONTAL) { + nextVertex = edgesV.get(curr.point); + edgesV.remove(curr.point); + lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL); + } else { + nextVertex = edgesH.get(curr.point); + edgesH.remove(curr.point); + lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL); + } + polygon.add(lastAddedVertex); + + if (lastAddedVertex.equals(polygon.get(0))) { + // closed polygon + polygon.remove(polygon.size() - 1); + break; + } + } + + for (PolygonVertex vertex : polygon) { + edgesH.remove(vertex.point); + edgesV.remove(vertex.point); + } + polygons.add(polygon); + } + + // calculate grid-aligned minimum area rectangles for each found polygon + for (List poly : polygons) { + float top = Float.MAX_VALUE; + float left = Float.MAX_VALUE; + float bottom = Float.MIN_VALUE; + float right = Float.MIN_VALUE; + for (PolygonVertex pt : poly) { + top = (float) Math.min(top, pt.point.getY()); + left = (float) Math.min(left, pt.point.getX()); + bottom = (float) Math.max(bottom, pt.point.getY()); + right = (float) Math.max(right, pt.point.getX()); + } + + // do not add polygons with too many outer points as they are unlikely to be tables + if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) { + rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE)); + } + } + return rectangles; + } + + + private enum Direction { + HORIZONTAL, + VERTICAL + } + + static class PolygonVertex { + + Point2D point; + Direction direction; + + + PolygonVertex(Point2D point, Direction direction) { + + this.direction = direction; + this.point = point; + } + + + @Override + public boolean equals(Object other) { + + if (this == other) { + return true; + } + if (!(other instanceof PolygonVertex)) { + return false; + } + return this.point.equals(((PolygonVertex) other).point); + } + + + @Override + public int hashCode() { + + return this.point.hashCode(); + } + + + @Override + public String toString() { + + return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString()); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/UnionFind.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/UnionFind.java new file mode 100644 index 0000000..d6af3fa --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/UnionFind.java @@ -0,0 +1,44 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.util.HashMap; +import java.util.Map; + +// simple implementation of a disjoint-set data structure +// https://en.wikipedia.org/wiki/Disjoint-set_data_structure +public class UnionFind { + + Map parents = new HashMap<>(); + Map numberOfObjects = new HashMap<>(); + + + public T find(T node) { + + if (!parents.containsKey(node)) { + parents.put(node, node); + numberOfObjects.put(node, 1); + } + if (!node.equals(parents.get(node))) { + parents.put(node, find(parents.get(node))); + } + return parents.get(node); + } + + + public void union(T node1, T node2) { + + T root1 = find(node1); + T root2 = find(node2); + + if (!root1.equals(root2)) { + if (numberOfObjects.getOrDefault(root1, 1) < numberOfObjects.getOrDefault(root2, 1)) { + parents.put(root1, root2); + numberOfObjects.put(root2, numberOfObjects.get(root2) + numberOfObjects.get(root1)); + } else { + parents.put(root2, root1); + numberOfObjects.put(root1, numberOfObjects.get(root1) + numberOfObjects.get(root2)); + } + } + } + +} + diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 4b2358e..1f62c3f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -29,6 +29,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions; import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -111,7 +112,7 @@ public class BdrJsonBuildTest extends AbstractTest { try (PDDocument pdDocument = Loader.loadPDF(file); var outputStream = new FileOutputStream(resultingFileName)) { PdfDraw.drawDocumentGraph(pdDocument, document); - PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); + PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); pdDocument.save(outputStream); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index 5f150e2..1e98204 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -13,6 +13,7 @@ import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -70,7 +71,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest { try (PDDocument pdDocument = Loader.loadPDF(fileResource.getFile())) { log.info("drawing document"); PdfDraw.drawDocumentGraph(pdDocument, documentGraph); - PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); + PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); log.info("saving document"); pdDocument.save(tmpFile); log.info("saved document"); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index b437809..2db3906 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index a671655..a9400a6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -681,7 +681,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 4); validateTable(document, 0, 3, 3, 0, 0); - validateTable(document, 1, 3, 5, 2, 0); + validateTable(document, 1, 3, 6, 2, 0); validateTable(document, 2, 3, 3, 1, 0); validateTable(document, 3, 3, 3, 0, 0); @@ -742,13 +742,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - validateTableSize(document, 6); + validateTableSize(document, 5); validateTable(document, 0, 1, 1, 0, 0); validateTable(document, 1, 1, 1, 0, 0); validateTable(document, 2, 1, 1, 0, 0); validateTable(document, 3, 1, 1, 0, 0); validateTable(document, 4, 1, 1, 0, 0); - validateTable(document, 5, 1, 1, 0, 0); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 92d31c1..fb3280a 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -1,13 +1,17 @@ package com.knecon.fforesight.service.layoutparser.server.services; +import java.awt.Color; +import java.awt.geom.Rectangle2D; import java.nio.file.Files; import java.nio.file.Path; import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.stream.Collectors; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.core.io.ClassPathResource; @@ -26,6 +30,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; +import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -34,19 +40,40 @@ import lombok.SneakyThrows; public class RulingCleaningServiceTest extends BuildDocumentTest { @Test -// @Disabled + @Disabled + @SneakyThrows + public void textRectanglesFromRulingsExtraction() { + + String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf"; + String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_CELLS.pdf"; + List pageContents = PageContentExtractor.getSortedPageContents(fileName); + RulingCleaningService rulingCleaningService = new RulingCleaningService(); + List> rectanglesPerPage = new LinkedList<>(); + for (PageContents pageContent : pageContents) { + CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()); + List rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + rectanglesPerPage.add(rects); + } + + PdfDraw.drawRectanglesPerPage(fileName, rectanglesPerPage, lineFileName, DrawingOptions.builder().stroke(true).strokeColor(Color.RED).build()); + } + + + @Test + @Disabled @SneakyThrows public void textRulingExtraction() { - String fileName = "files/211.pdf"; + String fileName = "files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf"; String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; List pageContents = PageContentExtractor.getSortedPageContents(fileName); RulingCleaningService rulingCleaningService = new RulingCleaningService(); - PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName); List cleanRulingsPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings())); } + var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList()); + PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index 5576017..4e3280f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -24,20 +24,31 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions; import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Getter; import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; import lombok.experimental.UtilityClass; @UtilityClass public class PdfDraw { + public static void drawRectanglesPerPage(String filename, List> rectanglesPerPage, String tmpFileName, DrawingOptions options) throws IOException { + + ClassPathResource pdfResource = new ClassPathResource(filename); + try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) { + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + rectanglesPerPage.get(pageNumber - 1), + options); + } + pdDocument.save(out); + } + + } + public static void drawRectanglesPerPage(String filename, List> rectanglesPerPage, String tmpFileName) throws IOException { ClassPathResource pdfResource = new ClassPathResource(filename); @@ -46,7 +57,7 @@ public class PdfDraw { PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesPerPage.get(pageNumber - 1), - PdfVisualisationUtility.Options.builder().stroke(true).build()); + DrawingOptions.builder().stroke(true).build()); } pdDocument.save(out); } @@ -62,13 +73,13 @@ public class PdfDraw { var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1); for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) { var rectanglesInLine = rectanglesOnPage.get(lineNumber); - PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build()); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, DrawingOptions.builder().stroke(true).build()); double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY()); PdfVisualisationUtility.drawText(String.format("%d", lineNumber), pdDocument, new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2), pageNumber, - PdfVisualisationUtility.Options.builder().stroke(true).build()); + DrawingOptions.builder().stroke(true).build()); } } pdDocument.save(out); @@ -99,20 +110,20 @@ public class PdfDraw { public static void drawNode(PDDocument document, DocumentTree.Entry entry) { - Options options = buildStandardOptionsForNodes(entry); + DrawingOptions options = buildStandardOptionsForNodes(entry); drawBBoxAndLabelAndNumberOnPage(document, entry, options); } - public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) { + public static void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) { textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options)); } - public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) { + public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) { drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options); @@ -120,7 +131,7 @@ public class PdfDraw { @SneakyThrows - private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) { + private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options, boolean rotate) { var pdPage = document.getPage(pageNumber - 1); var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); @@ -142,14 +153,14 @@ public class PdfDraw { @SneakyThrows - public static void drawRectangle2DList(PDDocument document, int pageNumber, List rectCollection, Options options) { + public static void drawRectangle2DList(PDDocument document, int pageNumber, List rectCollection, DrawingOptions options) { var pdPage = document.getPage(pageNumber - 1); drawRectangle2DList(document, rectCollection, options, pdPage); } - private static void drawRectangle2DList(PDDocument document, List rectCollection, Options options, PDPage pdPage) throws IOException { + private static void drawRectangle2DList(PDDocument document, List rectCollection, DrawingOptions options, PDPage pdPage) throws IOException { var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true); @@ -181,12 +192,12 @@ public class PdfDraw { // PdfVisualisationUtility.drawLine2DList(pdDocument, // pageNumber, // list.get(pageNumber - 1), -// PdfVisualisationUtility.Options.builder().stroke(true).build()); +// PdfVisualisationUtility.DrawingOptions.builder().stroke(true).build()); PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesPerPage.get(pageNumber - 1), - PdfVisualisationUtility.Options.builder().stroke(true).build()); - PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build()); + DrawingOptions.builder().stroke(true).build()); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), DrawingOptions.builder().stroke(true).build()); } pdDocument.save(out); } @@ -202,35 +213,18 @@ public class PdfDraw { PdfVisualisationUtility.drawLine2DList(pdDocument, pageNumber, linesPerPage.get(pageNumber - 1), - PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build()); + DrawingOptions.builder().strokeColor(Color.RED).stroke(true).build()); } pdDocument.save(out); } } - @Builder - @AllArgsConstructor - @Getter - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - public static class Options { - - boolean stroke; - @Builder.Default - Color strokeColor = Color.BLACK; - @Builder.Default - float strokeWidth = 1f; - - boolean fill; - @Builder.Default - Color fillColor = Color.BLACK; - - } - private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) { + private static DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) { - return Options.builder().stroke(true).strokeColor(switch (entry.getType()) { + return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) { case DOCUMENT -> Color.LIGHT_GRAY; case HEADER, FOOTER -> Color.GREEN; case PARAGRAPH -> Color.BLUE; @@ -243,7 +237,7 @@ public class PdfDraw { } - private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) { + private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) { Map rectanglesPerPage = entry.getNode().getBBox(); for (Page page : rectanglesPerPage.keySet()) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf new file mode 100644 index 0000000..4e18c90 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/DontMergeNonConsecutiveTables.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Meto_vol2_Page10.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Meto_vol2_Page10.pdf new file mode 100644 index 0000000..10ce129 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/Meto_vol2_Page10.pdf differ