RED-8670: add table detection from idp result

* some 'slight' refactoring
2025-01-09 13:10:49 +01:00 · 2025-01-09 13:10:49 +01:00 · 8df429730f
commit 8df429730f
parent ffd426d859
5 changed files with 381 additions and 16 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables;
 import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
 import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;

+import java.awt.Color;
 import java.awt.geom.AffineTransform;
 import java.awt.geom.Point2D;
 import java.awt.geom.Rectangle2D;
@ -108,6 +109,11 @@ public class TableExtractionService {
            if (containedCells.isEmpty()) {
                continue;
            }
+            // if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf),
+            // the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column.
+            // That's why we compute the missing Cells from the spreadsheet area and fill them in.
+            Set<Cell> missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform);
+            containedCells.addAll(missingCells);

            Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
            for (Cell cell : containedCells) {
@ -139,10 +145,14 @@ public class TableExtractionService {
    }


-    private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables,
-                                                         List<Word> words,
-                                                         AffineTransform pdfToPageTransform,
-                                                         LayoutParsingType layoutParsingType) {
+    /**
+     * Removes the words of the given table block from the supplied word list. The list is modified in place.
+     *
+     * @param words          word list to remove the table's words from
+     * @param tablePageBlock table block whose words (as returned by {@code getWords()}) are removed
+     */
+    private static void removeWordsFromCells(List<Word> words, TablePageBlock tablePageBlock) {
+
+        // Copy into a HashSet first so that removeAll checks membership against a set.
+        words.removeAll(new HashSet<>(tablePageBlock.getWords()));
+    }
+
+
+    private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables, List<Word> words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) {

        if (idpTables == null || idpTables.isEmpty()) {
            return Collections.emptyList();
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java
@ -7,7 +7,6 @@ import java.util.LinkedList;
 import java.util.List;

 import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
@ -15,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB

 import lombok.Getter;
 import lombok.Setter;
-import lombok.SneakyThrows;
 import lombok.extern.slf4j.Slf4j;

@Slf4j
@ -28,8 +26,6 @@ public class TableFromCellsExtractor {
    @Setter
    private final List<Cell> originCells;
    private final AffineTransform pdfToPageTransform;
-    private final double minCellWidth;
-    private final double minCellHeight;


    public TableFromCellsExtractor(List<Cell> originCells, AffineTransform pdfToPageTransform) {
@ -37,18 +33,15 @@ public class TableFromCellsExtractor {
        classification = PageBlockType.TABLE;
        this.originCells = originCells;
        this.pdfToPageTransform = pdfToPageTransform;
-        this.minCellHeight = originCells.stream()
-                .mapToDouble(BoundingBox::getHeight).min().orElse(0);
-        this.minCellWidth = originCells.stream()
-                .mapToDouble(BoundingBox::getWidth).min().orElse(0);
    }


-    @SneakyThrows
    public TablePageBlock extract() {

-        rows = AreaSweepGridifier.gridify(originCells, pdfToPageTransform.createInverse(), minCellWidth, minCellHeight);
+        computeRows(originCells);
+
        computeHeaders();
+
        return new TablePageBlock(null, rows);
    }

@ -126,4 +119,15 @@ public class TableFromCellsExtractor {

    }

+
+    /**
+     * Computes the grid of rows for the given cells and stores it in {@code rows}.
+     * For an empty cell list nothing is computed and {@code rows} is left untouched.
+     *
+     * @param cells the cells to arrange into a grid
+     */
+    private void computeRows(List<Cell> cells) {
+
+        if (!cells.isEmpty()) {
+            rows = new TableGridStructureCalculator(cells, pdfToPageTransform).gridify();
+        }
+    }
+
 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java
@ -0,0 +1,353 @@
+package com.knecon.fforesight.service.layoutparser.processor.services.tables;
+
+import java.awt.geom.AffineTransform;
+import java.awt.geom.Point2D;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
+
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
+
+@Slf4j
+public class TableGridStructureCalculator {
+
+    // Neighbour tolerance as a fraction of the minimum cell width/height: two cells are considered neighbours if they are at most this far apart
+    // in one dimension and overlap by at least this much in the other dimension.
+    private static final double DISTANCE_FACTOR = 0.5;
+    Set<Cell> cells;
+    AffineTransform pageToPdfTransform;
+    double minCellHeight;
+    double minCellWidth;
+
+
+    /**
+     * Creates a calculator for the given cells.
+     * The cells are copied into a set, the given transform is inverted, and the smallest cell height and width
+     * (0 if there are no cells) are stored as the base for the distance and overlap thresholds.
+     *
+     * @param cells              the cells to build the grid from
+     * @param pdfToPageTransform transform whose inverse is stored as {@code pageToPdfTransform}; a failing inversion is rethrown via {@code @SneakyThrows}
+     */
+    @SneakyThrows
+    TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {
+
+        this.cells = new HashSet<>(cells);
+        this.pageToPdfTransform = pdfToPageTransform.createInverse();
+
+        var smallestHeight = cells.stream()
+                .mapToDouble(cell -> cell.getBBox().getHeight())
+                .min();
+        var smallestWidth = cells.stream()
+                .mapToDouble(cell -> cell.getBBox().getWidth())
+                .min();
+        this.minCellHeight = smallestHeight.orElse(0);
+        this.minCellWidth = smallestWidth.orElse(0);
+    }
+
+
+    /**
+     * Arranges the cells into a grid of rows. Cells spanning several rows or columns are represented by several cells carrying the same content.
+     * <p>
+     * As long as any cell has more than one neighbour in some direction, such cells are split according to their neighbours and the
+     * neighbour links are recomputed. Once no cell needs splitting, the rows are read off the neighbour links, starting with the top left cell.
+     * If cells overlap significantly, the area sweep implementation is used instead.
+     *
+     * @return the table structure as a list of rows, each row being a list of cells
+     */
+    public List<List<Cell>> gridify() {
+
+        if (cellsHaveLargeOverlaps()) {
+            // Significantly overlapping cells would be split over and over again by the logic below, so fall back to the simpler area sweep implementation.
+            List<List<Cell>> sweptRows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
+            return removeEmptyCols(removeEmptyRows(sweptRows));
+        }
+
+        List<LinkedCell> current = cells.stream()
+                .map(LinkedCell::new)
+                .collect(Collectors.toList());
+        computeNeighbours(current);
+
+        // NOTE(review): termination relies on every split pass reducing the number of cells that still need a split - confirm.
+        while (current.stream()
+                .anyMatch(LinkedCell::needsSplit)) {
+
+            List<LinkedCell> next = new LinkedList<>();
+            for (LinkedCell candidate : current) {
+                if (!candidate.needsSplit()) {
+                    next.add(candidate);
+                    continue;
+                }
+                next.addAll(candidate.split());
+            }
+            computeNeighbours(next);
+            current = next;
+        }
+        return buildStructure(current);
+    }
+
+
+    /**
+     * Checks whether any two distinct cells overlap by more than {@code DISTANCE_FACTOR} times the minimum cell width horizontally
+     * and by more than {@code DISTANCE_FACTOR} times the minimum cell height vertically.
+     *
+     * @return true if at least one such pair of cells exists
+     */
+    private boolean cellsHaveLargeOverlaps() {
+
+        double widthThreshold = minCellWidth * DISTANCE_FACTOR;
+        double heightThreshold = minCellHeight * DISTANCE_FACTOR;
+        return cells.stream()
+                .anyMatch(first -> cells.stream()
+                        .anyMatch(second -> !first.equals(second) //
+                                            && first.horizontalOverlap(second) > widthThreshold //
+                                            && first.verticalOverlap(second) > heightThreshold));
+    }
+
+
+    /**
+     * Turns the linked cells into the final rows-of-cells matrix and drops all rows and columns in which no cell has any text block.
+     *
+     * @param cells linked cells with up-to-date neighbour links
+     * @return the rows of the table, or an empty list if no cells were given
+     * @throws AssertionError if the rows built from the linked cells are not rectangular (raised by {@code buildRows})
+     */
+    private List<List<Cell>> buildStructure(List<LinkedCell> cells) {
+
+        if (cells.isEmpty()) {
+            return Collections.emptyList();
+        }
+        // buildRows already throws for non-rectangular rows, so a second rectangularity check here could never fire.
+        List<List<Cell>> rows = buildRows(cells);
+        return removeEmptyCols(removeEmptyRows(rows));
+    }
+
+
+    /**
+     * Checks whether the given rows fail to form a rectangle.
+     *
+     * @param rows the rows to check
+     * @return true if there are no rows at all or if any row differs in size from the first row
+     */
+    private boolean isNotRectangular(List<List<Cell>> rows) {
+
+        if (rows.isEmpty()) {
+            return true;
+        }
+        int expectedSize = rows.get(0).size();
+        for (List<Cell> row : rows) {
+            if (row.size() != expectedSize) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
+    /**
+     * Reads the rows off the neighbour links. Starts at the top left cell (a cell without left and above neighbours), follows the first
+     * below neighbour to find the start of each further row and builds every row by following the right neighbours.
+     *
+     * @param cells linked cells with up-to-date neighbour links
+     * @return the rows of the grid, all of the same size
+     * @throws IllegalStateException if no cell qualifies as top left cell
+     * @throws AssertionError        if the resulting rows are not rectangular
+     */
+    private List<List<Cell>> buildRows(List<LinkedCell> cells) {
+
+        List<LinkedCell> topLeftCandidates = cells.stream()
+                .filter(LinkedCell::isTopLeft)
+                .toList();
+
+        assert topLeftCandidates.size() == 1 : "Expected exactly one top left cell, but found " + topLeftCandidates.size();
+        if (topLeftCandidates.isEmpty()) {
+            // With assertions disabled, get(0) below would otherwise fail with an uninformative IndexOutOfBoundsException.
+            throw new IllegalStateException("Cannot build table rows: none of the " + cells.size() + " cells is a top left cell");
+        }
+        LinkedCell cell = topLeftCandidates.get(0);
+
+        List<List<Cell>> rows = new ArrayList<>();
+        rows.add(buildRow(cell));
+        while (!cell.belows.isEmpty()) {
+            cell = cell.belows.get(0);
+            rows.add(buildRow(cell));
+        }
+        if (isNotRectangular(rows)) {
+            throw new AssertionError("Table rows are not rectangular, row sizes: " + rows.stream()
+                    .map(List::size)
+                    .toList());
+        }
+        return rows;
+    }
+
+
+    /**
+     * Collects one row by starting at the given cell and repeatedly moving on to the first right neighbour until a cell has none.
+     *
+     * @param cell the first (leftmost) linked cell of the row
+     * @return the original cells of the row, in the order they were visited
+     */
+    private static List<Cell> buildRow(LinkedCell cell) {
+
+        List<Cell> row = new ArrayList<>();
+        LinkedCell current = cell;
+        do {
+            row.add(current.originalCell);
+            current = current.rights.isEmpty() ? null : current.rights.get(0);
+        } while (current != null);
+        return row;
+    }
+
+
+    /**
+     * Recomputes the neighbour links of all given cells: each cell's links are cleared and then rebuilt against the full list.
+     *
+     * @param cells the linked cells to update
+     */
+    private void computeNeighbours(List<LinkedCell> cells) {
+
+        cells.forEach(linkedCell -> {
+            linkedCell.resetNeighbours();
+            computeNeighbours(linkedCell, cells);
+        });
+    }
+
+
+    /**
+     * Adds every cell of {@code otherCells} that neighbours {@code cell} to the matching neighbour list of {@code cell}.
+     * <p>
+     * A cell is a right/left neighbour if its horizontal distance is at most {@code DISTANCE_FACTOR} times the minimum cell width and its
+     * vertical overlap is at least {@code DISTANCE_FACTOR} times the minimum cell height; the side is decided by comparing the centre X values.
+     * Otherwise it is a below/above neighbour if the same holds with the dimensions swapped; the side is decided by comparing the centre Y values.
+     *
+     * @param cell       the cell whose neighbour lists are filled
+     * @param otherCells all candidate cells; {@code cell} itself is skipped
+     */
+    private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {
+
+        Cell self = cell.originalCell;
+        double widthTolerance = minCellWidth * DISTANCE_FACTOR;
+        double heightTolerance = minCellHeight * DISTANCE_FACTOR;
+
+        for (LinkedCell otherCell : otherCells) {
+            if (cell.equals(otherCell)) {
+                continue;
+            }
+            Cell other = otherCell.originalCell;
+
+            boolean sideBySide = self.horizontalDistance(other) <= widthTolerance && self.verticalOverlap(other) >= heightTolerance;
+            if (sideBySide) {
+                boolean otherIsRight = self.getBBox().getCenterX() <= other.getBBox().getCenterX();
+                (otherIsRight ? cell.rights : cell.lefts).add(otherCell);
+                continue;
+            }
+
+            boolean stacked = self.verticalDistance(other) <= heightTolerance && self.horizontalOverlap(other) >= widthTolerance;
+            if (stacked) {
+                boolean otherIsBelow = self.getBBox().getCenterY() <= other.getBBox().getCenterY();
+                (otherIsBelow ? cell.belows : cell.aboves).add(otherCell);
+            }
+        }
+    }
+
+
+    /**
+     * Transposes a table, i.e. turns its rows into columns. The column count is taken from the first row.
+     *
+     * @param table the rows of the table; every row must have at least as many entries as the first row
+     * @param <T>   the element type
+     * @return a new list holding the columns of the table, or an empty list for an empty table
+     */
+    static <T> List<List<T>> transpose(List<List<T>> table) {
+
+        List<List<T>> columns = new ArrayList<>();
+        if (table.isEmpty()) {
+            // There is no first row to take the column count from.
+            return columns;
+        }
+        final int columnCount = table.get(0).size();
+        for (int i = 0; i < columnCount; i++) {
+            List<T> column = new ArrayList<>(table.size());
+            for (List<T> row : table) {
+                column.add(row.get(i));
+            }
+            columns.add(column);
+        }
+        return columns;
+    }
+
+
+    /**
+     * Drops every column in which no cell has any text block, by transposing the table, removing the empty rows and transposing back.
+     *
+     * @param rowsOfCells the table as rows of cells
+     * @return the table without empty columns; an empty input is returned as is
+     */
+    private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {
+
+        if (rowsOfCells.isEmpty()) {
+            return rowsOfCells;
+        }
+
+        List<List<Cell>> nonEmptyColumns = removeEmptyRows(transpose(rowsOfCells));
+        return transpose(nonEmptyColumns);
+    }
+
+
+    /**
+     * Keeps only those rows that contain at least one cell with a non-empty list of text blocks.
+     *
+     * @param rowsOfCells the table as rows of cells
+     * @return a new list with the remaining rows, in their original order
+     */
+    private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {
+
+        List<List<Cell>> keptRows = new ArrayList<>();
+        for (List<Cell> row : rowsOfCells) {
+            for (Cell cell : row) {
+                if (!cell.getTextBlocks().isEmpty()) {
+                    keptRows.add(row);
+                    break;
+                }
+            }
+        }
+        return keptRows;
+    }
+
+
+    /**
+     * A cell together with links to its neighbouring cells in all four directions.
+     * The neighbour lists are filled by {@code computeNeighbours} and emptied by {@link #resetNeighbours()}.
+     * This is an inner (non-static) class because copied cells are created with the enclosing calculator's {@code pageToPdfTransform}.
+     */
+    class LinkedCell {
+
+        private final Cell originalCell;
+        private final List<LinkedCell> rights;
+        private final List<LinkedCell> lefts;
+        private final List<LinkedCell> aboves;
+        private final List<LinkedCell> belows;
+
+
+        /**
+         * Wraps the given cell; all neighbour lists start out empty.
+         *
+         * @param cell the cell to wrap
+         */
+        LinkedCell(Cell cell) {
+
+            this.originalCell = cell;
+            this.rights = new LinkedList<>();
+            this.lefts = new LinkedList<>();
+            this.aboves = new LinkedList<>();
+            this.belows = new LinkedList<>();
+        }
+
+
+        /**
+         * @return true if this cell has more than one neighbour in at least one direction
+         */
+        public boolean needsSplit() {
+
+            return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
+        }
+
+
+        /**
+         * @return true if this cell has neither a left nor an above neighbour
+         */
+        public boolean isTopLeft() {
+
+            return lefts.isEmpty() && aboves.isEmpty();
+        }
+
+
+        @Override
+        public String toString() {
+
+            return originalCell.toString();
+        }
+
+
+        /**
+         * Splits this cell according to its neighbours. Several right or left neighbours lead to a split along the Y axis,
+         * several above or below neighbours to a split along the X axis. Right/left neighbours take precedence over above/below ones.
+         *
+         * @return the cells resulting from the split, or a list containing only this cell if no direction has more than one neighbour
+         */
+        public Collection<LinkedCell> split() {
+
+            if (rights.size() > 1 && rights.size() >= lefts.size()) {
+                return splitY(rights);
+            }
+            if (lefts.size() > 1) {
+                return splitY(lefts);
+            }
+            if (aboves.size() > 1 && aboves.size() >= belows.size()) {
+                return splitX(aboves);
+            }
+            if (belows.size() > 1) {
+                return splitX(belows);
+            }
+            return List.of(this);
+        }
+
+
+        /**
+         * Splits this cell into one piece per given neighbour. The pieces keep the full width of this cell; each piece ends at the max Y
+         * of one neighbour (clamped to this cell's max Y) and starts where the previous piece ended, the first one at this cell's min Y.
+         * NOTE(review): if the largest neighbour max Y is smaller than this cell's max Y, the area in between is covered by no piece - confirm this is intended.
+         *
+         * @param neighbours the left or right neighbours to split along
+         * @return the new linked cells, without neighbour links
+         */
+        private List<LinkedCell> splitY(List<LinkedCell> neighbours) {
+
+            List<LinkedCell> splitCells = new LinkedList<>();
+            List<Double> ySplit = neighbours.stream()
+                    .map(neighbour -> neighbour.originalCell.getMaxY())
+                    .sorted()
+                    .toList();
+            Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
+            double maxX = originalCell.getBBox().getMaxX();
+            double x = originalCell.getBBox().getX();
+            double maxY = originalCell.getBBox().getMaxY();
+            for (Double neighborY : ySplit) {
+                double y = Math.min(neighborY, maxY);
+                Point2D bottomRight = new Point2D.Double(maxX, y);
+                Cell cell = copyCell(topLeft, bottomRight);
+                splitCells.add(new LinkedCell(cell));
+                // the next piece starts where this one ended
+                topLeft = new Point2D.Double(x, y);
+            }
+            return splitCells;
+        }
+
+
+        /**
+         * Splits this cell into one piece per given neighbour. The pieces keep the full height of this cell; each piece ends at the max X
+         * of one neighbour (clamped to this cell's max X) and starts where the previous piece ended, the first one at this cell's min X.
+         * NOTE(review): if the largest neighbour max X is smaller than this cell's max X, the area in between is covered by no piece - confirm this is intended.
+         *
+         * @param neighbours the above or below neighbours to split along
+         * @return the new linked cells, without neighbour links
+         */
+        private List<LinkedCell> splitX(List<LinkedCell> neighbours) {
+
+            List<LinkedCell> splitCells = new LinkedList<>();
+            List<Double> xSplit = neighbours.stream()
+                    .map(neighbour -> neighbour.originalCell.getMaxX())
+                    .sorted()
+                    .toList();
+            Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
+            double maxY = originalCell.getBBox().getMaxY();
+            double y = originalCell.getBBox().getY();
+            double maxX = originalCell.getBBox().getMaxX();
+            for (Double neighborX : xSplit) {
+                double x = Math.min(neighborX, maxX);
+                Point2D bottomRight = new Point2D.Double(x, maxY);
+                Cell cell = copyCell(topLeft, bottomRight);
+                splitCells.add(new LinkedCell(cell));
+                // the next piece starts where this one ended
+                topLeft = new Point2D.Double(x, y);
+            }
+            return splitCells;
+        }
+
+
+        /**
+         * Creates a cell for the given corner points via {@code Cell.fromPageCoordinates} and hands the header flag and the text blocks
+         * of the original cell over to it.
+         *
+         * @param topLeft     top left corner of the new cell
+         * @param bottomRight bottom right corner of the new cell
+         * @return the new cell
+         */
+        private Cell copyCell(Point2D topLeft, Point2D bottomRight) {
+
+            Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
+            cell.setHeaderCell(originalCell.isHeaderCell());
+            cell.setTextBlocks(originalCell.getTextBlocks());
+            return cell;
+        }
+
+
+        /**
+         * Empties the neighbour lists of all four directions.
+         */
+        public void resetNeighbours() {
+
+            rights.clear();
+            lefts.clear();
+            aboves.clear();
+            belows.clear();
+        }
+
+    }
+
+}
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java
@ -28,7 +28,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
 import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
-import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
 import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java
@ -40,7 +40,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.PageContent
 import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
 import com.knecon.fforesight.service.layoutparser.server.AbstractTest;
 import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
-import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;

 import lombok.SneakyThrows;