diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java index 33fcb0f..b952239 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables; import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR; import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR; +import java.awt.Color; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; @@ -108,6 +109,11 @@ public class TableExtractionService { if (containedCells.isEmpty()) { continue; } + // if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf), + // the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column. + // That's why we compute the missing Cells from the spreadsheet area and fill them in. + Set missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform); + containedCells.addAll(missingCells); Set wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words for (Cell cell : containedCells) { @@ -139,10 +145,14 @@ public class TableExtractionService { } - private List buildTableFromIdpResult(List idpTables, - List words, - AffineTransform pdfToPageTransform, - LayoutParsingType layoutParsingType) { + private static void removeWordsFromCells(List words, TablePageBlock tablePageBlock) { + + Set wordsFromCells = new HashSet<>(tablePageBlock.getWords()); + words.removeAll(wordsFromCells); + } + + + private List buildTableFromIdpResult(List
idpTables, List words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) { if (idpTables == null || idpTables.isEmpty()) { return Collections.emptyList(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java index 804aff4..fa2ad62 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java @@ -7,7 +7,6 @@ import java.util.LinkedList; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; @@ -15,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import lombok.Getter; import lombok.Setter; -import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j @@ -28,8 +26,6 @@ public class TableFromCellsExtractor { @Setter private final List originCells; private final AffineTransform pdfToPageTransform; - private final double minCellWidth; - private final double minCellHeight; public TableFromCellsExtractor(List originCells, AffineTransform pdfToPageTransform) { @@ -37,18 +33,15 @@ public class TableFromCellsExtractor { classification = PageBlockType.TABLE; this.originCells = originCells; this.pdfToPageTransform = pdfToPageTransform; - this.minCellHeight = originCells.stream() - .mapToDouble(BoundingBox::getHeight).min().orElse(0); - this.minCellWidth = originCells.stream() - .mapToDouble(BoundingBox::getWidth).min().orElse(0); } - @SneakyThrows public TablePageBlock extract() { - rows = AreaSweepGridifier.gridify(originCells, pdfToPageTransform.createInverse(), minCellWidth, minCellHeight); + computeRows(originCells); + computeHeaders(); + return new TablePageBlock(null, rows); } @@ -126,4 +119,15 @@ public class TableFromCellsExtractor { } + + private void computeRows(List cells) { + + if (cells.isEmpty()) { + return; + } + + TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform); + rows = calculator.gridify(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java new file mode 100644 index 0000000..b1c21d2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java @@ -0,0 +1,353 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class TableGridStructureCalculator { + + // multiplied with minimum cell height/width, Cells may be at most this apart in one dimension, and must overlap at least that much in the other dimension to be considered neighbours + private static final double DISTANCE_FACTOR = 0.5; + Set cells; + AffineTransform pageToPdfTransform; + double minCellHeight; + double minCellWidth; + + + @SneakyThrows + TableGridStructureCalculator(Collection cells, AffineTransform pdfToPageTransform) { + + this.cells = new HashSet<>(cells); + this.pageToPdfTransform = pdfToPageTransform.createInverse(); + this.minCellHeight = cells.stream() + .mapToDouble(cell -> cell.getBBox().getHeight()) + .min().orElse(0); + this.minCellWidth = cells.stream() + .mapToDouble(cell -> cell.getBBox().getWidth()) + .min().orElse(0); + } + + + /** + * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. + * Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors. + * This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell. + * + * @return TablePageBlock Structure as a rows of cells matrix + */ + public List> gridify() { + + if (cellsHaveLargeOverlaps()) { + // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation. + List> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight); + rows = removeEmptyRows(rows); + rows = removeEmptyCols(rows); + return rows; + } + + var linkedCells = cells.stream() + .map(LinkedCell::new) + .collect(Collectors.toList()); + + computeNeighbours(linkedCells); + + while (linkedCells.stream() + .anyMatch(LinkedCell::needsSplit)) { + + List newCells = new LinkedList<>(); + for (LinkedCell linkedCell : linkedCells) { + if (linkedCell.needsSplit()) { + newCells.addAll(linkedCell.split()); + } else { + newCells.add(linkedCell); + } + } + computeNeighbours(newCells); + linkedCells = newCells; + } + return buildStructure(linkedCells); + } + + + private boolean cellsHaveLargeOverlaps() { + + for (Cell cell1 : cells) { + for (Cell cell2 : cells) { + if (cell1.equals(cell2)) { + continue; + } + if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR // + && cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) { + return true; + } + } + } + return false; + } + + + private List> buildStructure(List cells) { + + if (cells.isEmpty()) { + return Collections.emptyList(); + } + List> rows = buildRows(cells); + if (isNotRectangular(rows)) { + throw new AssertionError(); + } + rows = removeEmptyRows(rows); + rows = removeEmptyCols(rows); + return rows; + } + + + private boolean isNotRectangular(List> rows) { + + if (rows.isEmpty()) { + return true; + } + int n = rows.get(0).size(); + return rows.stream() + .anyMatch(row -> row.size() != n); + } + + + private List> buildRows(List cells) { + + List topLeftCandidates = cells.stream() + .filter(LinkedCell::isTopLeft) + .toList(); + + assert topLeftCandidates.size() == 1; + var cell = topLeftCandidates.get(0); + + List> rows = new ArrayList<>(); + rows.add(buildRow(cell)); + while (!cell.belows.isEmpty()) { + cell = cell.belows.get(0); + rows.add(buildRow(cell)); + } + if (isNotRectangular(rows)) { + throw new AssertionError(); + } + return rows; + } + + + private static List buildRow(LinkedCell cell) { + + List currentRow = new ArrayList<>(); + LinkedCell nextCell = cell; + currentRow.add(cell.originalCell); + while (!nextCell.rights.isEmpty()) { + nextCell = nextCell.rights.get(0); + currentRow.add(nextCell.originalCell); + } + return currentRow; + } + + + private void computeNeighbours(List cells) { + + for (LinkedCell cell : cells) { + cell.resetNeighbours(); + computeNeighbours(cell, cells); + } + + } + + + private void computeNeighbours(LinkedCell cell, List otherCells) { + + for (LinkedCell otherCell : otherCells) { + if (cell.equals(otherCell)) { + continue; + } + if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR + && cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) { + if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) { + cell.rights.add(otherCell); + } else { + cell.lefts.add(otherCell); + } + } else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR + && cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) { + if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) { + cell.belows.add(otherCell); + } else { + cell.aboves.add(otherCell); + } + } + } + + } + + + static List> transpose(List> table) { + + List> ret = new ArrayList>(); + final int N = table.get(0).size(); + for (int i = 0; i < N; i++) { + List col = new ArrayList(); + for (List row : table) { + col.add(row.get(i)); + } + ret.add(col); + } + return ret; + } + + + private List> removeEmptyCols(List> rowsOfCells) { + + if (rowsOfCells.isEmpty()) { + return rowsOfCells; + } + + var colsOfCells = transpose(rowsOfCells); + colsOfCells = removeEmptyRows(colsOfCells); + return transpose(colsOfCells); + } + + + private List> removeEmptyRows(List> rowsOfCells) { + + return rowsOfCells.stream() + .filter(row -> row.stream() + .anyMatch(cell -> !cell.getTextBlocks().isEmpty())) + .collect(Collectors.toList()); + } + + + class LinkedCell { + + private final Cell originalCell; + private final List rights; + private final List lefts; + private final List aboves; + private final List belows; + + + LinkedCell(Cell cell) { + + this.originalCell = cell; + this.rights = new LinkedList<>(); + this.lefts = new LinkedList<>(); + this.aboves = new LinkedList<>(); + this.belows = new LinkedList<>(); + } + + + public boolean needsSplit() { + + return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1; + } + + + public boolean isTopLeft() { + + return lefts.isEmpty() && aboves.isEmpty(); + } + + + public String toString() { + + return originalCell.toString(); + } + + + public Collection split() { + + if (rights.size() > 1 && rights.size() >= lefts.size()) { + return splitY(rights); + } + if (lefts.size() > 1) { + return splitY(lefts); + } + if (aboves.size() > 1 && aboves.size() >= belows.size()) { + return splitX(aboves); + } + if (belows.size() > 1) { + return splitX(belows); + } + return List.of(this); + } + + + private List splitY(List neighbours) { + + List splitCells = new LinkedList<>(); + List ySplit = neighbours.stream() + .map(right -> right.originalCell.getMaxY()) + .sorted() + .toList(); + Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); + double maxX = originalCell.getBBox().getMaxX(); + double x = originalCell.getBBox().getX(); + double maxY = originalCell.getBBox().getMaxY(); + for (Double neighborY : ySplit) { + double y = Math.min(neighborY, maxY); + Point2D bottomRight = new Point2D.Double(maxX, y); + Cell cell = copyCell(topLeft, bottomRight); + splitCells.add(new LinkedCell(cell)); + topLeft = new Point2D.Double(x, y); + } + return splitCells; + } + + + private List splitX(List neighbours) { + + List splitCells = new LinkedList<>(); + List xSplit = neighbours.stream() + .map(right -> right.originalCell.getMaxX()) + .sorted() + .toList(); + Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); + double maxY = originalCell.getBBox().getMaxY(); + double y = originalCell.getBBox().getY(); + double maxX = originalCell.getBBox().getMaxX(); + for (Double neighborX : xSplit) { + double x = Math.min(neighborX, maxX); + Point2D bottomRight = new Point2D.Double(x, maxY); + Cell cell = copyCell(topLeft, bottomRight); + splitCells.add(new LinkedCell(cell)); + topLeft = new Point2D.Double(x, y); + } + return splitCells; + } + + + private Cell copyCell(Point2D topLeft, Point2D bottomRight) { + + Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform); + cell.setHeaderCell(originalCell.isHeaderCell()); + cell.setTextBlocks(originalCell.getTextBlocks()); + return cell; + } + + + public void resetNeighbours() { + + rights.clear(); + lefts.clear(); + aboves.clear(); + belows.clear(); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 43e893b..5866717 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -28,7 +28,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 51d1c82..9c2d37c 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -40,7 +40,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.PageContent import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.AbstractTest; import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; -import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import lombok.SneakyThrows;