RED-8670: add table detection from idp result

* some 'slight' refactoring
This commit is contained in:
Kilian Schuettler 2025-01-09 13:10:49 +01:00
parent ffd426d859
commit 8df429730f
5 changed files with 381 additions and 16 deletions

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
@ -108,6 +109,11 @@ public class TableExtractionService {
if (containedCells.isEmpty()) {
continue;
}
// if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf),
// the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column.
// That's why we compute the missing Cells from the spreadsheet area and fill them in.
Set<Cell> missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform);
containedCells.addAll(missingCells);
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
for (Cell cell : containedCells) {
@ -139,10 +145,14 @@ public class TableExtractionService {
}
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables,
List<Word> words,
AffineTransform pdfToPageTransform,
LayoutParsingType layoutParsingType) {
/**
 * Removes all words that belong to the given table block from the supplied word list.
 *
 * @param words          list of words to remove from; modified in place
 * @param tablePageBlock table block whose words are removed from {@code words}
 */
private static void removeWordsFromCells(List<Word> words, TablePageBlock tablePageBlock) {
    // Collect the table's words into a set, then drop them from the list in one call.
    words.removeAll(new HashSet<>(tablePageBlock.getWords()));
}
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables, List<Word> words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) {
if (idpTables == null || idpTables.isEmpty()) {
return Collections.emptyList();

View File

@ -7,7 +7,6 @@ import java.util.LinkedList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
@ -15,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import lombok.Getter;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -28,8 +26,6 @@ public class TableFromCellsExtractor {
@Setter
private final List<Cell> originCells;
private final AffineTransform pdfToPageTransform;
private final double minCellWidth;
private final double minCellHeight;
public TableFromCellsExtractor(List<Cell> originCells, AffineTransform pdfToPageTransform) {
@ -37,18 +33,15 @@ public class TableFromCellsExtractor {
classification = PageBlockType.TABLE;
this.originCells = originCells;
this.pdfToPageTransform = pdfToPageTransform;
this.minCellHeight = originCells.stream()
.mapToDouble(BoundingBox::getHeight).min().orElse(0);
this.minCellWidth = originCells.stream()
.mapToDouble(BoundingBox::getWidth).min().orElse(0);
}
@SneakyThrows
public TablePageBlock extract() {
rows = AreaSweepGridifier.gridify(originCells, pdfToPageTransform.createInverse(), minCellWidth, minCellHeight);
computeRows(originCells);
computeHeaders();
return new TablePageBlock(null, rows);
}
@ -126,4 +119,15 @@ public class TableFromCellsExtractor {
}
/**
 * Computes the row/column grid for the given cells and stores it in {@code rows}.
 * Leaves {@code rows} untouched when there are no cells.
 *
 * @param cells cells to arrange into a grid
 */
private void computeRows(List<Cell> cells) {
    if (!cells.isEmpty()) {
        rows = new TableGridStructureCalculator(cells, pdfToPageTransform).gridify();
    }
}
}

View File

@ -0,0 +1,353 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TableGridStructureCalculator {
// Tolerance factor applied to the minimum cell width/height: two cells are considered neighbours if they are at most this far apart in one dimension and overlap by at least this much in the other dimension.
private static final double DISTANCE_FACTOR = 0.5;
Set<Cell> cells;
AffineTransform pageToPdfTransform;
double minCellHeight;
double minCellWidth;
@SneakyThrows
TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {
this.cells = new HashSet<>(cells);
this.pageToPdfTransform = pdfToPageTransform.createInverse();
this.minCellHeight = cells.stream()
.mapToDouble(cell -> cell.getBBox().getHeight())
.min().orElse(0);
this.minCellWidth = cells.stream()
.mapToDouble(cell -> cell.getBBox().getWidth())
.min().orElse(0);
}
/**
 * Calculates the grid structure of the table. For spanning rows and columns, multiple cells with the same values are inserted.
 * As long as any cell has more than one neighbour in some direction, such cells are split according to their neighbours and
 * the neighbour links are recomputed. Once no cell needs splitting, the rows are built from the linked neighbour structure,
 * starting with the top left cell.
 * If the cells overlap significantly, the area sweep implementation is used instead.
 *
 * @return the table structure as a matrix of rows of cells, with empty rows and columns removed
 */
public List<List<Cell>> gridify() {

    if (cellsHaveLargeOverlaps()) {
        // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation.
        List<List<Cell>> sweptRows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
        return removeEmptyCols(removeEmptyRows(sweptRows));
    }

    List<LinkedCell> current = cells.stream()
            .map(LinkedCell::new)
            .collect(Collectors.toList());
    computeNeighbours(current);

    while (current.stream().anyMatch(LinkedCell::needsSplit)) {
        List<LinkedCell> next = new LinkedList<>();
        for (LinkedCell candidate : current) {
            if (!candidate.needsSplit()) {
                next.add(candidate);
                continue;
            }
            next.addAll(candidate.split());
        }
        // The split cells are new objects, so all neighbour links have to be rebuilt.
        computeNeighbours(next);
        current = next;
    }

    return buildStructure(current);
}
/**
 * Checks whether any two distinct cells overlap by more than the tolerance in both dimensions at once.
 *
 * @return {@code true} if at least one such pair exists
 */
private boolean cellsHaveLargeOverlaps() {
    final double widthLimit = minCellWidth * DISTANCE_FACTOR;
    final double heightLimit = minCellHeight * DISTANCE_FACTOR;

    return cells.stream()
            .anyMatch(first -> cells.stream()
                    .anyMatch(second -> !first.equals(second)
                            && first.horizontalOverlap(second) > widthLimit
                            && first.verticalOverlap(second) > heightLimit));
}
/**
 * Turns the linked cells into a matrix of rows and strips rows and columns without any text blocks.
 *
 * @param cells linked cells with their neighbours already computed
 * @return rows of cells, or an empty list if there are no cells
 * @throws AssertionError if the built rows do not all have the same number of cells
 */
private List<List<Cell>> buildStructure(List<LinkedCell> cells) {
    if (cells.isEmpty()) {
        return Collections.emptyList();
    }

    List<List<Cell>> grid = buildRows(cells);
    if (isNotRectangular(grid)) {
        throw new AssertionError();
    }

    return removeEmptyCols(removeEmptyRows(grid));
}
/**
 * Checks whether the given rows fail to form a rectangle.
 *
 * @param rows rows to check
 * @return {@code true} if there are no rows, or if any row's size differs from the first row's size
 */
private boolean isNotRectangular(List<List<Cell>> rows) {
    if (rows.isEmpty()) {
        return true;
    }

    final int expectedSize = rows.get(0).size();
    for (List<Cell> row : rows) {
        if (row.size() != expectedSize) {
            return true;
        }
    }
    return false;
}
/**
 * Builds the row matrix by following the neighbour links: starts at the top-left cell (the cell without
 * left and above neighbours), builds one row per cell reached by repeatedly following the first below-neighbour,
 * and fills each row by following the first right-neighbour.
 *
 * @param cells linked cells with their neighbours already computed
 * @return rows of cells, all of the same size
 * @throws AssertionError if no top-left cell exists or the resulting rows are not rectangular
 */
private List<List<Cell>> buildRows(List<LinkedCell> cells) {
    List<LinkedCell> topLeftCandidates = cells.stream()
            .filter(LinkedCell::isTopLeft)
            .toList();

    // Checked explicitly: 'assert' is skipped unless the JVM runs with -ea, and get(0) on an empty list
    // would otherwise fail with an IndexOutOfBoundsException that says nothing about the cause.
    if (topLeftCandidates.isEmpty()) {
        throw new AssertionError("Cannot build table rows: none of the " + cells.size() + " cells is a top-left cell");
    }
    assert topLeftCandidates.size() == 1 : "Expected exactly one top-left cell, found " + topLeftCandidates.size();

    LinkedCell rowStart = topLeftCandidates.get(0);
    List<List<Cell>> rows = new ArrayList<>();
    rows.add(buildRow(rowStart));
    while (!rowStart.belows.isEmpty()) {
        rowStart = rowStart.belows.get(0);
        rows.add(buildRow(rowStart));
    }

    if (isNotRectangular(rows)) {
        throw new AssertionError("Linked cells do not form a rectangular grid, row sizes: " + rows.stream()
                .map(List::size)
                .toList());
    }
    return rows;
}
/**
 * Collects the cells of one row by starting at the given cell and repeatedly moving to the first right-neighbour.
 *
 * @param cell first (leftmost) linked cell of the row
 * @return the original cells of the row in traversal order
 */
private static List<Cell> buildRow(LinkedCell cell) {
    List<Cell> row = new ArrayList<>();
    LinkedCell cursor = cell;
    while (true) {
        row.add(cursor.originalCell);
        if (cursor.rights.isEmpty()) {
            return row;
        }
        cursor = cursor.rights.get(0);
    }
}
/**
 * Recomputes the neighbour links of every given cell: each cell's links are cleared and then
 * rebuilt against the full list.
 *
 * @param cells all linked cells of the table
 */
private void computeNeighbours(List<LinkedCell> cells) {
    cells.forEach(current -> {
        current.resetNeighbours();
        computeNeighbours(current, cells);
    });
}
/**
 * Adds every other cell that qualifies as a neighbour to the matching neighbour list of {@code cell}.
 * A cell is a horizontal neighbour if its horizontal distance is within the width tolerance and its vertical
 * overlap reaches the height tolerance; otherwise it is a vertical neighbour if the same holds with the
 * dimensions swapped. The side (right/left, below/above) is decided by comparing bounding box centers.
 *
 * @param cell       cell whose neighbour lists are filled
 * @param otherCells candidate cells; {@code cell} itself is skipped
 */
private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {
    final double widthTolerance = minCellWidth * DISTANCE_FACTOR;
    final double heightTolerance = minCellHeight * DISTANCE_FACTOR;
    final Cell self = cell.originalCell;

    for (LinkedCell candidate : otherCells) {
        if (cell.equals(candidate)) {
            continue;
        }
        Cell other = candidate.originalCell;

        boolean sideBySide = self.horizontalDistance(other) <= widthTolerance
                && self.verticalOverlap(other) >= heightTolerance;
        if (sideBySide) {
            boolean candidateIsRight = self.getBBox().getCenterX() <= other.getBBox().getCenterX();
            (candidateIsRight ? cell.rights : cell.lefts).add(candidate);
            continue;
        }

        // Only checked when the candidate is not a horizontal neighbour, so a cell lands in at most one list.
        boolean stacked = self.verticalDistance(other) <= heightTolerance
                && self.horizontalOverlap(other) >= widthTolerance;
        if (stacked) {
            boolean candidateIsBelow = self.getBBox().getCenterY() <= other.getBBox().getCenterY();
            (candidateIsBelow ? cell.belows : cell.aboves).add(candidate);
        }
    }
}
/**
 * Transposes a table given as a list of rows into a list of columns.
 * The number of columns is taken from the first row; every row must have at least that many elements.
 *
 * @param table rows of the table
 * @param <T>   element type
 * @return a new list of columns; empty if {@code table} is empty
 * @throws IndexOutOfBoundsException if a row is shorter than the first row
 */
static <T> List<List<T>> transpose(List<List<T>> table) {
    // Without rows there is no first row to take the column count from.
    if (table.isEmpty()) {
        return new ArrayList<>();
    }

    final int columnCount = table.get(0).size();
    List<List<T>> columns = new ArrayList<>(columnCount);
    for (int i = 0; i < columnCount; i++) {
        List<T> column = new ArrayList<>(table.size());
        for (List<T> row : table) {
            column.add(row.get(i));
        }
        columns.add(column);
    }
    return columns;
}
/**
 * Removes all columns in which no cell has any text blocks, by transposing the table,
 * filtering the resulting rows and transposing back.
 *
 * @param rowsOfCells table as rows of cells
 * @return the table without empty columns; the input itself if it has no rows
 */
private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {
    if (rowsOfCells.isEmpty()) {
        return rowsOfCells;
    }
    return transpose(removeEmptyRows(transpose(rowsOfCells)));
}
/**
 * Keeps only the rows that contain at least one cell with text blocks.
 *
 * @param rowsOfCells table as rows of cells
 * @return a new list holding the non-empty rows in their original order
 */
private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {
    List<List<Cell>> keptRows = new ArrayList<>();
    for (List<Cell> row : rowsOfCells) {
        for (Cell cell : row) {
            if (!cell.getTextBlocks().isEmpty()) {
                keptRows.add(row);
                break;
            }
        }
    }
    return keptRows;
}
/**
 * A table cell together with its neighbouring cells in all four directions.
 * Deliberately a non-static inner class: {@link #copyCell} reads the enclosing calculator's page-to-PDF transform.
 */
class LinkedCell {

    private final Cell originalCell;
    private final List<LinkedCell> rights;
    private final List<LinkedCell> lefts;
    private final List<LinkedCell> aboves;
    private final List<LinkedCell> belows;


    /**
     * Wraps the given cell with empty neighbour lists.
     *
     * @param cell the cell to wrap
     */
    LinkedCell(Cell cell) {
        this.originalCell = cell;
        this.rights = new ArrayList<>();
        this.lefts = new ArrayList<>();
        this.aboves = new ArrayList<>();
        this.belows = new ArrayList<>();
    }


    /**
     * @return {@code true} if this cell has more than one neighbour in at least one direction
     */
    public boolean needsSplit() {
        return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
    }


    /**
     * @return {@code true} if this cell has neither left nor above neighbours
     */
    public boolean isTopLeft() {
        return lefts.isEmpty() && aboves.isEmpty();
    }


    @Override
    public String toString() {
        return originalCell.toString();
    }


    /**
     * Splits this cell along the direction with multiple neighbours. Horizontal neighbours (right, then left)
     * take precedence over vertical ones (above, then below); within each pair the side with more neighbours
     * is used, preferring right/above on a tie. Only one direction is split per call.
     *
     * @return the cells resulting from the split, or a list containing only this cell if no split is needed
     */
    public Collection<LinkedCell> split() {
        if (rights.size() > 1 && rights.size() >= lefts.size()) {
            return splitY(rights);
        }
        if (lefts.size() > 1) {
            return splitY(lefts);
        }
        if (aboves.size() > 1 && aboves.size() >= belows.size()) {
            return splitX(aboves);
        }
        if (belows.size() > 1) {
            return splitX(belows);
        }
        return List.of(this);
    }


    /**
     * Slices this cell along the y axis: one slice per neighbour, each ending at that neighbour's max-Y
     * (clamped to this cell's max-Y) and starting where the previous slice ended.
     * NOTE(review): if the largest neighbour max-Y is smaller than this cell's max-Y, the remaining area
     * is not covered by any slice - confirm this is intended.
     *
     * @param neighbours horizontal neighbours that determine the slice boundaries
     * @return the slices as new linked cells without neighbour links
     */
    private List<LinkedCell> splitY(List<LinkedCell> neighbours) {
        List<LinkedCell> splitCells = new ArrayList<>();
        List<Double> ySplit = neighbours.stream()
                .map(neighbour -> neighbour.originalCell.getMaxY())
                .sorted()
                .toList();
        Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
        double maxX = originalCell.getBBox().getMaxX();
        double x = originalCell.getBBox().getX();
        double maxY = originalCell.getBBox().getMaxY();
        for (Double neighborY : ySplit) {
            double y = Math.min(neighborY, maxY);
            Point2D bottomRight = new Point2D.Double(maxX, y);
            Cell cell = copyCell(topLeft, bottomRight);
            splitCells.add(new LinkedCell(cell));
            // The next slice starts where this one ended.
            topLeft = new Point2D.Double(x, y);
        }
        return splitCells;
    }


    /**
     * Slices this cell along the x axis: one slice per neighbour, each ending at that neighbour's max-X
     * (clamped to this cell's max-X) and starting where the previous slice ended.
     * NOTE(review): if the largest neighbour max-X is smaller than this cell's max-X, the remaining area
     * is not covered by any slice - confirm this is intended.
     *
     * @param neighbours vertical neighbours that determine the slice boundaries
     * @return the slices as new linked cells without neighbour links
     */
    private List<LinkedCell> splitX(List<LinkedCell> neighbours) {
        List<LinkedCell> splitCells = new ArrayList<>();
        List<Double> xSplit = neighbours.stream()
                .map(neighbour -> neighbour.originalCell.getMaxX())
                .sorted()
                .toList();
        Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
        double maxY = originalCell.getBBox().getMaxY();
        double y = originalCell.getBBox().getY();
        double maxX = originalCell.getBBox().getMaxX();
        for (Double neighborX : xSplit) {
            double x = Math.min(neighborX, maxX);
            Point2D bottomRight = new Point2D.Double(x, maxY);
            Cell cell = copyCell(topLeft, bottomRight);
            splitCells.add(new LinkedCell(cell));
            // The next slice starts where this one ended.
            topLeft = new Point2D.Double(x, y);
        }
        return splitCells;
    }


    /**
     * Creates a cell with the given corners that carries over the header flag and the text blocks
     * (same list reference) of the original cell.
     *
     * @param topLeft     top-left corner, passed to {@code Cell.fromPageCoordinates}
     * @param bottomRight bottom-right corner, passed to {@code Cell.fromPageCoordinates}
     * @return the new cell
     */
    private Cell copyCell(Point2D topLeft, Point2D bottomRight) {
        Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
        cell.setHeaderCell(originalCell.isHeaderCell());
        cell.setTextBlocks(originalCell.getTextBlocks());
        return cell;
    }


    /**
     * Clears the neighbour lists of all four directions.
     */
    public void resetNeighbours() {
        rights.clear();
        lefts.clear();
        aboves.clear();
        belows.clear();
    }

}
}

View File

@ -28,7 +28,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;

View File

@ -40,7 +40,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.PageContent
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.AbstractTest;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import lombok.SneakyThrows;