RED-8670: add table detection from idp result
* some 'slight' refactoring
This commit is contained in:
parent
ffd426d859
commit
8df429730f
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -108,6 +109,11 @@ public class TableExtractionService {
|
||||
if (containedCells.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
// if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf),
|
||||
// the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column.
|
||||
// That's why we compute the missing Cells from the spreadsheet area and fill them in.
|
||||
Set<Cell> missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform);
|
||||
containedCells.addAll(missingCells);
|
||||
|
||||
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
|
||||
for (Cell cell : containedCells) {
|
||||
@ -139,10 +145,14 @@ public class TableExtractionService {
|
||||
}
|
||||
|
||||
|
||||
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables,
|
||||
List<Word> words,
|
||||
AffineTransform pdfToPageTransform,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
private static void removeWordsFromCells(List<Word> words, TablePageBlock tablePageBlock) {
|
||||
|
||||
Set<Word> wordsFromCells = new HashSet<>(tablePageBlock.getWords());
|
||||
words.removeAll(wordsFromCells);
|
||||
}
|
||||
|
||||
|
||||
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables, List<Word> words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) {
|
||||
|
||||
if (idpTables == null || idpTables.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
|
||||
@ -7,7 +7,6 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
@ -15,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -28,8 +26,6 @@ public class TableFromCellsExtractor {
|
||||
@Setter
|
||||
private final List<Cell> originCells;
|
||||
private final AffineTransform pdfToPageTransform;
|
||||
private final double minCellWidth;
|
||||
private final double minCellHeight;
|
||||
|
||||
|
||||
public TableFromCellsExtractor(List<Cell> originCells, AffineTransform pdfToPageTransform) {
|
||||
@ -37,18 +33,15 @@ public class TableFromCellsExtractor {
|
||||
classification = PageBlockType.TABLE;
|
||||
this.originCells = originCells;
|
||||
this.pdfToPageTransform = pdfToPageTransform;
|
||||
this.minCellHeight = originCells.stream()
|
||||
.mapToDouble(BoundingBox::getHeight).min().orElse(0);
|
||||
this.minCellWidth = originCells.stream()
|
||||
.mapToDouble(BoundingBox::getWidth).min().orElse(0);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public TablePageBlock extract() {
|
||||
|
||||
rows = AreaSweepGridifier.gridify(originCells, pdfToPageTransform.createInverse(), minCellWidth, minCellHeight);
|
||||
computeRows(originCells);
|
||||
|
||||
computeHeaders();
|
||||
|
||||
return new TablePageBlock(null, rows);
|
||||
}
|
||||
|
||||
@ -126,4 +119,15 @@ public class TableFromCellsExtractor {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void computeRows(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
|
||||
rows = calculator.gridify();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,353 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TableGridStructureCalculator {
|
||||
|
||||
// Tolerance factor, multiplied with the minimum cell width/height: two cells are considered neighbours if they are at most this far apart in one dimension and overlap by at least this much in the other dimension.
|
||||
private static final double DISTANCE_FACTOR = 0.5;
|
||||
Set<Cell> cells;
|
||||
AffineTransform pageToPdfTransform;
|
||||
double minCellHeight;
|
||||
double minCellWidth;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {
|
||||
|
||||
this.cells = new HashSet<>(cells);
|
||||
this.pageToPdfTransform = pdfToPageTransform.createInverse();
|
||||
this.minCellHeight = cells.stream()
|
||||
.mapToDouble(cell -> cell.getBBox().getHeight())
|
||||
.min().orElse(0);
|
||||
this.minCellWidth = cells.stream()
|
||||
.mapToDouble(cell -> cell.getBBox().getWidth())
|
||||
.min().orElse(0);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
* Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors.
|
||||
* This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell.
|
||||
*
|
||||
* @return TablePageBlock Structure as a rows of cells matrix
|
||||
*/
|
||||
public List<List<Cell>> gridify() {
|
||||
|
||||
if (cellsHaveLargeOverlaps()) {
|
||||
// If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation.
|
||||
List<List<Cell>> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
var linkedCells = cells.stream()
|
||||
.map(LinkedCell::new)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
computeNeighbours(linkedCells);
|
||||
|
||||
while (linkedCells.stream()
|
||||
.anyMatch(LinkedCell::needsSplit)) {
|
||||
|
||||
List<LinkedCell> newCells = new LinkedList<>();
|
||||
for (LinkedCell linkedCell : linkedCells) {
|
||||
if (linkedCell.needsSplit()) {
|
||||
newCells.addAll(linkedCell.split());
|
||||
} else {
|
||||
newCells.add(linkedCell);
|
||||
}
|
||||
}
|
||||
computeNeighbours(newCells);
|
||||
linkedCells = newCells;
|
||||
}
|
||||
return buildStructure(linkedCells);
|
||||
}
|
||||
|
||||
|
||||
private boolean cellsHaveLargeOverlaps() {
|
||||
|
||||
for (Cell cell1 : cells) {
|
||||
for (Cell cell2 : cells) {
|
||||
if (cell1.equals(cell2)) {
|
||||
continue;
|
||||
}
|
||||
if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR //
|
||||
&& cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> buildStructure(List<LinkedCell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<List<Cell>> rows = buildRows(cells);
|
||||
if (isNotRectangular(rows)) {
|
||||
throw new AssertionError();
|
||||
}
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private boolean isNotRectangular(List<List<Cell>> rows) {
|
||||
|
||||
if (rows.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
int n = rows.get(0).size();
|
||||
return rows.stream()
|
||||
.anyMatch(row -> row.size() != n);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> buildRows(List<LinkedCell> cells) {
|
||||
|
||||
List<LinkedCell> topLeftCandidates = cells.stream()
|
||||
.filter(LinkedCell::isTopLeft)
|
||||
.toList();
|
||||
|
||||
assert topLeftCandidates.size() == 1;
|
||||
var cell = topLeftCandidates.get(0);
|
||||
|
||||
List<List<Cell>> rows = new ArrayList<>();
|
||||
rows.add(buildRow(cell));
|
||||
while (!cell.belows.isEmpty()) {
|
||||
cell = cell.belows.get(0);
|
||||
rows.add(buildRow(cell));
|
||||
}
|
||||
if (isNotRectangular(rows)) {
|
||||
throw new AssertionError();
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private static List<Cell> buildRow(LinkedCell cell) {
|
||||
|
||||
List<Cell> currentRow = new ArrayList<>();
|
||||
LinkedCell nextCell = cell;
|
||||
currentRow.add(cell.originalCell);
|
||||
while (!nextCell.rights.isEmpty()) {
|
||||
nextCell = nextCell.rights.get(0);
|
||||
currentRow.add(nextCell.originalCell);
|
||||
}
|
||||
return currentRow;
|
||||
}
|
||||
|
||||
|
||||
private void computeNeighbours(List<LinkedCell> cells) {
|
||||
|
||||
for (LinkedCell cell : cells) {
|
||||
cell.resetNeighbours();
|
||||
computeNeighbours(cell, cells);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {
|
||||
|
||||
for (LinkedCell otherCell : otherCells) {
|
||||
if (cell.equals(otherCell)) {
|
||||
continue;
|
||||
}
|
||||
if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR
|
||||
&& cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) {
|
||||
if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) {
|
||||
cell.rights.add(otherCell);
|
||||
} else {
|
||||
cell.lefts.add(otherCell);
|
||||
}
|
||||
} else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR
|
||||
&& cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) {
|
||||
if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) {
|
||||
cell.belows.add(otherCell);
|
||||
} else {
|
||||
cell.aboves.add(otherCell);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
static <T> List<List<T>> transpose(List<List<T>> table) {
|
||||
|
||||
List<List<T>> ret = new ArrayList<List<T>>();
|
||||
final int N = table.get(0).size();
|
||||
for (int i = 0; i < N; i++) {
|
||||
List<T> col = new ArrayList<T>();
|
||||
for (List<T> row : table) {
|
||||
col.add(row.get(i));
|
||||
}
|
||||
ret.add(col);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {
|
||||
|
||||
if (rowsOfCells.isEmpty()) {
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
var colsOfCells = transpose(rowsOfCells);
|
||||
colsOfCells = removeEmptyRows(colsOfCells);
|
||||
return transpose(colsOfCells);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {
|
||||
|
||||
return rowsOfCells.stream()
|
||||
.filter(row -> row.stream()
|
||||
.anyMatch(cell -> !cell.getTextBlocks().isEmpty()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
class LinkedCell {
|
||||
|
||||
private final Cell originalCell;
|
||||
private final List<LinkedCell> rights;
|
||||
private final List<LinkedCell> lefts;
|
||||
private final List<LinkedCell> aboves;
|
||||
private final List<LinkedCell> belows;
|
||||
|
||||
|
||||
LinkedCell(Cell cell) {
|
||||
|
||||
this.originalCell = cell;
|
||||
this.rights = new LinkedList<>();
|
||||
this.lefts = new LinkedList<>();
|
||||
this.aboves = new LinkedList<>();
|
||||
this.belows = new LinkedList<>();
|
||||
}
|
||||
|
||||
|
||||
public boolean needsSplit() {
|
||||
|
||||
return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
|
||||
}
|
||||
|
||||
|
||||
public boolean isTopLeft() {
|
||||
|
||||
return lefts.isEmpty() && aboves.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return originalCell.toString();
|
||||
}
|
||||
|
||||
|
||||
public Collection<LinkedCell> split() {
|
||||
|
||||
if (rights.size() > 1 && rights.size() >= lefts.size()) {
|
||||
return splitY(rights);
|
||||
}
|
||||
if (lefts.size() > 1) {
|
||||
return splitY(lefts);
|
||||
}
|
||||
if (aboves.size() > 1 && aboves.size() >= belows.size()) {
|
||||
return splitX(aboves);
|
||||
}
|
||||
if (belows.size() > 1) {
|
||||
return splitX(belows);
|
||||
}
|
||||
return List.of(this);
|
||||
}
|
||||
|
||||
|
||||
private List<LinkedCell> splitY(List<LinkedCell> neighbours) {
|
||||
|
||||
List<LinkedCell> splitCells = new LinkedList<>();
|
||||
List<Double> ySplit = neighbours.stream()
|
||||
.map(right -> right.originalCell.getMaxY())
|
||||
.sorted()
|
||||
.toList();
|
||||
Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
|
||||
double maxX = originalCell.getBBox().getMaxX();
|
||||
double x = originalCell.getBBox().getX();
|
||||
double maxY = originalCell.getBBox().getMaxY();
|
||||
for (Double neighborY : ySplit) {
|
||||
double y = Math.min(neighborY, maxY);
|
||||
Point2D bottomRight = new Point2D.Double(maxX, y);
|
||||
Cell cell = copyCell(topLeft, bottomRight);
|
||||
splitCells.add(new LinkedCell(cell));
|
||||
topLeft = new Point2D.Double(x, y);
|
||||
}
|
||||
return splitCells;
|
||||
}
|
||||
|
||||
|
||||
private List<LinkedCell> splitX(List<LinkedCell> neighbours) {
|
||||
|
||||
List<LinkedCell> splitCells = new LinkedList<>();
|
||||
List<Double> xSplit = neighbours.stream()
|
||||
.map(right -> right.originalCell.getMaxX())
|
||||
.sorted()
|
||||
.toList();
|
||||
Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
|
||||
double maxY = originalCell.getBBox().getMaxY();
|
||||
double y = originalCell.getBBox().getY();
|
||||
double maxX = originalCell.getBBox().getMaxX();
|
||||
for (Double neighborX : xSplit) {
|
||||
double x = Math.min(neighborX, maxX);
|
||||
Point2D bottomRight = new Point2D.Double(x, maxY);
|
||||
Cell cell = copyCell(topLeft, bottomRight);
|
||||
splitCells.add(new LinkedCell(cell));
|
||||
topLeft = new Point2D.Double(x, y);
|
||||
}
|
||||
return splitCells;
|
||||
}
|
||||
|
||||
|
||||
private Cell copyCell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
|
||||
cell.setHeaderCell(originalCell.isHeaderCell());
|
||||
cell.setTextBlocks(originalCell.getTextBlocks());
|
||||
return cell;
|
||||
}
|
||||
|
||||
|
||||
public void resetNeighbours() {
|
||||
|
||||
rights.clear();
|
||||
lefts.clear();
|
||||
aboves.clear();
|
||||
belows.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -28,7 +28,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ListIdentifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextBlockOnPage;
|
||||
|
||||
@ -40,7 +40,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.PageContent
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.AbstractTest;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user