RED-8670: move all gridification to QuadPointGridifier

This commit is contained in:
Kilian Schuettler 2025-01-16 12:52:59 +01:00
parent 6f6cae594f
commit 065abc5ae2
5 changed files with 80 additions and 403 deletions

View File

@ -181,22 +181,10 @@ public class LayoutParsingPipeline {
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts())))
.layoutParserVersion(layoutParserVersion)
.build();

View File

@ -18,10 +18,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQu
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@AllArgsConstructor
public class QuadPointGridifier {
public static final int MAX_SPLITTING_ITERATIONS = 10;
@ -51,6 +53,16 @@ public class QuadPointGridifier {
}
/**
 * Static factory that wraps every given {@link Cell} into a {@link LinkedQuadPointCell} and hands the resulting
 * set to the {@code QuadPointGridifier} constructor together with the given transform.
 * <p>
 * Each linked cell is built from the quad point of the cell's bounding box and the cell's text blocks. The
 * wrapped cells are collected into a set.
 * NOTE(review): the checked exception hidden by {@code @SneakyThrows} is not visible in this excerpt; presumably
 * it originates in the constructor - confirm.
 *
 * @param cells              the cells to gridify
 * @param pdfToPageTransform transform passed on unchanged to the constructor
 * @return a new gridifier operating on the wrapped cells
 */
@SneakyThrows
public static QuadPointGridifier fromCells(Collection<Cell> cells, AffineTransform pdfToPageTransform) {

    var linkedCells = cells.stream()
            .map(cell -> {
                var quadPoint = QuadPoint.fromRectangle2D(cell.getBBox());
                return new LinkedQuadPointCell(quadPoint, cell.getTextBlocks());
            })
            .collect(Collectors.toSet());

    return new QuadPointGridifier(linkedCells, pdfToPageTransform);
}
public Stream<Line2D> horizontalLines(QuadPoint quadPoint) {
return Stream.of(quadPoint.getBottomLine(), quadPoint.getTopLine());
@ -110,26 +122,41 @@ public class QuadPointGridifier {
if (cells.isEmpty()) {
return Collections.emptyList();
}
List<List<LinkedQuadPointCell>> rows = buildRows(cells);
List<List<Cell>> cellRows = mapToCells(rows);
List<List<Cell>> rows = buildRows(cells);
if (isNotRectangular(rows)) {
log.error("Non rectangular table on page {}",
cells.stream()
.map(LinkedQuadPointCell::getPageBlocks)
.flatMap(List::stream)
.map(AbstractPageBlock::getWords)
.flatMap(Collection::stream)
.map(Word::getPage)
.findAny().orElse(0));
// sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows.
log.warn("Non rectangular table on page {}, using fallback algorithm.", getPageNumber(cells));
// Sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows.
// Might also happen, if more than MAX_SPLITTING_ITERATIONS splits are required.
// Then we use the area sweep algorithm as a fallback.
return AreaSweepGridifier.gridify(this.cells.stream()
.map(this::toCell)
.toList(), pageToPdfTransform, minCellWidth, minCellHeight);
return areaSweepFallback();
}
cellRows = removeEmptyRows(cellRows);
cellRows = removeEmptyCols(cellRows);
return cellRows;
rows = removeEmptyRows(rows);
rows = removeEmptyCols(rows);
return rows;
}
/**
 * Looks up a page number for log messages by taking the page of any word found in any page block of the given
 * cells.
 *
 * @param cells the linked cells to search for a word
 * @return the page of some word contained in the cells, or {@code 0} if the cells contain no words at all
 */
private static Integer getPageNumber(List<LinkedQuadPointCell> cells) {

    return cells.stream()
            .flatMap(linkedCell -> linkedCell.getPageBlocks().stream())
            .flatMap(pageBlock -> pageBlock.getWords().stream())
            .map(word -> word.getPage())
            .findAny()
            .orElse(0);
}
/**
 * Fallback gridification: converts all linked cells of this gridifier back to plain cells, lets
 * {@link AreaSweepGridifier} compute the grid and then drops empty rows followed by empty columns.
 *
 * @return the grid produced by the area sweep algorithm, without empty rows and columns
 */
private List<List<Cell>> areaSweepFallback() {

    List<Cell> plainCells = this.cells.stream()
            .map(linkedCell -> toCell(linkedCell))
            .toList();

    List<List<Cell>> grid = AreaSweepGridifier.gridify(plainCells, pageToPdfTransform, minCellWidth, minCellHeight);

    // Rows are filtered first, columns second.
    return removeEmptyCols(removeEmptyRows(grid));
}
@ -152,7 +179,7 @@ public class QuadPointGridifier {
}
private boolean isNotRectangular(List<List<LinkedQuadPointCell>> rows) {
private boolean isNotRectangular(List<List<Cell>> rows) {
if (rows.isEmpty()) {
return true;
@ -163,14 +190,15 @@ public class QuadPointGridifier {
}
private List<List<LinkedQuadPointCell>> buildRows(List<LinkedQuadPointCell> cells) {
private List<List<Cell>> buildRows(List<LinkedQuadPointCell> cells) {
List<LinkedQuadPointCell> topLeftCandidates = cells.stream()
.filter(LinkedQuadPointCell::isTopLeft)
.toList();
if (topLeftCandidates.size() != 1) {
log.error("More than one top-left cell found!");
log.warn("More than one top left candidate on page {}, using fallback algorithm.", getPageNumber(cells));
return areaSweepFallback();
}
var cell = topLeftCandidates.get(0);
@ -180,7 +208,7 @@ public class QuadPointGridifier {
cell = cell.getBelows().get(0);
rows.add(buildRow(cell));
}
return rows;
return mapToCells(rows);
}

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
@ -126,7 +125,7 @@ public class TableFromCellsExtractor {
return;
}
TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
QuadPointGridifier calculator = QuadPointGridifier.fromCells(cells, pdfToPageTransform);
rows = calculator.gridify();
}

View File

@ -1,360 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Computes a rows-of-cells matrix from a loose collection of table cells.
 * <p>
 * Cells are linked to their left/right/above/below neighbours, cells with more than one neighbour on a side are
 * split, and the rows are then read by walking the links starting at the single top-left cell. Whenever this
 * approach cannot produce a rectangular result, {@link AreaSweepGridifier} is used instead.
 */
@Slf4j
public class TableGridStructureCalculator {

    // Multiplied with the minimum cell height/width: cells may be at most this far apart in one dimension and must
    // overlap at least that much in the other dimension to be considered neighbours.
    private static final double DISTANCE_FACTOR = 0.5;
    // Note: the loop in gridify() compares with "<=", so it runs at most MAX_SPLITTING_ITERATIONS + 1 passes.
    private static final int MAX_SPLITTING_ITERATIONS = 10;

    Set<Cell> cells;
    AffineTransform pageToPdfTransform;
    double minCellHeight;
    double minCellWidth;


    /**
     * Stores a copy of the cells, the inverse of the given transform and the smallest cell height/width
     * (0 if there are no cells).
     *
     * @param cells              the cells of one table
     * @param pdfToPageTransform transform whose inverse is used when new cells are created during splitting;
     *                           a non-invertible transform makes {@code createInverse()} throw, which is
     *                           propagated via {@code @SneakyThrows}
     */
    @SneakyThrows
    TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {

        this.cells = new HashSet<>(cells);
        this.pageToPdfTransform = pdfToPageTransform.createInverse();
        this.minCellHeight = cells.stream()
                .mapToDouble(cell -> cell.getBBox().getHeight())
                .min()
                .orElse(0);
        this.minCellWidth = cells.stream()
                .mapToDouble(cell -> cell.getBBox().getWidth())
                .min()
                .orElse(0);
    }


    /**
     * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
     * Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors.
     * This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell.
     *
     * @return TablePageBlock Structure as a rows of cells matrix
     */
    public List<List<Cell>> gridify() {

        if (cellsHaveLargeOverlaps()) {
            // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation.
            return areaSweepFallback();
        }

        List<LinkedCell> linkedCells = cells.stream()
                .map(LinkedCell::new)
                .collect(Collectors.toList());
        computeNeighbours(linkedCells);

        int splits = 0;
        // The cheap counter check goes first; both operands are side-effect free, so the order does not matter.
        while (splits <= MAX_SPLITTING_ITERATIONS && linkedCells.stream()
                .anyMatch(LinkedCell::needsSplit)) {

            List<LinkedCell> newCells = new ArrayList<>();
            for (LinkedCell linkedCell : linkedCells) {
                if (linkedCell.needsSplit()) {
                    newCells.addAll(linkedCell.split());
                } else {
                    newCells.add(linkedCell);
                }
            }
            computeNeighbours(newCells);
            linkedCells = newCells;
            splits++;
        }

        return buildStructure(linkedCells);
    }


    /**
     * Runs the area sweep algorithm on the original cells and removes empty rows and columns from its result.
     */
    private List<List<Cell>> areaSweepFallback() {

        List<List<Cell>> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
        rows = removeEmptyRows(rows);
        rows = removeEmptyCols(rows);
        return rows;
    }


    /**
     * Returns true if any two distinct cells overlap by more than {@code DISTANCE_FACTOR} times the minimum cell
     * width horizontally and by more than {@code DISTANCE_FACTOR} times the minimum cell height vertically.
     */
    private boolean cellsHaveLargeOverlaps() {

        for (Cell cell1 : cells) {
            for (Cell cell2 : cells) {
                if (cell1.equals(cell2)) {
                    continue;
                }
                if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR //
                        && cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) {
                    return true;
                }
            }
        }
        return false;
    }


    /**
     * Turns the linked cells into rows, falling back to the area sweep algorithm if the rows are not rectangular.
     *
     * @param cells linked cells with computed neighbours
     * @return the rows without empty rows and columns; an empty list for empty input
     */
    private List<List<Cell>> buildStructure(List<LinkedCell> cells) {

        if (cells.isEmpty()) {
            return Collections.emptyList();
        }

        List<List<Cell>> rows = buildRows(cells);

        if (isNotRectangular(rows)) {
            // For some tables the result is not rectangular, this either happens if cells are missing or the algorithm would need more than the max iterations to solve it.
            // buildRows also reports an empty (and thus "not rectangular") result if it cannot walk the links reliably.
            // This is unacceptable so we revert to the area sweep implementation, which by design will always produce a rectangular result.
            return areaSweepFallback();
        }

        rows = removeEmptyRows(rows);
        rows = removeEmptyCols(rows);
        return rows;
    }


    /**
     * Returns true if there are no rows or if not all rows have the same number of cells.
     */
    private boolean isNotRectangular(List<List<Cell>> rows) {

        if (rows.isEmpty()) {
            return true;
        }

        int n = rows.get(0).size();
        return rows.stream()
                .anyMatch(row -> row.size() != n);
    }


    /**
     * Reads the rows by starting at the top-left cell and following the first "below" link for each new row.
     * <p>
     * An empty list is returned when the links cannot be walked reliably, i.e. when there is not exactly one
     * top-left cell or when the links form a cycle. An explicit check is used instead of an {@code assert},
     * because assertions are disabled by default at runtime and an empty candidate list would then fail on
     * {@code get(0)}.
     *
     * @param cells linked cells with computed neighbours
     * @return the rows, or an empty list if no reliable structure could be derived
     */
    private List<List<Cell>> buildRows(List<LinkedCell> cells) {

        List<LinkedCell> topLeftCandidates = cells.stream()
                .filter(LinkedCell::isTopLeft)
                .toList();

        if (topLeftCandidates.size() != 1) {
            return Collections.emptyList();
        }

        // A walk without cycles visits each cell at most once, so no chain can be longer than the number of cells.
        int maxChainLength = cells.size();

        List<List<Cell>> rows = new ArrayList<>();
        LinkedCell rowStart = topLeftCandidates.get(0);
        while (true) {
            List<Cell> row = buildRow(rowStart, maxChainLength);
            if (row.isEmpty() || rows.size() >= maxChainLength) {
                // Cycle in the "rights" or "belows" links, e.g. two neighbours with identical centers.
                return Collections.emptyList();
            }
            rows.add(row);
            if (rowStart.belows.isEmpty()) {
                return rows;
            }
            rowStart = rowStart.belows.get(0);
        }
    }


    /**
     * Collects one row by following the first "right" link starting at the given cell.
     *
     * @param cell      first cell of the row
     * @param maxLength upper bound for the row length, used to detect cyclic links
     * @return the cells of the row (never empty), or an empty list if the bound was exceeded
     */
    private static List<Cell> buildRow(LinkedCell cell, int maxLength) {

        List<Cell> currentRow = new ArrayList<>();
        LinkedCell current = cell;
        currentRow.add(current.originalCell);
        while (!current.rights.isEmpty()) {
            if (currentRow.size() >= maxLength) {
                return Collections.emptyList();
            }
            current = current.rights.get(0);
            currentRow.add(current.originalCell);
        }
        return currentRow;
    }


    /**
     * Clears and recomputes the neighbour links of every given cell.
     */
    private void computeNeighbours(List<LinkedCell> cells) {

        for (LinkedCell cell : cells) {
            cell.resetNeighbours();
            computeNeighbours(cell, cells);
        }
    }


    /**
     * Adds every other cell that is close enough in one dimension and overlaps enough in the other dimension to
     * the matching neighbour list of the given cell. The side is chosen by comparing the bounding box centers.
     */
    private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {

        for (LinkedCell otherCell : otherCells) {
            if (cell.equals(otherCell)) {
                continue;
            }
            if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR
                    && cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) {
                if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) {
                    cell.rights.add(otherCell);
                } else {
                    cell.lefts.add(otherCell);
                }
            } else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR
                    && cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) {
                if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) {
                    cell.belows.add(otherCell);
                } else {
                    cell.aboves.add(otherCell);
                }
            }
        }
    }


    /**
     * Swaps rows and columns of a rectangular table.
     *
     * @param table rows of equal length
     * @return the columns of the table as a new list; an empty list for an empty table
     */
    static <T> List<List<T>> transpose(List<List<T>> table) {

        List<List<T>> columns = new ArrayList<>();
        if (table.isEmpty()) {
            return columns;
        }

        final int columnCount = table.get(0).size();
        for (int i = 0; i < columnCount; i++) {
            List<T> column = new ArrayList<>(table.size());
            for (List<T> row : table) {
                column.add(row.get(i));
            }
            columns.add(column);
        }
        return columns;
    }


    /**
     * Removes all columns in which no cell has text blocks, by transposing, filtering rows and transposing back.
     */
    private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {

        if (rowsOfCells.isEmpty()) {
            return rowsOfCells;
        }

        var colsOfCells = transpose(rowsOfCells);
        colsOfCells = removeEmptyRows(colsOfCells);
        return transpose(colsOfCells);
    }


    /**
     * Keeps only the rows in which at least one cell has text blocks.
     */
    private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {

        return rowsOfCells.stream()
                .filter(row -> row.stream()
                        .anyMatch(cell -> !cell.getTextBlocks().isEmpty()))
                .collect(Collectors.toList());
    }


    /**
     * A cell together with its neighbours in all four directions. This is an inner (non-static) class because
     * splitting creates new cells using the transform of the enclosing calculator.
     */
    class LinkedCell {

        private final Cell originalCell;
        private final List<LinkedCell> rights;
        private final List<LinkedCell> lefts;
        private final List<LinkedCell> aboves;
        private final List<LinkedCell> belows;


        LinkedCell(Cell cell) {

            this.originalCell = cell;
            this.rights = new ArrayList<>();
            this.lefts = new ArrayList<>();
            this.aboves = new ArrayList<>();
            this.belows = new ArrayList<>();
        }


        /**
         * A cell needs to be split if it has more than one neighbour on any side.
         */
        public boolean needsSplit() {

            return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
        }


        /**
         * A cell is a top-left cell if it has neither left nor above neighbours.
         */
        public boolean isTopLeft() {

            return lefts.isEmpty() && aboves.isEmpty();
        }


        @Override
        public String toString() {

            return originalCell.toString();
        }


        /**
         * Splits this cell along the borders of the side with multiple neighbours. Horizontal neighbours
         * (right/left) are handled before vertical ones (above/below); within a pair the side with more
         * neighbours wins, the first-checked side winning ties.
         *
         * @return the cells replacing this cell, or just this cell if no side has more than one neighbour
         */
        public Collection<LinkedCell> split() {

            if (rights.size() > 1 && rights.size() >= lefts.size()) {
                return splitY(rights);
            }
            if (lefts.size() > 1) {
                return splitY(lefts);
            }
            if (aboves.size() > 1 && aboves.size() >= belows.size()) {
                return splitX(aboves);
            }
            if (belows.size() > 1) {
                return splitX(belows);
            }
            return List.of(this);
        }


        /**
         * Cuts this cell horizontally at the maximum y of each given neighbour (capped at this cell's maximum y),
         * producing one new cell per neighbour, in ascending order of the cut positions.
         */
        private List<LinkedCell> splitY(List<LinkedCell> neighbours) {

            List<LinkedCell> splitCells = new ArrayList<>();
            List<Double> ySplit = neighbours.stream()
                    .map(neighbour -> neighbour.originalCell.getMaxY())
                    .sorted()
                    .toList();

            Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
            double maxX = originalCell.getBBox().getMaxX();
            double x = originalCell.getBBox().getX();
            double maxY = originalCell.getBBox().getMaxY();

            for (Double neighborY : ySplit) {
                double y = Math.min(neighborY, maxY);
                Point2D bottomRight = new Point2D.Double(maxX, y);
                Cell cell = copyCell(topLeft, bottomRight);
                splitCells.add(new LinkedCell(cell));
                // The next piece starts where this one ended.
                topLeft = new Point2D.Double(x, y);
            }
            return splitCells;
        }


        /**
         * Cuts this cell vertically at the maximum x of each given neighbour (capped at this cell's maximum x),
         * producing one new cell per neighbour, in ascending order of the cut positions.
         */
        private List<LinkedCell> splitX(List<LinkedCell> neighbours) {

            List<LinkedCell> splitCells = new ArrayList<>();
            List<Double> xSplit = neighbours.stream()
                    .map(neighbour -> neighbour.originalCell.getMaxX())
                    .sorted()
                    .toList();

            Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
            double maxY = originalCell.getBBox().getMaxY();
            double y = originalCell.getBBox().getY();
            double maxX = originalCell.getBBox().getMaxX();

            for (Double neighborX : xSplit) {
                double x = Math.min(neighborX, maxX);
                Point2D bottomRight = new Point2D.Double(x, maxY);
                Cell cell = copyCell(topLeft, bottomRight);
                splitCells.add(new LinkedCell(cell));
                // The next piece starts where this one ended.
                topLeft = new Point2D.Double(x, y);
            }
            return splitCells;
        }


        /**
         * Creates a cell spanning the given corners that shares the header flag and text blocks of this cell.
         */
        private Cell copyCell(Point2D topLeft, Point2D bottomRight) {

            Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
            cell.setHeaderCell(originalCell.isHeaderCell());
            cell.setTextBlocks(originalCell.getTextBlocks());
            return cell;
        }


        /**
         * Removes all neighbour links of this cell.
         */
        public void resetNeighbours() {

            rights.clear();
            lefts.clear();
            aboves.clear();
            belows.clear();
        }

    }

}

View File

@ -2,12 +2,16 @@ package com.knecon.fforesight.service.layoutparser.server;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
@ -37,13 +41,14 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf";
String filePath = "/home/kschuettler/Dokumente/TestFiles/RM syngenta standard/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10.pdf";
runForFile(filePath);
}
@Test
@SneakyThrows
public void testLayoutParserEndToEndWithIdpResult() {
String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/viewerDocument.pdf";
@ -58,7 +63,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9975";
String folder = "/home/kschuettler/Dokumente/TestFiles/RM syngenta standard";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))
@ -67,20 +72,37 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
System.out.printf("Found %d pdf files to process %n", pdfFiles.size());
AtomicInteger count = new AtomicInteger(0);
List<String> errorFiles = Collections.synchronizedList(new ArrayList<>());
pdfFiles.stream()
.peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
.forEach(path -> runForFile(path.toFile().toString()));
.peek(path -> log.info("[{}/{}]: {}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
.forEach(path -> runForFiles(path.toFile().toString(), errorFiles));
if (!errorFiles.isEmpty()) {
log.error("Errors occurred in files:\n{}", String.join("\n", errorFiles));
throw new AssertionError();
}
}
/**
 * Runs {@code runForFile(filePath, null)} for one file and records the file as failed instead of propagating
 * the failure, so that the caller can continue with further files.
 *
 * @param filePath   path of the file to process
 * @param errorFiles list to which {@code filePath} is added if processing throws
 */
private void runForFiles(String filePath, List<String> errorFiles) {
try {
runForFile(filePath, null);
log.info("File {} processed successfully", filePath);
// Throwable (not just Exception) is caught, so Errors such as AssertionError are recorded as well.
} catch (Throwable e) {
log.error("File {} failed with exception", filePath, e);
errorFiles.add(filePath);
}
}
/**
 * Convenience overload: delegates to {@code runForFile(filePath, idpResultPath)} with {@code null} as the
 * second argument.
 *
 * @param filePath path of the file to process
 */
@SneakyThrows
private void runForFile(String filePath) {
runForFile(filePath, null);
}
@SneakyThrows
private void runForFile(String filePath, String idpResultPath) {
private void runForFile(String filePath, String idpResultPath) throws IOException {
String fileName = Path.of(filePath).getFileName().toString();
File file;