diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 0710ddd..e6ed46d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -181,22 +181,10 @@ public class LayoutParsingPipeline { Layout parsing has finished in %.02f s. identifiers: %s %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", + """, ((float) (System.currentTimeMillis() - start)) / 1000, layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()))) .layoutParserVersion(layoutParserVersion) .build(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/QuadPointGridifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/QuadPointGridifier.java index b2f285a..b764587 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/QuadPointGridifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/QuadPointGridifier.java @@ -18,10 +18,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQu import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import lombok.AllArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j +@AllArgsConstructor public class QuadPointGridifier { public static final int MAX_SPLITTING_ITERATIONS = 10; @@ -51,6 +53,16 @@ public class QuadPointGridifier { } + @SneakyThrows + public static QuadPointGridifier fromCells(Collection cells, AffineTransform pdfToPageTransform) { + + var qpCells = cells.stream() + .map(cell -> new LinkedQuadPointCell(QuadPoint.fromRectangle2D(cell.getBBox()), cell.getTextBlocks())) + .collect(Collectors.toSet()); + return new QuadPointGridifier(qpCells, pdfToPageTransform); + } + + public Stream horizontalLines(QuadPoint quadPoint) { return Stream.of(quadPoint.getBottomLine(), quadPoint.getTopLine()); @@ -110,26 +122,41 @@ public class QuadPointGridifier { if (cells.isEmpty()) { return Collections.emptyList(); } - List> rows = buildRows(cells); - List> cellRows = mapToCells(rows); + List> rows = buildRows(cells); if (isNotRectangular(rows)) { - log.error("Non rectangular table on page {}", - cells.stream() - .map(LinkedQuadPointCell::getPageBlocks) - .flatMap(List::stream) - .map(AbstractPageBlock::getWords) - .flatMap(Collection::stream) - .map(Word::getPage) - .findAny().orElse(0)); - // sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows. + log.warn("Non rectangular table on page {}, using fallback algorithm.", getPageNumber(cells)); + // Sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows. + // Might also happen, if more than MAX_SPLITTING_ITERATIONS splits are required. // Then we use the area sweep algorithm as a fallback. - return AreaSweepGridifier.gridify(this.cells.stream() - .map(this::toCell) - .toList(), pageToPdfTransform, minCellWidth, minCellHeight); + return areaSweepFallback(); } - cellRows = removeEmptyRows(cellRows); - cellRows = removeEmptyCols(cellRows); - return cellRows; + rows = removeEmptyRows(rows); + rows = removeEmptyCols(rows); + return rows; + } + + + private static Integer getPageNumber(List cells) { + + return cells.stream() + .map(LinkedQuadPointCell::getPageBlocks) + .flatMap(List::stream) + .map(AbstractPageBlock::getWords) + .flatMap(Collection::stream) + .map(Word::getPage) + .findAny().orElse(0); + } + + + private List> areaSweepFallback() { + + List cells = this.cells.stream() + .map(this::toCell) + .toList(); + List> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight); + rows = removeEmptyRows(rows); + rows = removeEmptyCols(rows); + return rows; } @@ -152,7 +179,7 @@ public class QuadPointGridifier { } - private boolean isNotRectangular(List> rows) { + private boolean isNotRectangular(List> rows) { if (rows.isEmpty()) { return true; @@ -163,14 +190,15 @@ public class QuadPointGridifier { } - private List> buildRows(List cells) { + private List> buildRows(List cells) { List topLeftCandidates = cells.stream() .filter(LinkedQuadPointCell::isTopLeft) .toList(); if (topLeftCandidates.size() != 1) { - log.error("More than one top-left cell found!"); + log.warn("More than one top left candidate on page {}, using fallback algorithm.", getPageNumber(cells)); + return areaSweepFallback(); } var cell = topLeftCandidates.get(0); @@ -180,7 +208,7 @@ public class QuadPointGridifier { cell = cell.getBelows().get(0); rows.add(buildRow(cell)); } - return rows; + return mapToCells(rows); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java index fa2ad62..5679295 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables; import java.awt.geom.AffineTransform; import java.util.ArrayList; import java.util.Collection; -import java.util.LinkedList; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; @@ -126,7 +125,7 @@ public class TableFromCellsExtractor { return; } - TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform); + QuadPointGridifier calculator = QuadPointGridifier.fromCells(cells, pdfToPageTransform); rows = calculator.gridify(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java deleted file mode 100644 index fcc40cb..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java +++ /dev/null @@ -1,360 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.tables; - -import java.awt.geom.AffineTransform; -import java.awt.geom.Point2D; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -public class TableGridStructureCalculator { - - // multiplied with minimum cell height/width, Cells may be at most this apart in one dimension, and must overlap at least that much in the other dimension to be considered neighbours - private static final double DISTANCE_FACTOR = 0.5; - private static final int MAX_SPLITTING_ITERATIONS = 10; - Set cells; - AffineTransform pageToPdfTransform; - double minCellHeight; - double minCellWidth; - - - @SneakyThrows - TableGridStructureCalculator(Collection cells, AffineTransform pdfToPageTransform) { - - this.cells = new HashSet<>(cells); - this.pageToPdfTransform = pdfToPageTransform.createInverse(); - this.minCellHeight = cells.stream() - .mapToDouble(cell -> cell.getBBox().getHeight()) - .min().orElse(0); - this.minCellWidth = cells.stream() - .mapToDouble(cell -> cell.getBBox().getWidth()) - .min().orElse(0); - } - - - /** - * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. - * Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors. - * This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell. - * - * @return TablePageBlock Structure as a rows of cells matrix - */ - public List> gridify() { - - if (cellsHaveLargeOverlaps()) { - // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation. - return areaSweepFallback(); - } - - var linkedCells = cells.stream() - .map(LinkedCell::new) - .collect(Collectors.toList()); - - computeNeighbours(linkedCells); - int splits = 0; - while (linkedCells.stream() - .anyMatch(LinkedCell::needsSplit) && splits <= MAX_SPLITTING_ITERATIONS) { - - List newCells = new LinkedList<>(); - for (LinkedCell linkedCell : linkedCells) { - if (linkedCell.needsSplit()) { - newCells.addAll(linkedCell.split()); - } else { - newCells.add(linkedCell); - } - } - computeNeighbours(newCells); - linkedCells = newCells; - splits++; - } - return buildStructure(linkedCells); - } - - - private List> areaSweepFallback() { - - List> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight); - rows = removeEmptyRows(rows); - rows = removeEmptyCols(rows); - return rows; - } - - - private boolean cellsHaveLargeOverlaps() { - - for (Cell cell1 : cells) { - for (Cell cell2 : cells) { - if (cell1.equals(cell2)) { - continue; - } - if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR // - && cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) { - return true; - } - } - } - return false; - } - - - private List> buildStructure(List cells) { - - if (cells.isEmpty()) { - return Collections.emptyList(); - } - List> rows = buildRows(cells); - if (isNotRectangular(rows)) { - // For some tables the result is not rectangular, this either happens if cells are missing or the algorithm would need more than the max iterations to solve it. - // This is unacceptable so we revert to the area sweep implementation, which by design will always produce a rectangular result. - return areaSweepFallback(); - } - rows = removeEmptyRows(rows); - rows = removeEmptyCols(rows); - return rows; - } - - - private boolean isNotRectangular(List> rows) { - - if (rows.isEmpty()) { - return true; - } - int n = rows.get(0).size(); - return rows.stream() - .anyMatch(row -> row.size() != n); - } - - - private List> buildRows(List cells) { - - List topLeftCandidates = cells.stream() - .filter(LinkedCell::isTopLeft) - .toList(); - - assert topLeftCandidates.size() == 1; - var cell = topLeftCandidates.get(0); - - List> rows = new ArrayList<>(); - rows.add(buildRow(cell)); - while (!cell.belows.isEmpty()) { - cell = cell.belows.get(0); - rows.add(buildRow(cell)); - } - return rows; - } - - - private static List buildRow(LinkedCell cell) { - - List currentRow = new ArrayList<>(); - LinkedCell nextCell = cell; - currentRow.add(cell.originalCell); - while (!nextCell.rights.isEmpty()) { - nextCell = nextCell.rights.get(0); - currentRow.add(nextCell.originalCell); - } - return currentRow; - } - - - private void computeNeighbours(List cells) { - - for (LinkedCell cell : cells) { - cell.resetNeighbours(); - computeNeighbours(cell, cells); - } - - } - - - private void computeNeighbours(LinkedCell cell, List otherCells) { - - for (LinkedCell otherCell : otherCells) { - if (cell.equals(otherCell)) { - continue; - } - if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR - && cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) { - if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) { - cell.rights.add(otherCell); - } else { - cell.lefts.add(otherCell); - } - } else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR - && cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) { - if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) { - cell.belows.add(otherCell); - } else { - cell.aboves.add(otherCell); - } - } - } - - } - - - static List> transpose(List> table) { - - List> ret = new ArrayList>(); - final int N = table.get(0).size(); - for (int i = 0; i < N; i++) { - List col = new ArrayList(); - for (List row : table) { - col.add(row.get(i)); - } - ret.add(col); - } - return ret; - } - - - private List> removeEmptyCols(List> rowsOfCells) { - - if (rowsOfCells.isEmpty()) { - return rowsOfCells; - } - - var colsOfCells = transpose(rowsOfCells); - colsOfCells = removeEmptyRows(colsOfCells); - return transpose(colsOfCells); - } - - - private List> removeEmptyRows(List> rowsOfCells) { - - return rowsOfCells.stream() - .filter(row -> row.stream() - .anyMatch(cell -> !cell.getTextBlocks().isEmpty())) - .collect(Collectors.toList()); - } - - - class LinkedCell { - - private final Cell originalCell; - private final List rights; - private final List lefts; - private final List aboves; - private final List belows; - - - LinkedCell(Cell cell) { - - this.originalCell = cell; - this.rights = new LinkedList<>(); - this.lefts = new LinkedList<>(); - this.aboves = new LinkedList<>(); - this.belows = new LinkedList<>(); - } - - - public boolean needsSplit() { - - return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1; - } - - - public boolean isTopLeft() { - - return lefts.isEmpty() && aboves.isEmpty(); - } - - - public String toString() { - - return originalCell.toString(); - } - - - public Collection split() { - - if (rights.size() > 1 && rights.size() >= lefts.size()) { - return splitY(rights); - } - if (lefts.size() > 1) { - return splitY(lefts); - } - if (aboves.size() > 1 && aboves.size() >= belows.size()) { - return splitX(aboves); - } - if (belows.size() > 1) { - return splitX(belows); - } - return List.of(this); - } - - - private List splitY(List neighbours) { - - List splitCells = new LinkedList<>(); - List ySplit = neighbours.stream() - .map(right -> right.originalCell.getMaxY()) - .sorted() - .toList(); - Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); - double maxX = originalCell.getBBox().getMaxX(); - double x = originalCell.getBBox().getX(); - double maxY = originalCell.getBBox().getMaxY(); - for (Double neighborY : ySplit) { - double y = Math.min(neighborY, maxY); - Point2D bottomRight = new Point2D.Double(maxX, y); - Cell cell = copyCell(topLeft, bottomRight); - splitCells.add(new LinkedCell(cell)); - topLeft = new Point2D.Double(x, y); - } - return splitCells; - } - - - private List splitX(List neighbours) { - - List splitCells = new LinkedList<>(); - List xSplit = neighbours.stream() - .map(right -> right.originalCell.getMaxX()) - .sorted() - .toList(); - Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); - double maxY = originalCell.getBBox().getMaxY(); - double y = originalCell.getBBox().getY(); - double maxX = originalCell.getBBox().getMaxX(); - for (Double neighborX : xSplit) { - double x = Math.min(neighborX, maxX); - Point2D bottomRight = new Point2D.Double(x, maxY); - Cell cell = copyCell(topLeft, bottomRight); - splitCells.add(new LinkedCell(cell)); - topLeft = new Point2D.Double(x, y); - } - return splitCells; - } - - - private Cell copyCell(Point2D topLeft, Point2D bottomRight) { - - Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform); - cell.setHeaderCell(originalCell.isHeaderCell()); - cell.setTextBlocks(originalCell.getTextBlocks()); - return cell; - } - - - public void resetNeighbours() { - - rights.clear(); - lefts.clear(); - aboves.clear(); - belows.clear(); - } - - } - -} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 3432ff0..f4634fb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -2,12 +2,16 @@ package com.knecon.fforesight.service.layoutparser.server; import java.io.File; import java.io.FileInputStream; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Disabled; @@ -37,13 +41,14 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf"; + String filePath = "/home/kschuettler/Dokumente/TestFiles/RM syngenta standard/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10.pdf"; runForFile(filePath); } @Test + @SneakyThrows public void testLayoutParserEndToEndWithIdpResult() { String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/viewerDocument.pdf"; @@ -58,7 +63,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @SneakyThrows public void testLayoutParserEndToEndWithFolder() { - String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9975"; + String folder = "/home/kschuettler/Dokumente/TestFiles/RM syngenta standard"; List pdfFiles = Files.walk(Path.of(folder)) .filter(path -> path.getFileName().toString().endsWith(".pdf")) .sorted(Comparator.comparing(Path::getFileName)) @@ -67,20 +72,37 @@ public class LayoutparserEnd2EndTest extends AbstractTest { System.out.printf("Found %d pdf files to process %n", pdfFiles.size()); AtomicInteger count = new AtomicInteger(0); + List errorFiles = Collections.synchronizedList(new ArrayList<>()); pdfFiles.stream() - .peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName())) - .forEach(path -> runForFile(path.toFile().toString())); + .peek(path -> log.info("[{}/{}]: {}", count.getAndIncrement(), pdfFiles.size(), path.getFileName())) + .forEach(path -> runForFiles(path.toFile().toString(), errorFiles)); + if (!errorFiles.isEmpty()) { + log.error("Errors occurred in files:\n{}", String.join("\n", errorFiles)); + throw new AssertionError(); + } } + private void runForFiles(String filePath, List errorFiles) { + + try { + runForFile(filePath, null); + log.info("File {} processed successfully", filePath); + } catch (Throwable e) { + log.error("File {} failed with exception", filePath, e); + errorFiles.add(filePath); + } + } + + + @SneakyThrows private void runForFile(String filePath) { runForFile(filePath, null); } - @SneakyThrows - private void runForFile(String filePath, String idpResultPath) { + private void runForFile(String filePath, String idpResultPath) throws IOException { String fileName = Path.of(filePath).getFileName().toString(); File file;