RED-8670: move all gridification to QuadPointGridifier
This commit is contained in:
parent
6f6cae594f
commit
065abc5ae2
@ -181,22 +181,10 @@ public class LayoutParsingPipeline {
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts())))
|
||||
.layoutParserVersion(layoutParserVersion)
|
||||
.build();
|
||||
|
||||
|
||||
@ -18,10 +18,12 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQu
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
public class QuadPointGridifier {
|
||||
|
||||
public static final int MAX_SPLITTING_ITERATIONS = 10;
|
||||
@ -51,6 +53,16 @@ public class QuadPointGridifier {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static QuadPointGridifier fromCells(Collection<Cell> cells, AffineTransform pdfToPageTransform) {
|
||||
|
||||
var qpCells = cells.stream()
|
||||
.map(cell -> new LinkedQuadPointCell(QuadPoint.fromRectangle2D(cell.getBBox()), cell.getTextBlocks()))
|
||||
.collect(Collectors.toSet());
|
||||
return new QuadPointGridifier(qpCells, pdfToPageTransform);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Line2D> horizontalLines(QuadPoint quadPoint) {
|
||||
|
||||
return Stream.of(quadPoint.getBottomLine(), quadPoint.getTopLine());
|
||||
@ -110,26 +122,41 @@ public class QuadPointGridifier {
|
||||
if (cells.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<List<LinkedQuadPointCell>> rows = buildRows(cells);
|
||||
List<List<Cell>> cellRows = mapToCells(rows);
|
||||
List<List<Cell>> rows = buildRows(cells);
|
||||
if (isNotRectangular(rows)) {
|
||||
log.error("Non rectangular table on page {}",
|
||||
cells.stream()
|
||||
.map(LinkedQuadPointCell::getPageBlocks)
|
||||
.flatMap(List::stream)
|
||||
.map(AbstractPageBlock::getWords)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Word::getPage)
|
||||
.findAny().orElse(0));
|
||||
// sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows.
|
||||
log.warn("Non rectangular table on page {}, using fallback algorithm.", getPageNumber(cells));
|
||||
// Sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows.
|
||||
// Might also happen, if more than MAX_SPLITTING_ITERATIONS splits are required.
|
||||
// Then we use the area sweep algorithm as a fallback.
|
||||
return AreaSweepGridifier.gridify(this.cells.stream()
|
||||
.map(this::toCell)
|
||||
.toList(), pageToPdfTransform, minCellWidth, minCellHeight);
|
||||
return areaSweepFallback();
|
||||
}
|
||||
cellRows = removeEmptyRows(cellRows);
|
||||
cellRows = removeEmptyCols(cellRows);
|
||||
return cellRows;
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private static Integer getPageNumber(List<LinkedQuadPointCell> cells) {
|
||||
|
||||
return cells.stream()
|
||||
.map(LinkedQuadPointCell::getPageBlocks)
|
||||
.flatMap(List::stream)
|
||||
.map(AbstractPageBlock::getWords)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Word::getPage)
|
||||
.findAny().orElse(0);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> areaSweepFallback() {
|
||||
|
||||
List<Cell> cells = this.cells.stream()
|
||||
.map(this::toCell)
|
||||
.toList();
|
||||
List<List<Cell>> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
@ -152,7 +179,7 @@ public class QuadPointGridifier {
|
||||
}
|
||||
|
||||
|
||||
private boolean isNotRectangular(List<List<LinkedQuadPointCell>> rows) {
|
||||
private boolean isNotRectangular(List<List<Cell>> rows) {
|
||||
|
||||
if (rows.isEmpty()) {
|
||||
return true;
|
||||
@ -163,14 +190,15 @@ public class QuadPointGridifier {
|
||||
}
|
||||
|
||||
|
||||
private List<List<LinkedQuadPointCell>> buildRows(List<LinkedQuadPointCell> cells) {
|
||||
private List<List<Cell>> buildRows(List<LinkedQuadPointCell> cells) {
|
||||
|
||||
List<LinkedQuadPointCell> topLeftCandidates = cells.stream()
|
||||
.filter(LinkedQuadPointCell::isTopLeft)
|
||||
.toList();
|
||||
|
||||
if (topLeftCandidates.size() != 1) {
|
||||
log.error("More than one top-left cell found!");
|
||||
log.warn("More than one top left candidate on page {}, using fallback algorithm.", getPageNumber(cells));
|
||||
return areaSweepFallback();
|
||||
}
|
||||
var cell = topLeftCandidates.get(0);
|
||||
|
||||
@ -180,7 +208,7 @@ public class QuadPointGridifier {
|
||||
cell = cell.getBelows().get(0);
|
||||
rows.add(buildRow(cell));
|
||||
}
|
||||
return rows;
|
||||
return mapToCells(rows);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
@ -126,7 +125,7 @@ public class TableFromCellsExtractor {
|
||||
return;
|
||||
}
|
||||
|
||||
TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
|
||||
QuadPointGridifier calculator = QuadPointGridifier.fromCells(cells, pdfToPageTransform);
|
||||
rows = calculator.gridify();
|
||||
}
|
||||
|
||||
|
||||
@ -1,360 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TableGridStructureCalculator {
|
||||
|
||||
// multiplied with minimum cell height/width, Cells may be at most this apart in one dimension, and must overlap at least that much in the other dimension to be considered neighbours
|
||||
private static final double DISTANCE_FACTOR = 0.5;
|
||||
private static final int MAX_SPLITTING_ITERATIONS = 10;
|
||||
Set<Cell> cells;
|
||||
AffineTransform pageToPdfTransform;
|
||||
double minCellHeight;
|
||||
double minCellWidth;
|
||||
|
||||
|
||||
/**
 * Copies the cells into a set, stores the inverse of the given transform and records the smallest
 * bounding-box height and width among the cells (0 when there are no cells).
 *
 * @param cells              the table cells to arrange in a grid
 * @param pdfToPageTransform transform whose inverse is kept as {@code pageToPdfTransform}
 */
@SneakyThrows
TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {

    this.cells = new HashSet<>(cells);
    // createInverse() declares a checked exception, hence @SneakyThrows
    this.pageToPdfTransform = pdfToPageTransform.createInverse();
    this.minCellHeight = cells.stream()
            .map(Cell::getBBox)
            .mapToDouble(bBox -> bBox.getHeight())
            .min()
            .orElse(0);
    this.minCellWidth = cells.stream()
            .map(Cell::getBBox)
            .mapToDouble(bBox -> bBox.getWidth())
            .min()
            .orElse(0);
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
* Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors.
|
||||
* This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell.
|
||||
*
|
||||
* @return TablePageBlock Structure as a rows of cells matrix
|
||||
*/
|
||||
public List<List<Cell>> gridify() {
|
||||
|
||||
if (cellsHaveLargeOverlaps()) {
|
||||
// If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation.
|
||||
return areaSweepFallback();
|
||||
}
|
||||
|
||||
var linkedCells = cells.stream()
|
||||
.map(LinkedCell::new)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
computeNeighbours(linkedCells);
|
||||
int splits = 0;
|
||||
while (linkedCells.stream()
|
||||
.anyMatch(LinkedCell::needsSplit) && splits <= MAX_SPLITTING_ITERATIONS) {
|
||||
|
||||
List<LinkedCell> newCells = new LinkedList<>();
|
||||
for (LinkedCell linkedCell : linkedCells) {
|
||||
if (linkedCell.needsSplit()) {
|
||||
newCells.addAll(linkedCell.split());
|
||||
} else {
|
||||
newCells.add(linkedCell);
|
||||
}
|
||||
}
|
||||
computeNeighbours(newCells);
|
||||
linkedCells = newCells;
|
||||
splits++;
|
||||
}
|
||||
return buildStructure(linkedCells);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> areaSweepFallback() {
|
||||
|
||||
List<List<Cell>> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private boolean cellsHaveLargeOverlaps() {
|
||||
|
||||
for (Cell cell1 : cells) {
|
||||
for (Cell cell2 : cells) {
|
||||
if (cell1.equals(cell2)) {
|
||||
continue;
|
||||
}
|
||||
if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR //
|
||||
&& cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> buildStructure(List<LinkedCell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<List<Cell>> rows = buildRows(cells);
|
||||
if (isNotRectangular(rows)) {
|
||||
// For some tables the result is not rectangular, this either happens if cells are missing or the algorithm would need more than the max iterations to solve it.
|
||||
// This is unacceptable so we revert to the area sweep implementation, which by design will always produce a rectangular result.
|
||||
return areaSweepFallback();
|
||||
}
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private boolean isNotRectangular(List<List<Cell>> rows) {
|
||||
|
||||
if (rows.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
int n = rows.get(0).size();
|
||||
return rows.stream()
|
||||
.anyMatch(row -> row.size() != n);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> buildRows(List<LinkedCell> cells) {
|
||||
|
||||
List<LinkedCell> topLeftCandidates = cells.stream()
|
||||
.filter(LinkedCell::isTopLeft)
|
||||
.toList();
|
||||
|
||||
assert topLeftCandidates.size() == 1;
|
||||
var cell = topLeftCandidates.get(0);
|
||||
|
||||
List<List<Cell>> rows = new ArrayList<>();
|
||||
rows.add(buildRow(cell));
|
||||
while (!cell.belows.isEmpty()) {
|
||||
cell = cell.belows.get(0);
|
||||
rows.add(buildRow(cell));
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private static List<Cell> buildRow(LinkedCell cell) {
|
||||
|
||||
List<Cell> currentRow = new ArrayList<>();
|
||||
LinkedCell nextCell = cell;
|
||||
currentRow.add(cell.originalCell);
|
||||
while (!nextCell.rights.isEmpty()) {
|
||||
nextCell = nextCell.rights.get(0);
|
||||
currentRow.add(nextCell.originalCell);
|
||||
}
|
||||
return currentRow;
|
||||
}
|
||||
|
||||
|
||||
private void computeNeighbours(List<LinkedCell> cells) {
|
||||
|
||||
for (LinkedCell cell : cells) {
|
||||
cell.resetNeighbours();
|
||||
computeNeighbours(cell, cells);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {
|
||||
|
||||
for (LinkedCell otherCell : otherCells) {
|
||||
if (cell.equals(otherCell)) {
|
||||
continue;
|
||||
}
|
||||
if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR
|
||||
&& cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) {
|
||||
if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) {
|
||||
cell.rights.add(otherCell);
|
||||
} else {
|
||||
cell.lefts.add(otherCell);
|
||||
}
|
||||
} else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR
|
||||
&& cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) {
|
||||
if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) {
|
||||
cell.belows.add(otherCell);
|
||||
} else {
|
||||
cell.aboves.add(otherCell);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
static <T> List<List<T>> transpose(List<List<T>> table) {
|
||||
|
||||
List<List<T>> ret = new ArrayList<List<T>>();
|
||||
final int N = table.get(0).size();
|
||||
for (int i = 0; i < N; i++) {
|
||||
List<T> col = new ArrayList<T>();
|
||||
for (List<T> row : table) {
|
||||
col.add(row.get(i));
|
||||
}
|
||||
ret.add(col);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {
|
||||
|
||||
if (rowsOfCells.isEmpty()) {
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
var colsOfCells = transpose(rowsOfCells);
|
||||
colsOfCells = removeEmptyRows(colsOfCells);
|
||||
return transpose(colsOfCells);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {
|
||||
|
||||
return rowsOfCells.stream()
|
||||
.filter(row -> row.stream()
|
||||
.anyMatch(cell -> !cell.getTextBlocks().isEmpty()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
class LinkedCell {
|
||||
|
||||
private final Cell originalCell;
|
||||
private final List<LinkedCell> rights;
|
||||
private final List<LinkedCell> lefts;
|
||||
private final List<LinkedCell> aboves;
|
||||
private final List<LinkedCell> belows;
|
||||
|
||||
|
||||
LinkedCell(Cell cell) {
|
||||
|
||||
this.originalCell = cell;
|
||||
this.rights = new LinkedList<>();
|
||||
this.lefts = new LinkedList<>();
|
||||
this.aboves = new LinkedList<>();
|
||||
this.belows = new LinkedList<>();
|
||||
}
|
||||
|
||||
|
||||
public boolean needsSplit() {
|
||||
|
||||
return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
|
||||
}
|
||||
|
||||
|
||||
public boolean isTopLeft() {
|
||||
|
||||
return lefts.isEmpty() && aboves.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return originalCell.toString();
|
||||
}
|
||||
|
||||
|
||||
public Collection<LinkedCell> split() {
|
||||
|
||||
if (rights.size() > 1 && rights.size() >= lefts.size()) {
|
||||
return splitY(rights);
|
||||
}
|
||||
if (lefts.size() > 1) {
|
||||
return splitY(lefts);
|
||||
}
|
||||
if (aboves.size() > 1 && aboves.size() >= belows.size()) {
|
||||
return splitX(aboves);
|
||||
}
|
||||
if (belows.size() > 1) {
|
||||
return splitX(belows);
|
||||
}
|
||||
return List.of(this);
|
||||
}
|
||||
|
||||
|
||||
private List<LinkedCell> splitY(List<LinkedCell> neighbours) {
|
||||
|
||||
List<LinkedCell> splitCells = new LinkedList<>();
|
||||
List<Double> ySplit = neighbours.stream()
|
||||
.map(right -> right.originalCell.getMaxY())
|
||||
.sorted()
|
||||
.toList();
|
||||
Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
|
||||
double maxX = originalCell.getBBox().getMaxX();
|
||||
double x = originalCell.getBBox().getX();
|
||||
double maxY = originalCell.getBBox().getMaxY();
|
||||
for (Double neighborY : ySplit) {
|
||||
double y = Math.min(neighborY, maxY);
|
||||
Point2D bottomRight = new Point2D.Double(maxX, y);
|
||||
Cell cell = copyCell(topLeft, bottomRight);
|
||||
splitCells.add(new LinkedCell(cell));
|
||||
topLeft = new Point2D.Double(x, y);
|
||||
}
|
||||
return splitCells;
|
||||
}
|
||||
|
||||
|
||||
private List<LinkedCell> splitX(List<LinkedCell> neighbours) {
|
||||
|
||||
List<LinkedCell> splitCells = new LinkedList<>();
|
||||
List<Double> xSplit = neighbours.stream()
|
||||
.map(right -> right.originalCell.getMaxX())
|
||||
.sorted()
|
||||
.toList();
|
||||
Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
|
||||
double maxY = originalCell.getBBox().getMaxY();
|
||||
double y = originalCell.getBBox().getY();
|
||||
double maxX = originalCell.getBBox().getMaxX();
|
||||
for (Double neighborX : xSplit) {
|
||||
double x = Math.min(neighborX, maxX);
|
||||
Point2D bottomRight = new Point2D.Double(x, maxY);
|
||||
Cell cell = copyCell(topLeft, bottomRight);
|
||||
splitCells.add(new LinkedCell(cell));
|
||||
topLeft = new Point2D.Double(x, y);
|
||||
}
|
||||
return splitCells;
|
||||
}
|
||||
|
||||
|
||||
private Cell copyCell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
|
||||
cell.setHeaderCell(originalCell.isHeaderCell());
|
||||
cell.setTextBlocks(originalCell.getTextBlocks());
|
||||
return cell;
|
||||
}
|
||||
|
||||
|
||||
public void resetNeighbours() {
|
||||
|
||||
rights.clear();
|
||||
lefts.clear();
|
||||
aboves.clear();
|
||||
belows.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,12 +2,16 @@ package com.knecon.fforesight.service.layoutparser.server;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
@ -37,13 +41,14 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Test
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf";
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/RM syngenta standard/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithIdpResult() {
|
||||
|
||||
String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/viewerDocument.pdf";
|
||||
@ -58,7 +63,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEndWithFolder() {
|
||||
|
||||
String folder = "/home/kschuettler/Dokumente/Ticket Related/RED-9975";
|
||||
String folder = "/home/kschuettler/Dokumente/TestFiles/RM syngenta standard";
|
||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.sorted(Comparator.comparing(Path::getFileName))
|
||||
@ -67,20 +72,37 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
|
||||
System.out.printf("Found %d pdf files to process %n", pdfFiles.size());
|
||||
AtomicInteger count = new AtomicInteger(0);
|
||||
List<String> errorFiles = Collections.synchronizedList(new ArrayList<>());
|
||||
pdfFiles.stream()
|
||||
.peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
|
||||
.forEach(path -> runForFile(path.toFile().toString()));
|
||||
.peek(path -> log.info("[{}/{}]: {}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
|
||||
.forEach(path -> runForFiles(path.toFile().toString(), errorFiles));
|
||||
if (!errorFiles.isEmpty()) {
|
||||
log.error("Errors occurred in files:\n{}", String.join("\n", errorFiles));
|
||||
throw new AssertionError();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void runForFiles(String filePath, List<String> errorFiles) {
|
||||
|
||||
try {
|
||||
runForFile(filePath, null);
|
||||
log.info("File {} processed successfully", filePath);
|
||||
} catch (Throwable e) {
|
||||
log.error("File {} failed with exception", filePath, e);
|
||||
errorFiles.add(filePath);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void runForFile(String filePath) {
|
||||
|
||||
runForFile(filePath, null);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void runForFile(String filePath, String idpResultPath) {
|
||||
private void runForFile(String filePath, String idpResultPath) throws IOException {
|
||||
|
||||
String fileName = Path.of(filePath).getFileName().toString();
|
||||
File file;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user