RED-8550: Faulty table recognition and text duplication leads to huge sections
* cherrypick
This commit is contained in:
parent
3c9049dc8a
commit
18a28e82d0
@ -94,16 +94,21 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
if (layoutParsingRequest.imagesFileStorageId()
|
||||
.isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
if (layoutParsingRequest.tablesFileStorageId()
|
||||
.isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||
.get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||
@ -142,25 +147,25 @@ public class LayoutParsingPipeline {
|
||||
.numberOfPages(documentGraph.getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.build();
|
||||
|
||||
}
|
||||
@ -170,9 +175,9 @@ public class LayoutParsingPipeline {
|
||||
|
||||
AtomicReference<Document> documentReference = new AtomicReference<>();
|
||||
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> {
|
||||
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument));
|
||||
});
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||
.contextualName("build-document-graph")
|
||||
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
|
||||
|
||||
return documentReference.get();
|
||||
}
|
||||
@ -181,14 +186,14 @@ public class LayoutParsingPipeline {
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@ -319,9 +324,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
|
||||
@ -76,4 +76,14 @@ public class Cell extends Rectangle {
|
||||
return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE;
|
||||
}
|
||||
|
||||
public boolean nearlyIntersects(Cell other) {
|
||||
|
||||
if (this.getHeight() <= 0 || other.getHeight() <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = this.getX() + 2;
|
||||
double y0 = this.getY() + 2;
|
||||
return (other.x + other.width > x0 && other.y + other.height > y0 && other.x < x0 + this.getWidth() - 2 && other.y < y0 + this.getHeight() - 2);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,14 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
@ -21,7 +19,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@ -30,10 +28,14 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private List<List<Cell>> rows;
|
||||
@Getter
|
||||
@Setter
|
||||
private List<Cell> cells;
|
||||
|
||||
|
||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||
|
||||
this.cells = cells;
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
@ -50,6 +52,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return getColCount() == 0 || getRowCount() == 0;
|
||||
}
|
||||
|
||||
|
||||
public List<List<Cell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
@ -80,7 +83,10 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
public int getColCount() {
|
||||
|
||||
return getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
return getRows().stream()
|
||||
.mapToInt(List::size)
|
||||
.max()
|
||||
.orElse(0);
|
||||
|
||||
}
|
||||
|
||||
@ -120,7 +126,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
cellsToTheTop.add(rows.get(i)
|
||||
.get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
@ -135,7 +142,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
|
||||
.get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
@ -151,7 +159,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -162,7 +170,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -173,7 +181,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedColCount; j++) {
|
||||
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -187,17 +195,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
private void add(Cell chunk, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cells.put(cp, chunk);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addCells(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
@ -206,11 +203,12 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
|
||||
|
||||
List<List<Cell>> rowsOfCells = calculateStructure(cells);
|
||||
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
|
||||
|
||||
for (int i = 0; i < rowsOfCells.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
|
||||
add(rowsOfCells.get(i).get(j), i, j);
|
||||
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
|
||||
.get(j), i, j);
|
||||
}
|
||||
}
|
||||
|
||||
@ -221,29 +219,36 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
*
|
||||
* @param cells The found cells
|
||||
* @return TablePageBlock Structure
|
||||
* @return TablePageBlock Structure as a rows of cells matrix
|
||||
*/
|
||||
private List<List<Cell>> calculateStructure(List<Cell> cells) {
|
||||
|
||||
List<List<Cell>> matrix = new ArrayList<>();
|
||||
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return matrix;
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
Set<Float> uniqueX = new HashSet<>();
|
||||
Set<Float> uniqueY = new HashSet<>();
|
||||
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
});
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream().sorted().toList();
|
||||
var sortedUniqueY = uniqueY.stream().sorted().toList();
|
||||
var sortedUniqueX = uniqueX.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
var sortedUniqueY = uniqueY.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
|
||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||
|
||||
Float prevY = null;
|
||||
|
||||
for (Float y : sortedUniqueY) {
|
||||
|
||||
List<Cell> row = new ArrayList<>();
|
||||
@ -254,42 +259,81 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (prevY != null && prevX != null) {
|
||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
|
||||
var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst();
|
||||
|
||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||
if (cell.hasMinimumSize()) {
|
||||
|
||||
cells.stream()
|
||||
.filter(cell::nearlyIntersects)
|
||||
.forEach(intersectingCell -> cell.getTextBlocks().addAll(intersectingCell.getTextBlocks()));
|
||||
|
||||
row.add(cell);
|
||||
}
|
||||
}
|
||||
prevX = x;
|
||||
}
|
||||
|
||||
if (prevY != null && prevX != null && !row.isEmpty()) {
|
||||
matrix.add(row);
|
||||
// exclude empty rows and rows where all text blocks are empty
|
||||
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
|
||||
rowsOfCells.add(row);
|
||||
}
|
||||
prevY = y;
|
||||
}
|
||||
|
||||
Collections.reverse(matrix);
|
||||
Collections.reverse(rowsOfCells);
|
||||
|
||||
return matrix;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public boolean intersects(Cell cell1, Cell cell2) {
|
||||
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
|
||||
return false;
|
||||
// now cells are removed which are part of a column without any text blocks
|
||||
// this is done by first computing the transposed matrix, which contains all columns of cells
|
||||
// then the column indices that have to be removed are determined
|
||||
List<List<Cell>> columnsOfCells = new ArrayList<>();
|
||||
int maxRowLength = rowsOfCells.stream()
|
||||
.map(List::size)
|
||||
.max(java.util.Comparator.naturalOrder())
|
||||
.orElse(0);
|
||||
for (int i = 0; i < maxRowLength; i++) {
|
||||
columnsOfCells.add(new ArrayList<>());
|
||||
}
|
||||
double x0 = cell1.getX() + 2;
|
||||
double y0 = cell1.getY() + 2;
|
||||
return (cell2.x + cell2.width > x0 &&
|
||||
cell2.y + cell2.height > y0 &&
|
||||
cell2.x < x0 + cell1.getWidth() -2 &&
|
||||
cell2.y < y0 + cell1.getHeight() -2);
|
||||
|
||||
for (List<Cell> row : rowsOfCells) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
columnsOfCells.get(j).add(row.get(j));
|
||||
}
|
||||
}
|
||||
|
||||
List<Integer> columnIndicesToRemove = new ArrayList<>();
|
||||
int columnIndex = 0;
|
||||
for (List<Cell> col : columnsOfCells) {
|
||||
if (col.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
columnIndicesToRemove.add(columnIndex);
|
||||
}
|
||||
columnIndex++;
|
||||
}
|
||||
columnIndicesToRemove.sort(Collections.reverseOrder());
|
||||
|
||||
// update all rows so that the values of the empty columns get removed
|
||||
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
|
||||
rowsOfCells = new ArrayList<>();
|
||||
for (List<Cell> row : rowsOfCellsBefore) {
|
||||
var updatedRow = new ArrayList<>(row);
|
||||
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
|
||||
rowsOfCells.add(updatedRow);
|
||||
}
|
||||
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
|
||||
private void addCellToRowAndCol(Cell cell, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cellTreeMap.put(cp, cell);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
@ -314,7 +358,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"');
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
@ -328,8 +372,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public String getTextAsHtml() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
@ -25,7 +25,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RulingCleaningService {
|
||||
|
||||
private static final float THRESHOLD = 6;
|
||||
private static final float THRESHOLD_Y = 6;
|
||||
private static final float THRESHOLD_X = 2;
|
||||
|
||||
|
||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||
@ -81,7 +82,7 @@ public class RulingCleaningService {
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) {
|
||||
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD_X) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
@ -108,7 +109,7 @@ public class RulingCleaningService {
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) {
|
||||
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD_Y) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
|
||||
@ -5,7 +5,6 @@ import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
@ -25,55 +24,62 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparis
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
private static final int MAX_TABLE_OUTER_POINT_TOLERANCE = 10;
|
||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
private static final float SPREADSHEET_AREA_TOLERANCE = 0.001f;
|
||||
|
||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
||||
float point1X = DoubleComparisons.round(point1.getX(), 2);
|
||||
float point1Y = DoubleComparisons.round(point1.getY(), 2);
|
||||
float point2X = DoubleComparisons.round(point2.getX(), 2);
|
||||
float point2Y = DoubleComparisons.round(point2.getY(), 2);
|
||||
|
||||
if (arg0X > arg1X) {
|
||||
if (point1X > point2X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
} else if (point1X < point2X) {
|
||||
rv = -1;
|
||||
} else if (arg0Y > arg1Y) {
|
||||
} else if (point1Y > point2Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
} else if (point1Y < point2Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
private static final Comparator<Point2D> Y_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
||||
float point1X = DoubleComparisons.round(point1.getX(), 2);
|
||||
float point1Y = DoubleComparisons.round(point1.getY(), 2);
|
||||
float point2X = DoubleComparisons.round(point2.getX(), 2);
|
||||
float point2Y = DoubleComparisons.round(point2.getY(), 2);
|
||||
|
||||
if (arg0Y > arg1Y) {
|
||||
if (point1Y > point2Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
} else if (point1Y < point2Y) {
|
||||
rv = -1;
|
||||
} else if (arg0X > arg1X) {
|
||||
} else if (point1X > point2X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
} else if (point1X < point2X) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
private static final Comparator<Cell> CELL_SIZE_COMPARATOR = (cell1, cell2) -> {
|
||||
|
||||
public boolean contains(Cell cell, double x, double y, double w, double h) {
|
||||
Double cell1Size = cell1.getHeight() * cell1.getWidth();
|
||||
Double cell2Size = cell2.getHeight() * cell2.getWidth();
|
||||
return cell1Size.compareTo(cell2Size);
|
||||
};
|
||||
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2);
|
||||
}
|
||||
private static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
|
||||
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||
Double rect2Size = rect2.getHeight() * rect2.getWidth();
|
||||
return rect1Size.compareTo(rect2Size);
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
@ -89,22 +95,18 @@ public class TableExtractionService {
|
||||
* @param cleanRulings The lines used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
|
||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||
cells.sort(CELL_SIZE_COMPARATOR);
|
||||
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && contains(cell,
|
||||
textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
|
||||
cell.addTextBlock(textBlock);
|
||||
toBeRemoved.add(textBlock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -114,39 +116,70 @@ public class TableExtractionService {
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the innermost table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> overlappingCells = new ArrayList<>();
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && c.intersects(area)) {
|
||||
overlappingCells.add(c);
|
||||
if (c.hasMinimumSize() && area.contains(c)) {
|
||||
containedCells.add(c);
|
||||
}
|
||||
}
|
||||
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
|
||||
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.count();
|
||||
|
||||
// only create the table if it contains at least the required number of cells with text
|
||||
if (containedCellsWithText >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT) {
|
||||
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
|
||||
cells.removeAll(containedCells);
|
||||
}
|
||||
}
|
||||
|
||||
for (TablePageBlock table : tables) {
|
||||
int position = -1;
|
||||
|
||||
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock textBlock = itty.next();
|
||||
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(textBlock);
|
||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(pageBlock);
|
||||
}
|
||||
}
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
|
||||
var toBeRemoved = table.getCells()
|
||||
.stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
}
|
||||
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
|
||||
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
double x = textBlock.getPdfMinX();
|
||||
double y = textBlock.getPdfMinY();
|
||||
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
|
||||
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2);
|
||||
}
|
||||
|
||||
|
||||
private List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
// Fix for 211.pdf
|
||||
for (Ruling r : horizontalRulingLines) {
|
||||
@ -160,7 +193,7 @@ public class TableExtractionService {
|
||||
List<Cell> cellsFound = new ArrayList<>();
|
||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||
intersectionPointsList.sort(POINT_COMPARATOR);
|
||||
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
|
||||
|
||||
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
||||
Point2D topLeft = intersectionPointsList.get(i);
|
||||
@ -186,13 +219,14 @@ public class TableExtractionService {
|
||||
continue;
|
||||
}
|
||||
for (Point2D yPoint : yPoints) {
|
||||
// is there an horizontal edge b/w topLeft and yPoint ?
|
||||
// is there a horizontal edge b/w topLeft and yPoint ?
|
||||
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
||||
continue;
|
||||
}
|
||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
||||
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
|
||||
intersectionPoints.get(yPoint)[1])) {
|
||||
if (intersectionPoints.containsKey(btmRight)
|
||||
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
|
||||
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
|
||||
cellsFound.add(new Cell(topLeft, btmRight));
|
||||
break outer;
|
||||
}
|
||||
@ -214,7 +248,6 @@ public class TableExtractionService {
|
||||
Set<Point2D> pointSet = new HashSet<>();
|
||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||
int i = 0;
|
||||
|
||||
for (Rectangle cell : cells) {
|
||||
for (Point2D pt : cell.getPoints()) {
|
||||
@ -231,8 +264,9 @@ public class TableExtractionService {
|
||||
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
|
||||
// Y first sort
|
||||
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
|
||||
pointsSortY.sort(POINT_COMPARATOR);
|
||||
pointsSortY.sort(Y_FIRST_POINT_COMPARATOR);
|
||||
|
||||
int i = 0;
|
||||
while (i < pointSet.size()) {
|
||||
float currY = (float) pointsSortY.get(i).getY();
|
||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
|
||||
@ -257,7 +291,8 @@ public class TableExtractionService {
|
||||
Point2D nextVertex;
|
||||
while (!edgesH.isEmpty()) {
|
||||
ArrayList<PolygonVertex> polygon = new ArrayList<>();
|
||||
Point2D first = edgesH.keySet().iterator().next();
|
||||
Point2D first = edgesH.keySet()
|
||||
.iterator().next();
|
||||
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
|
||||
edgesH.remove(first);
|
||||
|
||||
@ -301,7 +336,14 @@ public class TableExtractionService {
|
||||
bottom = (float) Math.max(bottom, pt.point.getY());
|
||||
right = (float) Math.max(right, pt.point.getX());
|
||||
}
|
||||
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
|
||||
|
||||
// do not add polygons with too many outer points as they are unlikely to be tables
|
||||
if (poly.size() <= MAX_TABLE_OUTER_POINT_TOLERANCE) {
|
||||
rectangles.add(new Rectangle(top - SPREADSHEET_AREA_TOLERANCE,
|
||||
left - SPREADSHEET_AREA_TOLERANCE,
|
||||
right - left + 2 * SPREADSHEET_AREA_TOLERANCE,
|
||||
bottom - top + 2 * SPREADSHEET_AREA_TOLERANCE));
|
||||
}
|
||||
}
|
||||
|
||||
return rectangles;
|
||||
|
||||
@ -30,8 +30,6 @@ public class TableMergingUtility {
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
||||
consecutiveTable)) {
|
||||
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
|
||||
|
||||
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
long start = System.currentTimeMillis();
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
Path.of(fileName).getFileName().toFile().toString());
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
@ -60,3 +65,4 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -29,8 +29,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
@ -50,12 +48,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Autowired
|
||||
private RedactManagerClassificationService redactManagerClassificationService;
|
||||
|
||||
@Autowired
|
||||
private CvTableParsingAdapter cvTableParsingAdapter;
|
||||
|
||||
@Autowired
|
||||
private ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
|
||||
@Autowired
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
@ -64,10 +56,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
"document");
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
"document");
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
@ -87,11 +79,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void tablesToHtmlDebugger() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
|
||||
toHtml(document, "/tmp/T5.html");
|
||||
|
||||
}
|
||||
|
||||
@ -109,6 +101,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
|
||||
@ -117,8 +110,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
|
||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||
// We only assert that the table border is not the page border.
|
||||
@ -140,12 +141,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
|
||||
System.out.println("object");
|
||||
}
|
||||
@ -157,11 +158,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
assertThat(table.getRows()
|
||||
.stream()
|
||||
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
|
||||
|
||||
@ -171,15 +183,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -189,15 +223,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -207,19 +263,41 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@Test // Non-sense test
|
||||
@Test
|
||||
public void testDoc56Page170() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
@ -230,8 +308,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 2, 0, 0);
|
||||
validateTable(document, 2, 6, 20, 0, 0);
|
||||
validateTable(document, 3, 7, 31, 0, 0);
|
||||
validateTable(document, 2, 4, 19, 12, 0);
|
||||
validateTable(document, 3, 2, 12, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -265,29 +343,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
validateTable(document, 0, 8, 8, 0, 0);
|
||||
|
||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList(
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
|
||||
validateTable(document, 0, values);
|
||||
|
||||
@ -579,10 +658,109 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
// Table-extraction check for the single-page fixture "T0 TableWithMergedCells.pdf":
// builds the classification document from the classpath resource and expects exactly one table.
@Test
|
||||
public void testT0() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T0 TableWithMergedCells.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
// arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect
validateTable(document, 0, 6, 8, 0, 0);
|
||||
}
|
||||
|
||||
|
||||
// Table-extraction check for the single-page fixture "T1 MultipleNestedTable.pdf":
// builds the classification document from the classpath resource and expects four tables.
@Test
|
||||
public void testT1() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T1 MultipleNestedTable.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
// arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect
validateTable(document, 0, 3, 3, 0, 0);
|
||||
validateTable(document, 1, 3, 5, 2, 0);
|
||||
validateTable(document, 2, 3, 3, 1, 0);
|
||||
validateTable(document, 3, 3, 3, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Table-extraction check for the single-page fixture "T2 MultipleTables.pdf":
// builds the classification document from the classpath resource and expects six tables.
@Test
|
||||
public void testT2() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T2 MultipleTables.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
// arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect
validateTable(document, 0, 5, 5, 0, 0);
|
||||
validateTable(document, 1, 5, 6, 0, 0);
|
||||
validateTable(document, 2, 5, 5, 0, 0);
|
||||
validateTable(document, 3, 5, 5, 0, 0);
|
||||
validateTable(document, 4, 5, 5, 0, 0);
|
||||
validateTable(document, 5, 5, 5, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Table-extraction check for the single-page fixture "T3 S-Meto_Page29.pdf":
// builds the classification document from the classpath resource and expects exactly one table.
@Test
|
||||
public void testT3() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T3 S-Meto_Page29.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
// arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect
validateTable(document, 0, 6, 5, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Table-extraction check for the single-page fixture "T4 138 IDD0000261736_Page16.pdf":
// builds the classification document from the classpath resource and expects exactly one table.
@Test
|
||||
public void testT4() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T4 138 IDD0000261736_Page16.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
// arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect
validateTable(document, 0, 5, 8, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Table-extraction check for the single-page fixture "T5 VV-640252-Page16.pdf":
// builds the classification document from the classpath resource and expects six tables,
// each validated below with colCount 1 and rowCount 1.
@Test
|
||||
public void testT5() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 6);
|
||||
// arguments: tableIndex, colCount, rowCount, emptyCellsCountCorrect, emptyCellsCountIncorrect
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
validateTable(document, 2, 1, 1, 0, 0);
|
||||
validateTable(document, 3, 1, 1, 0, 0);
|
||||
validateTable(document, 4, 1, 1, 0, 0);
|
||||
validateTable(document, 5, 1, 1, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int currentPage = 1;
|
||||
@ -603,9 +781,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||
int emptyCellsFoundFound = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList()
|
||||
.stream()
|
||||
.filter(f -> f.toString().isEmpty())
|
||||
.toList().size();
|
||||
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
row.forEach(r -> System.out.println(r.toString()));
|
||||
@ -620,11 +808,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
|
||||
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
|
||||
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
|
||||
List<Cell> rowsFlattened = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
List<String> valuesFlattened = values.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < valuesFlattened.size(); i++) {
|
||||
Cell cell = rowsFlattened.get(i);
|
||||
@ -637,7 +834,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -26,10 +26,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -57,9 +55,6 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testTableExtraction() {
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
ClassPathResource resource = new ClassPathResource("files");
|
||||
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
@ -67,8 +62,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
.map(Path::toString)
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < pdfFileNames.size(); i++) {
|
||||
writeJsons(Path.of(pdfFileNames.get(i)));
|
||||
for (String pdfFileName : pdfFileNames) {
|
||||
writeJsons(Path.of(pdfFileName));
|
||||
}
|
||||
}
|
||||
|
||||
@ -88,13 +83,13 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
filename.toFile().toString()));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString();
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
|
||||
String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString();
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
||||
pdDocument.save(tmpFileNameBefore);
|
||||
}
|
||||
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString();
|
||||
String tmpFileNameAfter = "/tmp/after." + filename.getFileName().toString();
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
|
||||
pdDocument.save(tmpFileNameAfter);
|
||||
@ -105,9 +100,9 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
|
||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) {
|
||||
|
||||
List listStructure1 = structure1.streamAllEntries()
|
||||
List<Table> listStructure1 = structure1.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
@ -117,7 +112,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
})
|
||||
.toList();
|
||||
|
||||
List listStructure2 = structure2.streamAllEntries()
|
||||
List<Table> listStructure2 = structure2.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
@ -128,8 +123,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < listStructure1.size(); i++) {
|
||||
Table tableNode1 = (Table) listStructure1.get(i);
|
||||
Table tableNode2 = (Table) listStructure2.get(i);
|
||||
Table tableNode1 = listStructure1.get(i);
|
||||
Table tableNode2 = listStructure2.get(i);
|
||||
if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user