RED-8550: Faulty table recognition and text duplication leads to huge sections

Maverick Studer 2024-02-21 13:54:30 +01:00
parent 0979a267d4
commit 1d64028158
15 changed files with 551 additions and 253 deletions

View File

@ -97,7 +97,8 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
@ -105,20 +106,24 @@ public class LayoutParsingPipeline {
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier().toString());
log.info("Building document graph for {}", layoutParsingRequest.identifier());
@ -151,25 +156,25 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.build();
}
@ -179,9 +184,9 @@ public class LayoutParsingPipeline {
AtomicReference<Document> documentReference = new AtomicReference<>();
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> {
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument));
});
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
.contextualName("build-document-graph")
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
return documentReference.get();
}
@ -190,14 +195,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
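The repeated "get(...) == null ? 0 : get(...)" lookups in buildSemanticNodeCountMessage behave like Map.getOrDefault with a default of 0L. A minimal equivalent sketch, for illustration only and not part of this commit:

// same behaviour as the ternary null checks above; semanticNodeCounts is the
// Map<NodeType, Long> parameter of buildSemanticNodeCountMessage
long sectionCount = semanticNodeCounts.getOrDefault(NodeType.SECTION, 0L);
long headlineCount = semanticNodeCounts.getOrDefault(NodeType.HEADLINE, 0L);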
@ -337,9 +342,7 @@ public class LayoutParsingPipeline {
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) {
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
}
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());

View File

@ -76,4 +76,14 @@ public class Cell extends Rectangle {
return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE;
}
public boolean nearlyIntersects(Cell other) {
if (this.getHeight() <= 0 || other.getHeight() <= 0) {
return false;
}
double x0 = this.getX() + 2;
double y0 = this.getY() + 2;
return (other.x + other.width > x0 && other.y + other.height > y0 && other.x < x0 + this.getWidth() - 2 && other.y < y0 + this.getHeight() - 2);
}
}
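The new nearlyIntersects check shrinks the receiving cell by a 2 unit margin before the overlap test, so cells that merely share a ruling do not count as intersecting. A short usage sketch with hypothetical coordinates, assuming the two-corner Cell constructor used elsewhere in this diff normalises the corners:

Cell gridCell = new Cell(new Point2D.Float(0f, 0f), new Point2D.Float(100f, 20f));
Cell touching = new Cell(new Point2D.Float(100f, 0f), new Point2D.Float(200f, 20f));
Cell overlapping = new Cell(new Point2D.Float(50f, 0f), new Point2D.Float(150f, 20f));
gridCell.nearlyIntersects(touching); // false: the cells only share the ruling at x = 100
gridCell.nearlyIntersects(overlapping); // true: substantial overlap, so its text blocks may be copied over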

View File

@ -1,14 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
@ -21,7 +19,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TablePageBlock extends AbstractPageBlock {
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
private final int rotation;
@Getter
@ -30,10 +28,14 @@ public class TablePageBlock extends AbstractPageBlock {
private int unrotatedRowCount;
private int unrotatedColCount;
private List<List<Cell>> rows;
@Getter
@Setter
private List<Cell> cells;
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
this.cells = cells;
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
@ -50,6 +52,7 @@ public class TablePageBlock extends AbstractPageBlock {
return getColCount() == 0 || getRowCount() == 0;
}
public List<List<Cell>> getRows() {
if (rows == null) {
@ -80,7 +83,10 @@ public class TablePageBlock extends AbstractPageBlock {
public int getColCount() {
return getRows().stream().mapToInt(List::size).max().orElse(0);
return getRows().stream()
.mapToInt(List::size)
.max()
.orElse(0);
}
@ -120,7 +126,8 @@ public class TablePageBlock extends AbstractPageBlock {
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i).get(colIndex));
cellsToTheTop.add(rows.get(i)
.get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
@ -135,7 +142,8 @@ public class TablePageBlock extends AbstractPageBlock {
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
.get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
@ -151,7 +159,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
Cell cell = cells.get(new CellPosition(j, i));
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -162,7 +170,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedRowCount; j++) { // cols
Cell cell = cells.get(new CellPosition(j, i));
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -173,7 +181,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < unrotatedRowCount; i++) {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedColCount; j++) {
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
}
@ -187,17 +195,6 @@ public class TablePageBlock extends AbstractPageBlock {
}
private void add(Cell chunk, int row, int col) {
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
CellPosition cp = new CellPosition(row, col);
cells.put(cp, chunk);
}
private void addCells(List<Cell> cells) {
if (cells.isEmpty()) {
@ -206,11 +203,12 @@ public class TablePageBlock extends AbstractPageBlock {
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
List<List<Cell>> rowsOfCells = calculateStructure(cells);
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
for (int i = 0; i < rowsOfCells.size(); i++) {
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
add(rowsOfCells.get(i).get(j), i, j);
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
.get(j), i, j);
}
}
@ -221,29 +219,36 @@ public class TablePageBlock extends AbstractPageBlock {
* Calculates the structure of the table. For spanning rows and columns, multiple cells with the same values will be inserted.
*
* @param cells The found cells
* @return TablePageBlock Structure
* @return the table structure as a matrix of rows of cells
*/
private List<List<Cell>> calculateStructure(List<Cell> cells) {
List<List<Cell>> matrix = new ArrayList<>();
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
if (cells.isEmpty()) {
return matrix;
return new ArrayList<>();
}
Set<Float> uniqueX = new HashSet<>();
Set<Float> uniqueY = new HashSet<>();
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
});
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
});
var sortedUniqueX = uniqueX.stream().sorted().toList();
var sortedUniqueY = uniqueY.stream().sorted().toList();
var sortedUniqueX = uniqueX.stream()
.sorted()
.toList();
var sortedUniqueY = uniqueY.stream()
.sorted()
.toList();
List<List<Cell>> rowsOfCells = new ArrayList<>();
Float prevY = null;
for (Float y : sortedUniqueY) {
List<Cell> row = new ArrayList<>();
@ -254,42 +259,81 @@ public class TablePageBlock extends AbstractPageBlock {
if (prevY != null && prevX != null) {
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst();
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
cells.stream()
.filter(cell::nearlyIntersects)
.forEach(intersectingCell -> cell.getTextBlocks().addAll(intersectingCell.getTextBlocks()));
row.add(cell);
}
}
prevX = x;
}
if (prevY != null && prevX != null && !row.isEmpty()) {
matrix.add(row);
// exclude empty rows and rows whose cells all have empty text blocks
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
rowsOfCells.add(row);
}
prevY = y;
}
Collections.reverse(matrix);
Collections.reverse(rowsOfCells);
return matrix;
}
public boolean intersects(Cell cell1, Cell cell2) {
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
return false;
// now cells that are part of a column without any text blocks are removed
// this is done by first computing the transposed matrix, which contains all columns of cells
// then the column indices that have to be removed are determined
List<List<Cell>> columnsOfCells = new ArrayList<>();
int maxRowLength = rowsOfCells.stream()
.map(List::size)
.max(java.util.Comparator.naturalOrder())
.orElse(0);
for (int i = 0; i < maxRowLength; i++) {
columnsOfCells.add(new ArrayList<>());
}
double x0 = cell1.getX() + 2;
double y0 = cell1.getY() + 2;
return (cell2.x + cell2.width > x0 &&
cell2.y + cell2.height > y0 &&
cell2.x < x0 + cell1.getWidth() -2 &&
cell2.y < y0 + cell1.getHeight() -2);
for (List<Cell> row : rowsOfCells) {
for (int j = 0; j < row.size(); j++) {
columnsOfCells.get(j).add(row.get(j));
}
}
List<Integer> columnIndicesToRemove = new ArrayList<>();
int columnIndex = 0;
for (List<Cell> col : columnsOfCells) {
if (col.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
columnIndicesToRemove.add(columnIndex);
}
columnIndex++;
}
columnIndicesToRemove.sort(Collections.reverseOrder());
// update all rows so that the values of the empty columns get removed
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
rowsOfCells = new ArrayList<>();
for (List<Cell> row : rowsOfCellsBefore) {
var updatedRow = new ArrayList<>(row);
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
rowsOfCells.add(updatedRow);
}
return rowsOfCells;
}
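calculateTableStructure now rebuilds the grid from the unique X and Y edge coordinates and then prunes rows and columns whose cells carry no text, which is what keeps faulty rulings from blowing a table up. A self-contained sketch of the column-pruning idea on plain strings, an illustration of the approach rather than the exact implementation above:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

class EmptyColumnRemovalSketch {

    static List<List<String>> dropEmptyColumns(List<List<String>> rows) {
        int maxRowLength = rows.stream().mapToInt(List::size).max().orElse(0);
        // transpose: column j collects the j-th entry of every row that has one
        List<List<String>> columns = new ArrayList<>();
        for (int j = 0; j < maxRowLength; j++) {
            columns.add(new ArrayList<>());
        }
        for (List<String> row : rows) {
            for (int j = 0; j < row.size(); j++) {
                columns.get(j).add(row.get(j));
            }
        }
        // indices of columns whose entries are all empty, removed back to front so
        // earlier indices stay valid
        List<Integer> emptyColumnIndices = new ArrayList<>();
        for (int j = 0; j < columns.size(); j++) {
            if (columns.get(j).stream().allMatch(String::isEmpty)) {
                emptyColumnIndices.add(j);
            }
        }
        emptyColumnIndices.sort(Collections.reverseOrder());
        List<List<String>> cleaned = new ArrayList<>();
        for (List<String> row : rows) {
            List<String> updated = new ArrayList<>(row);
            emptyColumnIndices.forEach(j -> {
                if (j < updated.size()) {
                    updated.remove((int) j);
                }
            });
            cleaned.add(updated);
        }
        return cleaned;
    }

    public static void main(String[] args) {
        List<List<String>> rows = List.of(List.of("a", "", "b"), List.of("c", "", "d"));
        System.out.println(dropEmptyColumns(rows)); // prints [[a, b], [c, d]]
    }
}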
private void addCellToRowAndCol(Cell cell, int row, int col) {
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
CellPosition cp = new CellPosition(row, col);
cellTreeMap.put(cp, cell);
}
@Override
public String getText() {
@ -314,7 +358,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (!first) {
sb.append("\n");
}
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"');
first = false;
}
}
@ -328,8 +372,6 @@ public class TablePageBlock extends AbstractPageBlock {
}
public String getTextAsHtml() {
StringBuilder sb = new StringBuilder();

View File

@ -25,7 +25,8 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RulingCleaningService {
private static final float THRESHOLD = 6;
private static final float THRESHOLD_Y = 6;
private static final float THRESHOLD_X = 2;
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
@ -81,7 +82,7 @@ public class RulingCleaningService {
for (Point2D p : points.subList(1, points.size() - 1)) {
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) {
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD_X) {
groupedPoints.get(groupedPoints.size() - 1).add(p);
} else {
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
@ -108,7 +109,7 @@ public class RulingCleaningService {
for (Point2D p : points.subList(1, points.size() - 1)) {
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) {
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD_Y) {
groupedPoints.get(groupedPoints.size() - 1).add(p);
} else {
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));

View File

@ -5,7 +5,6 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -25,55 +24,62 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparis
@Service
public class TableExtractionService {
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
private static final int MAX_TABLE_OUTER_POINT_TOLERANCE = 10;
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
private static final float SPREADSHEET_AREA_TOLERANCE = 0.001f;
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (point1, point2) -> {
int rv = 0;
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
float point1X = DoubleComparisons.round(point1.getX(), 2);
float point1Y = DoubleComparisons.round(point1.getY(), 2);
float point2X = DoubleComparisons.round(point2.getX(), 2);
float point2Y = DoubleComparisons.round(point2.getY(), 2);
if (arg0X > arg1X) {
if (point1X > point2X) {
rv = 1;
} else if (arg0X < arg1X) {
} else if (point1X < point2X) {
rv = -1;
} else if (arg0Y > arg1Y) {
} else if (point1Y > point2Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
} else if (point1Y < point2Y) {
rv = -1;
}
return rv;
};
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
private static final Comparator<Point2D> Y_FIRST_POINT_COMPARATOR = (point1, point2) -> {
int rv = 0;
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
float point1X = DoubleComparisons.round(point1.getX(), 2);
float point1Y = DoubleComparisons.round(point1.getY(), 2);
float point2X = DoubleComparisons.round(point2.getX(), 2);
float point2Y = DoubleComparisons.round(point2.getY(), 2);
if (arg0Y > arg1Y) {
if (point1Y > point2Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
} else if (point1Y < point2Y) {
rv = -1;
} else if (arg0X > arg1X) {
} else if (point1X > point2X) {
rv = 1;
} else if (arg0X < arg1X) {
} else if (point1X < point2X) {
rv = -1;
}
return rv;
};
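Both comparators order points by their coordinates rounded to two decimals, X-major and Y-major respectively. An equivalent formulation with Comparator.comparingDouble, shown for illustration only and relying on the same DoubleComparisons.round helper used above:

private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = Comparator
        .<Point2D>comparingDouble(point -> DoubleComparisons.round(point.getX(), 2))
        .thenComparingDouble(point -> DoubleComparisons.round(point.getY(), 2));

private static final Comparator<Point2D> Y_FIRST_POINT_COMPARATOR = Comparator
        .<Point2D>comparingDouble(point -> DoubleComparisons.round(point.getY(), 2))
        .thenComparingDouble(point -> DoubleComparisons.round(point.getX(), 2));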
private static final Comparator<Cell> CELL_SIZE_COMPARATOR = (cell1, cell2) -> {
public boolean contains(Cell cell, double x, double y, double w, double h) {
Double cell1Size = cell1.getHeight() * cell1.getWidth();
Double cell2Size = cell2.getHeight() * cell2.getWidth();
return cell1Size.compareTo(cell2Size);
};
if (cell.isEmpty() || w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2);
}
private static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
Double rect1Size = rect1.getHeight() * rect1.getWidth();
Double rect2Size = rect2.getHeight() * rect2.getWidth();
return rect1Size.compareTo(rect2Size);
};
/**
@ -89,22 +95,18 @@ public class TableExtractionService {
* @param cleanRulings The lines used to build the table.
* @param page Page object that contains textblocks and statistics.
*/
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<TextPageBlock> toBeRemoved = new ArrayList<>();
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cell that contains them
cells.sort(CELL_SIZE_COMPARATOR);
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) {
if (cell.hasMinimumSize() && contains(cell,
textBlock.getPdfMinX(),
textBlock.getPdfMinY(),
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
cell.addTextBlock(textBlock);
toBeRemoved.add(textBlock);
break;
}
}
@ -114,39 +116,70 @@ public class TableExtractionService {
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
// this way no cell duplication occurs when tables are nested inside other tables and only the innermost table contains the cells
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) {
List<Cell> overlappingCells = new ArrayList<>();
List<Cell> containedCells = new ArrayList<>();
for (Cell c : cells) {
if (c.hasMinimumSize() && c.intersects(area)) {
overlappingCells.add(c);
if (c.hasMinimumSize() && area.contains(c)) {
containedCells.add(c);
}
}
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
var containedCellsWithText = containedCells.stream()
.filter(cell -> !cell.getTextBlocks().isEmpty())
.count();
// only build the table if it contains at least MAX_TABLE_CONTAINED_CELLS_WITH_TEXT cells with text
if (containedCellsWithText >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT) {
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
cells.removeAll(containedCells);
}
}
for (TablePageBlock table : tables) {
int position = -1;
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
AbstractPageBlock textBlock = itty.next();
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
position = page.getTextBlocks().indexOf(textBlock);
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
position = page.getTextBlocks().indexOf(pageBlock);
}
}
if (position != -1) {
page.getTextBlocks().add(position, table);
var toBeRemoved = table.getCells()
.stream()
.map(Cell::getTextBlocks)
.flatMap(List::stream)
.toList();
// remove text blocks from the page that were also added with the table (from its contained cells)
page.getTextBlocks().removeAll(toBeRemoved);
}
}
page.getTextBlocks().removeAll(toBeRemoved);
}
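The reworked extractTables processes spreadsheet areas smallest first, hands each cell to the innermost area that fully contains it, and afterwards removes the claimed cells and their text blocks from the page, which is what stops the text duplication named in the ticket. A self-contained sketch of that assignment idea on plain java.awt rectangles, simplified and for illustration only:

import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

class InnermostAssignmentSketch {

    // assigns every cell to the smallest region that fully contains it; a claimed cell
    // is removed from the pool so an enclosing region cannot claim it a second time
    static List<List<Rectangle2D>> assign(List<Rectangle2D> regions, List<Rectangle2D> cells) {
        List<Rectangle2D> pool = new ArrayList<>(cells);
        List<Rectangle2D> sortedRegions = new ArrayList<>(regions);
        sortedRegions.sort(Comparator.comparingDouble((Rectangle2D r) -> r.getWidth() * r.getHeight()));
        List<List<Rectangle2D>> result = new ArrayList<>();
        for (Rectangle2D region : sortedRegions) {
            List<Rectangle2D> claimed = pool.stream().filter(region::contains).toList();
            pool.removeAll(claimed);
            result.add(claimed);
        }
        return result;
    }

    public static void main(String[] args) {
        Rectangle2D outer = new Rectangle2D.Double(0, 0, 100, 100);
        Rectangle2D inner = new Rectangle2D.Double(10, 10, 40, 40);
        Rectangle2D cell = new Rectangle2D.Double(15, 15, 10, 10);
        // the cell ends up in the inner region only, although the outer one also contains it
        System.out.println(assign(List.of(outer, inner), List.of(cell)));
    }
}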
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
double x = textBlock.getPdfMinX();
double y = textBlock.getPdfMinY();
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
if (cell.isEmpty() || w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2);
}
private List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
// Fix for 211.pdf
for (Ruling r : horizontalRulingLines) {
@ -160,7 +193,7 @@ public class TableExtractionService {
List<Cell> cellsFound = new ArrayList<>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
intersectionPointsList.sort(POINT_COMPARATOR);
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
@ -186,13 +219,14 @@ public class TableExtractionService {
continue;
}
for (Point2D yPoint : yPoints) {
// is there an horizontal edge b/w topLeft and yPoint ?
// is there a horizontal edge b/w topLeft and yPoint ?
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
continue;
}
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
intersectionPoints.get(yPoint)[1])) {
if (intersectionPoints.containsKey(btmRight)
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
cellsFound.add(new Cell(topLeft, btmRight));
break outer;
}
@ -214,7 +248,6 @@ public class TableExtractionService {
Set<Point2D> pointSet = new HashSet<>();
Map<Point2D, Point2D> edgesH = new HashMap<>();
Map<Point2D, Point2D> edgesV = new HashMap<>();
int i = 0;
for (Rectangle cell : cells) {
for (Point2D pt : cell.getPoints()) {
@ -231,8 +264,9 @@ public class TableExtractionService {
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
// Y first sort
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
pointsSortY.sort(POINT_COMPARATOR);
pointsSortY.sort(Y_FIRST_POINT_COMPARATOR);
int i = 0;
while (i < pointSet.size()) {
float currY = (float) pointsSortY.get(i).getY();
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
@ -257,7 +291,8 @@ public class TableExtractionService {
Point2D nextVertex;
while (!edgesH.isEmpty()) {
ArrayList<PolygonVertex> polygon = new ArrayList<>();
Point2D first = edgesH.keySet().iterator().next();
Point2D first = edgesH.keySet()
.iterator().next();
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
edgesH.remove(first);
@ -301,7 +336,14 @@ public class TableExtractionService {
bottom = (float) Math.max(bottom, pt.point.getY());
right = (float) Math.max(right, pt.point.getX());
}
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
// do not add polygons with too many outer points as they are unlikely to be tables
if (poly.size() <= MAX_TABLE_OUTER_POINT_TOLERANCE) {
rectangles.add(new Rectangle(top - SPREADSHEET_AREA_TOLERANCE,
left - SPREADSHEET_AREA_TOLERANCE,
right - left + 2 * SPREADSHEET_AREA_TOLERANCE,
bottom - top + 2 * SPREADSHEET_AREA_TOLERANCE));
}
}
return rectangles;

View File

@ -30,8 +30,6 @@ public class TableMergingUtility {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
consecutiveTable)) {
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
} else {
break;
}
}
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();

View File

@ -36,9 +36,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
long start = System.currentTimeMillis();
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@Test
@Disabled
@SneakyThrows
@ -52,7 +53,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString());
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
new VisualLayoutParsingResponse(), Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
@ -61,3 +66,4 @@ public class ViewerDocumentTest extends BuildDocumentTest {
}
}

View File

@ -29,8 +29,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
@ -51,12 +49,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Autowired
private RedactManagerClassificationService redactManagerClassificationService;
@Autowired
private CvTableParsingAdapter cvTableParsingAdapter;
@Autowired
private ImageServiceResponseAdapter imageServiceResponseAdapter;
@Autowired
private SectionsBuilderService sectionsBuilderService;
@ -65,11 +57,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
new VisualLayoutParsingResponse(),
"document");
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
new VisualLayoutParsingResponse(),
"document");
redactManagerClassificationService.classifyDocument(classificationDocument);
@ -89,11 +81,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Test
public void tablesToHtmlDebugger() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
toHtml(document, "/tmp/T5.html");
}
@ -111,6 +103,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
@ -119,8 +112,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
// Quality of the table parsing is not good because the file was rotated during scanning.
// We only assert that the table border is not the page border.
@ -142,12 +143,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
imageServiceResponse.getData()
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
System.out.println("object");
}
@ -159,11 +160,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
assertThat(table.getRows()
.stream()
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
}
@ -173,15 +185,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
}
@ -191,15 +225,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
}
@ -209,19 +265,41 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
}
@Test // Non-sense test
@Test
public void testDoc56Page170() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
@ -232,8 +310,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 2, 2, 0, 0);
validateTable(document, 2, 6, 20, 0, 0);
validateTable(document, 3, 7, 31, 0, 0);
validateTable(document, 2, 4, 19, 12, 0);
validateTable(document, 3, 2, 12, 0, 0);
}
@ -267,29 +345,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 8, 8, 0, 0);
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList(
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
validateTable(document, 0, values);
@ -581,10 +660,109 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Test
public void testT0() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T0 TableWithMergedCells.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 1);
validateTable(document, 0, 6, 8, 0, 0);
}
@Test
public void testT1() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T1 MultipleNestedTable.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 4);
validateTable(document, 0, 3, 3, 0, 0);
validateTable(document, 1, 3, 5, 2, 0);
validateTable(document, 2, 3, 3, 1, 0);
validateTable(document, 3, 3, 3, 0, 0);
}
@Test
public void testT2() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T2 MultipleTables.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 6);
validateTable(document, 0, 5, 5, 0, 0);
validateTable(document, 1, 5, 6, 0, 0);
validateTable(document, 2, 5, 5, 0, 0);
validateTable(document, 3, 5, 5, 0, 0);
validateTable(document, 4, 5, 5, 0, 0);
validateTable(document, 5, 5, 5, 0, 0);
}
@Test
public void testT3() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T3 S-Meto_Page29.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 1);
validateTable(document, 0, 6, 5, 0, 0);
}
@Test
public void testT4() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T4 138 IDD0000261736_Page16.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 1);
validateTable(document, 0, 5, 8, 1, 0);
}
@Test
public void testT5() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 6);
validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 1, 1, 0, 0);
validateTable(document, 2, 1, 1, 0, 0);
validateTable(document, 3, 1, 1, 0, 0);
validateTable(document, 4, 1, 1, 0, 0);
validateTable(document, 5, 1, 1, 0, 0);
}
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
StringBuilder sb = new StringBuilder();
int currentPage = 1;
@ -605,9 +783,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
int emptyCellsFoundFound = rows.stream()
.flatMap(List::stream)
.toList()
.stream()
.filter(f -> f.toString().isEmpty())
.toList().size();
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
@ -622,11 +810,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
List<Cell> rowsFlattened = rows.stream()
.flatMap(List::stream)
.toList();
List<String> valuesFlattened = values.stream()
.flatMap(List::stream)
.toList();
for (int i = 0; i < valuesFlattened.size(); i++) {
Cell cell = rowsFlattened.get(i);
@ -639,7 +836,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTableSize(ClassificationDocument document, int tableSize) {
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList().size()).isEqualTo(tableSize);
}

View File

@ -27,10 +27,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import lombok.SneakyThrows;
@ -58,9 +56,6 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
public void testTableExtraction() {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
ClassPathResource resource = new ClassPathResource("files");
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
@ -68,8 +63,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.map(Path::toString)
.toList();
for (int i = 0; i < pdfFileNames.size(); i++) {
writeJsons(Path.of(pdfFileNames.get(i)));
for (String pdfFileName : pdfFileNames) {
writeJsons(Path.of(pdfFileName));
}
}
@ -91,13 +86,13 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
filename.toFile().toString()));
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString();
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString();
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
pdDocument.save(tmpFileNameBefore);
}
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString();
String tmpFileNameAfter = "/tmp/after." + filename.getFileName().toString();
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
pdDocument.save(tmpFileNameAfter);
@ -108,9 +103,9 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) {
List listStructure1 = structure1.streamAllEntries()
List<Table> listStructure1 = structure1.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
@ -120,7 +115,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
})
.toList();
List listStructure2 = structure2.streamAllEntries()
List<Table> listStructure2 = structure2.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
@ -131,8 +126,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.toList();
for (int i = 0; i < listStructure1.size(); i++) {
Table tableNode1 = (Table) listStructure1.get(i);
Table tableNode2 = (Table) listStructure2.get(i);
Table tableNode1 = listStructure1.get(i);
Table tableNode2 = listStructure2.get(i);
if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
return false;
}