diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 5c8c6fa..25cf3f8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -97,7 +97,8 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) + .orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { @@ -105,20 +106,24 @@ public class LayoutParsingPipeline { } ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId().isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); + if (layoutParsingRequest.imagesFileStorageId() + .isPresent()) { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() + .get()); } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId().isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); + if (layoutParsingRequest.tablesFileStorageId() + .isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() + .get()); } ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), - originFile, - imageServiceResponse, - tableServiceResponse, - visualLayoutParsingResponse, + originFile, + imageServiceResponse, + tableServiceResponse, + visualLayoutParsingResponse, layoutParsingRequest.identifier().toString()); log.info("Building document graph for {}", layoutParsingRequest.identifier()); @@ -151,25 +156,25 @@ public class LayoutParsingPipeline { .numberOfPages(documentGraph.getNumberOfPages()) .duration(System.currentTimeMillis() - start) .message(format(""" - Layout parsing has finished in %.02f s. - identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + Layout parsing has finished in %.02f s. + identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } @@ -179,9 +184,9 @@ public class LayoutParsingPipeline { AtomicReference documentReference = new AtomicReference<>(); - Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> { - documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)); - }); + Observation.createNotStarted("LayoutParsingPipeline", observationRegistry) + .contextualName("build-document-graph") + .observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument))); return documentReference.get(); } @@ -190,14 +195,14 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -337,9 +342,7 @@ public class LayoutParsingPipeline { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { - if (!classificationPage.isLandscape()) { - document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); - } + document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 2f324a4..7dfce70 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -76,4 +76,14 @@ public class Cell extends Rectangle { return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE; } + public boolean nearlyIntersects(Cell other) { + + if (this.getHeight() <= 0 || other.getHeight() <= 0) { + return false; + } + double x0 = this.getX() + 2; + double y0 = this.getY() + 2; + return (other.x + other.width > x0 && other.y + other.height > y0 && other.x < x0 + this.getWidth() - 2 && other.y < y0 + this.getHeight() - 2); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 1295424..8e91dae 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -1,14 +1,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; -import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeMap; -import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; @@ -21,7 +19,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class TablePageBlock extends AbstractPageBlock { - private final TreeMap cells = new TreeMap<>(); + private final TreeMap cellTreeMap = new TreeMap<>(); private final int rotation; @Getter @@ -30,10 +28,14 @@ public class TablePageBlock extends AbstractPageBlock { private int unrotatedRowCount; private int unrotatedColCount; private List> rows; + @Getter + @Setter + private List cells; public TablePageBlock(List cells, Rectangle area, int rotation) { + this.cells = cells; addCells(cells); minX = area.getLeft(); minY = area.getBottom(); @@ -50,6 +52,7 @@ public class TablePageBlock extends AbstractPageBlock { return getColCount() == 0 || getRowCount() == 0; } + public List> getRows() { if (rows == null) { @@ -80,7 +83,10 @@ public class TablePageBlock extends AbstractPageBlock { public int getColCount() { - return getRows().stream().mapToInt(List::size).max().orElse(0); + return getRows().stream() + .mapToInt(List::size) + .max() + .orElse(0); } @@ -120,7 +126,8 @@ public class TablePageBlock extends AbstractPageBlock { List cellsToTheTop = new ArrayList<>(); for (int i = 0; i < rowIndex; i++) { try { - cellsToTheTop.add(rows.get(i).get(colIndex)); + cellsToTheTop.add(rows.get(i) + .get(colIndex)); } catch (IndexOutOfBoundsException e) { log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex); } @@ -135,7 +142,8 @@ public class TablePageBlock extends AbstractPageBlock { if (lastHeaderCell != null) { cell.getHeaderCells().add(lastHeaderCell); } - if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) { + if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks() + .get(0).getMostPopularWordStyle().equals("bold")) { cell.setHeaderCell(true); } } @@ -151,7 +159,7 @@ public class TablePageBlock extends AbstractPageBlock { for (int i = 0; i < unrotatedColCount; i++) { // rows List lastRow = new ArrayList<>(); for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols - Cell cell = cells.get(new CellPosition(j, i)); + Cell cell = cellTreeMap.get(new CellPosition(j, i)); if (cell != null) { lastRow.add(cell); } @@ -162,7 +170,7 @@ public class TablePageBlock extends AbstractPageBlock { for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows List lastRow = new ArrayList<>(); for (int j = 0; j < unrotatedRowCount; j++) { // cols - Cell cell = cells.get(new CellPosition(j, i)); + Cell cell = cellTreeMap.get(new CellPosition(j, i)); if (cell != null) { lastRow.add(cell); } @@ -173,7 +181,7 @@ public class TablePageBlock extends AbstractPageBlock { for (int i = 0; i < unrotatedRowCount; i++) { List lastRow = new ArrayList<>(); for (int j = 0; j < unrotatedColCount; j++) { - Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault() + Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault() if (cell != null) { lastRow.add(cell); } @@ -187,17 +195,6 @@ public class TablePageBlock extends AbstractPageBlock { } - private void add(Cell chunk, int row, int col) { - - unrotatedRowCount = Math.max(unrotatedRowCount, row + 1); - unrotatedColCount = Math.max(unrotatedColCount, col + 1); - - CellPosition cp = new CellPosition(row, col); - cells.put(cp, chunk); - - } - - private void addCells(List cells) { if (cells.isEmpty()) { @@ -206,11 +203,12 @@ public class TablePageBlock extends AbstractPageBlock { cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1); - List> rowsOfCells = calculateStructure(cells); + List> rowsOfCellsMatrix = calculateTableStructure(cells); - for (int i = 0; i < rowsOfCells.size(); i++) { - for (int j = 0; j < rowsOfCells.get(i).size(); j++) { - add(rowsOfCells.get(i).get(j), i, j); + for (int i = 0; i < rowsOfCellsMatrix.size(); i++) { + for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) { + addCellToRowAndCol(rowsOfCellsMatrix.get(i) + .get(j), i, j); } } @@ -221,29 +219,36 @@ public class TablePageBlock extends AbstractPageBlock { * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. * * @param cells The found cells - * @return TablePageBlock Structure + * @return TablePageBlock Structure as a rows of cells matrix */ - private List> calculateStructure(List cells) { - - List> matrix = new ArrayList<>(); + private List> calculateTableStructure(List cells) { if (cells.isEmpty()) { - return matrix; + return new ArrayList<>(); } Set uniqueX = new HashSet<>(); Set uniqueY = new HashSet<>(); - cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> { - uniqueX.add(c.getLeft()); - uniqueX.add(c.getRight()); - uniqueY.add(c.getBottom()); - uniqueY.add(c.getTop()); - }); + cells.stream() + .filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3) + .forEach(c -> { + uniqueX.add(c.getLeft()); + uniqueX.add(c.getRight()); + uniqueY.add(c.getBottom()); + uniqueY.add(c.getTop()); + }); - var sortedUniqueX = uniqueX.stream().sorted().toList(); - var sortedUniqueY = uniqueY.stream().sorted().toList(); + var sortedUniqueX = uniqueX.stream() + .sorted() + .toList(); + var sortedUniqueY = uniqueY.stream() + .sorted() + .toList(); + + List> rowsOfCells = new ArrayList<>(); Float prevY = null; + for (Float y : sortedUniqueY) { List row = new ArrayList<>(); @@ -254,42 +259,81 @@ public class TablePageBlock extends AbstractPageBlock { if (prevY != null && prevX != null) { var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); - var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst(); - - intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks())); if (cell.hasMinimumSize()) { + + cells.stream() + .filter(cell::nearlyIntersects) + .forEach(intersectingCell -> cell.getTextBlocks().addAll(intersectingCell.getTextBlocks())); + row.add(cell); } } prevX = x; } - if (prevY != null && prevX != null && !row.isEmpty()) { - matrix.add(row); + // exclude empty rows and rows where all text blocks are empty + if (prevY != null && prevX != null && !row.isEmpty() && !row.stream() + .allMatch(cell -> cell.getTextBlocks().isEmpty())) { + + rowsOfCells.add(row); } prevY = y; } - Collections.reverse(matrix); + Collections.reverse(rowsOfCells); - return matrix; - } - - - - public boolean intersects(Cell cell1, Cell cell2) { - if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) { - return false; + // now cells are removed which are part of a column without any text blocks + // this is done by first computing the inverse matrix which contains call columns of cells + // then the column indices that have to be removed are determined + List> columnsOfCells = new ArrayList<>(); + int maxRowLength = rowsOfCells.stream() + .map(List::size) + .max(java.util.Comparator.naturalOrder()) + .orElse(0); + for (int i = 0; i < maxRowLength; i++) { + columnsOfCells.add(new ArrayList<>()); } - double x0 = cell1.getX() + 2; - double y0 = cell1.getY() + 2; - return (cell2.x + cell2.width > x0 && - cell2.y + cell2.height > y0 && - cell2.x < x0 + cell1.getWidth() -2 && - cell2.y < y0 + cell1.getHeight() -2); + + for (List row : rowsOfCells) { + for (int j = 0; j < row.size(); j++) { + columnsOfCells.get(j).add(row.get(j)); + } + } + + List columnIndicesToRemove = new ArrayList<>(); + int columnIndex = 0; + for (List col : columnsOfCells) { + if (col.stream() + .allMatch(cell -> cell.getTextBlocks().isEmpty())) { + columnIndicesToRemove.add(columnIndex); + } + columnIndex++; + } + columnIndicesToRemove.sort(Collections.reverseOrder()); + + // update all rows so that the values of the empty columns get removed + var rowsOfCellsBefore = new ArrayList<>(rowsOfCells); + rowsOfCells = new ArrayList<>(); + for (List row : rowsOfCellsBefore) { + var updatedRow = new ArrayList<>(row); + columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove))); + rowsOfCells.add(updatedRow); + } + + return rowsOfCells; } + private void addCellToRowAndCol(Cell cell, int row, int col) { + + unrotatedRowCount = Math.max(unrotatedRowCount, row + 1); + unrotatedColCount = Math.max(unrotatedColCount, col + 1); + + CellPosition cp = new CellPosition(row, col); + cellTreeMap.put(cp, cell); + + } + @Override public String getText() { @@ -314,7 +358,7 @@ public class TablePageBlock extends AbstractPageBlock { if (!first) { sb.append("\n"); } - sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"'); + sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"'); first = false; } } @@ -328,8 +372,6 @@ public class TablePageBlock extends AbstractPageBlock { } - - public String getTextAsHtml() { StringBuilder sb = new StringBuilder(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index b24157c..2a18dc0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -25,7 +25,8 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RulingCleaningService { - private static final float THRESHOLD = 6; + private static final float THRESHOLD_Y = 6; + private static final float THRESHOLD_X = 2; public CleanRulings getCleanRulings(List tableCells, List rulings) { @@ -81,7 +82,7 @@ public class RulingCleaningService { for (Point2D p : points.subList(1, points.size() - 1)) { List last = groupedPoints.get(groupedPoints.size() - 1); - if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) { + if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD_X) { groupedPoints.get(groupedPoints.size() - 1).add(p); } else { groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); @@ -108,7 +109,7 @@ public class RulingCleaningService { for (Point2D p : points.subList(1, points.size() - 1)) { List last = groupedPoints.get(groupedPoints.size() - 1); - if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) { + if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD_Y) { groupedPoints.get(groupedPoints.size() - 1).add(p); } else { groupedPoints.add(new ArrayList<>(Collections.singletonList(p))); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index dd6bcc8..8dd639d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -5,7 +5,6 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -25,55 +24,62 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparis @Service public class TableExtractionService { - private static final Comparator X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> { + private static final int MAX_TABLE_OUTER_POINT_TOLERANCE = 10; + private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1; + private static final float SPREADSHEET_AREA_TOLERANCE = 0.001f; + + private static final Comparator X_FIRST_POINT_COMPARATOR = (point1, point2) -> { int rv = 0; - float arg0X = DoubleComparisons.round(arg0.getX(), 2); - float arg0Y = DoubleComparisons.round(arg0.getY(), 2); - float arg1X = DoubleComparisons.round(arg1.getX(), 2); - float arg1Y = DoubleComparisons.round(arg1.getY(), 2); + float point1X = DoubleComparisons.round(point1.getX(), 2); + float point1Y = DoubleComparisons.round(point1.getY(), 2); + float point2X = DoubleComparisons.round(point2.getX(), 2); + float point2Y = DoubleComparisons.round(point2.getY(), 2); - if (arg0X > arg1X) { + if (point1X > point2X) { rv = 1; - } else if (arg0X < arg1X) { + } else if (point1X < point2X) { rv = -1; - } else if (arg0Y > arg1Y) { + } else if (point1Y > point2Y) { rv = 1; - } else if (arg0Y < arg1Y) { + } else if (point1Y < point2Y) { rv = -1; } return rv; }; - private static final Comparator POINT_COMPARATOR = (arg0, arg1) -> { + private static final Comparator Y_FIRST_POINT_COMPARATOR = (point1, point2) -> { int rv = 0; - float arg0X = DoubleComparisons.round(arg0.getX(), 2); - float arg0Y = DoubleComparisons.round(arg0.getY(), 2); - float arg1X = DoubleComparisons.round(arg1.getX(), 2); - float arg1Y = DoubleComparisons.round(arg1.getY(), 2); + float point1X = DoubleComparisons.round(point1.getX(), 2); + float point1Y = DoubleComparisons.round(point1.getY(), 2); + float point2X = DoubleComparisons.round(point2.getX(), 2); + float point2Y = DoubleComparisons.round(point2.getY(), 2); - if (arg0Y > arg1Y) { + if (point1Y > point2Y) { rv = 1; - } else if (arg0Y < arg1Y) { + } else if (point1Y < point2Y) { rv = -1; - } else if (arg0X > arg1X) { + } else if (point1X > point2X) { rv = 1; - } else if (arg0X < arg1X) { + } else if (point1X < point2X) { rv = -1; } return rv; }; + private static final Comparator CELL_SIZE_COMPARATOR = (cell1, cell2) -> { - public boolean contains(Cell cell, double x, double y, double w, double h) { + Double cell1Size = cell1.getHeight() * cell1.getWidth(); + Double cell2Size = cell2.getHeight() * cell2.getWidth(); + return cell1Size.compareTo(cell2Size); + }; - if (cell.isEmpty() || w <= 0 || h <= 0) { - return false; - } - double x0 = cell.getX(); - double y0 = cell.getY(); - return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2); - } + private static final Comparator RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> { + + Double rect1Size = rect1.getHeight() * rect1.getWidth(); + Double rect2Size = rect2.getHeight() * rect2.getWidth(); + return rect1Size.compareTo(rect2Size); + }; /** @@ -89,22 +95,18 @@ public class TableExtractionService { * @param cleanRulings The lines used to build the table. * @param page Page object that contains textblocks and statistics. */ + public void extractTables(CleanRulings cleanRulings, ClassificationPage page) { List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); - - List toBeRemoved = new ArrayList<>(); + // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them + cells.sort(CELL_SIZE_COMPARATOR); for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; for (Cell cell : cells) { - if (cell.hasMinimumSize() && contains(cell, - textBlock.getPdfMinX(), - textBlock.getPdfMinY(), - textBlock.getPdfMaxX() - textBlock.getPdfMinX(), - textBlock.getPdfMaxY() - textBlock.getPdfMinY())) { + if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) { cell.addTextBlock(textBlock); - toBeRemoved.add(textBlock); break; } } @@ -114,39 +116,70 @@ public class TableExtractionService { DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); List spreadsheetAreas = findSpreadsheetsFromCells(cells); + // sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first + // this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells + spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR); List tables = new ArrayList<>(); for (Rectangle area : spreadsheetAreas) { - List overlappingCells = new ArrayList<>(); + List containedCells = new ArrayList<>(); for (Cell c : cells) { - if (c.hasMinimumSize() && c.intersects(area)) { - overlappingCells.add(c); + if (c.hasMinimumSize() && area.contains(c)) { + containedCells.add(c); } } - tables.add(new TablePageBlock(overlappingCells, area, page.getRotation())); + + var containedCellsWithText = containedCells.stream() + .filter(cell -> !cell.getTextBlocks().isEmpty()) + .count(); + + // verify if table would contain fewer cells with text than the threshold allows + if (containedCellsWithText >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT) { + tables.add(new TablePageBlock(containedCells, area, page.getRotation())); + cells.removeAll(containedCells); + } } for (TablePageBlock table : tables) { int position = -1; - Iterator itty = page.getTextBlocks().iterator(); - while (itty.hasNext()) { - AbstractPageBlock textBlock = itty.next(); - if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) { - position = page.getTextBlocks().indexOf(textBlock); + for (AbstractPageBlock pageBlock : page.getTextBlocks()) { + if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) { + position = page.getTextBlocks().indexOf(pageBlock); } } if (position != -1) { page.getTextBlocks().add(position, table); + + var toBeRemoved = table.getCells() + .stream() + .map(Cell::getTextBlocks) + .flatMap(List::stream) + .toList(); + // remove text blocks from the page that were also added with the table (from its contained cells) + page.getTextBlocks().removeAll(toBeRemoved); } } - - page.getTextBlocks().removeAll(toBeRemoved); } - public List findCells(List horizontalRulingLines, List verticalRulingLines) { + private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) { + + double x = textBlock.getPdfMinX(); + double y = textBlock.getPdfMinY(); + double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX(); + double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY(); + if (cell.isEmpty() || w <= 0 || h <= 0) { + return false; + } + double x0 = cell.getX(); + double y0 = cell.getY(); + return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2); + } + + + private List findCells(List horizontalRulingLines, List verticalRulingLines) { // Fix for 211.pdf for (Ruling r : horizontalRulingLines) { @@ -160,7 +193,7 @@ public class TableExtractionService { List cellsFound = new ArrayList<>(); Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); - intersectionPointsList.sort(POINT_COMPARATOR); + intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR); for (int i = 0; i < intersectionPointsList.size(); i++) { Point2D topLeft = intersectionPointsList.get(i); @@ -186,13 +219,14 @@ public class TableExtractionService { continue; } for (Point2D yPoint : yPoints) { - // is there an horizontal edge b/w topLeft and yPoint ? + // is there a horizontal edge b/w topLeft and yPoint ? if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) { continue; } Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); - if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals( - intersectionPoints.get(yPoint)[1])) { + if (intersectionPoints.containsKey(btmRight) + && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) + && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) { cellsFound.add(new Cell(topLeft, btmRight)); break outer; } @@ -214,7 +248,6 @@ public class TableExtractionService { Set pointSet = new HashSet<>(); Map edgesH = new HashMap<>(); Map edgesV = new HashMap<>(); - int i = 0; for (Rectangle cell : cells) { for (Point2D pt : cell.getPoints()) { @@ -231,8 +264,9 @@ public class TableExtractionService { pointsSortX.sort(X_FIRST_POINT_COMPARATOR); // Y first sort List pointsSortY = new ArrayList<>(pointSet); - pointsSortY.sort(POINT_COMPARATOR); + pointsSortY.sort(Y_FIRST_POINT_COMPARATOR); + int i = 0; while (i < pointSet.size()) { float currY = (float) pointsSortY.get(i).getY(); while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) { @@ -257,7 +291,8 @@ public class TableExtractionService { Point2D nextVertex; while (!edgesH.isEmpty()) { ArrayList polygon = new ArrayList<>(); - Point2D first = edgesH.keySet().iterator().next(); + Point2D first = edgesH.keySet() + .iterator().next(); polygon.add(new PolygonVertex(first, Direction.HORIZONTAL)); edgesH.remove(first); @@ -301,7 +336,14 @@ public class TableExtractionService { bottom = (float) Math.max(bottom, pt.point.getY()); right = (float) Math.max(right, pt.point.getX()); } - rectangles.add(new Rectangle(top, left, right - left, bottom - top)); + + // do not add polygons with too many outer points as they are unlikely to be tables + if (poly.size() <= MAX_TABLE_OUTER_POINT_TOLERANCE) { + rectangles.add(new Rectangle(top - SPREADSHEET_AREA_TOLERANCE, + left - SPREADSHEET_AREA_TOLERANCE, + right - left + 2 * SPREADSHEET_AREA_TOLERANCE, + bottom - top + 2 * SPREADSHEET_AREA_TOLERANCE)); + } } return rectangles; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java index e6a7332..fe15845 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java @@ -30,8 +30,6 @@ public class TableMergingUtility { if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable)) { consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable); - } else { - break; } } return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 06c053d..0e28417 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -36,9 +36,10 @@ public class ViewerDocumentTest extends BuildDocumentTest { Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); long start = System.currentTimeMillis(); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); - System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); + System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); } + @Test @Disabled @SneakyThrows @@ -52,7 +53,11 @@ public class ViewerDocumentTest extends BuildDocumentTest { var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class); var documentFile = new ClassPathResource(fileName).getFile(); - var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString()); + var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString()); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); @@ -61,3 +66,4 @@ public class ViewerDocumentTest extends BuildDocumentTest { } } + diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index 1f9e8a0..16fbacb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -29,8 +29,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; -import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; @@ -51,12 +49,6 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Autowired private RedactManagerClassificationService redactManagerClassificationService; - @Autowired - private CvTableParsingAdapter cvTableParsingAdapter; - - @Autowired - private ImageServiceResponseAdapter imageServiceResponseAdapter; - @Autowired private SectionsBuilderService sectionsBuilderService; @@ -65,11 +57,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - originDocument, - new ImageServiceResponse(), - tableServiceResponse, - new VisualLayoutParsingResponse(), - "document"); + originDocument, + new ImageServiceResponse(), + tableServiceResponse, + new VisualLayoutParsingResponse(), + "document"); redactManagerClassificationService.classifyDocument(classificationDocument); @@ -89,11 +81,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { @Test public void tablesToHtmlDebugger() throws IOException { - ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf"); + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html"); + toHtml(document, "/tmp/T5.html"); } @@ -111,6 +103,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Disabled @Test public void testScanRotationBorderIsIgnored() throws IOException { @@ -119,8 +112,16 @@ public class PdfSegmentationServiceTest extends AbstractTest { var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); - var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); + var tables = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList(); // Quality of the table parsing is not good, because the file is rotated at scanning. // We only asset that the table border is not the page border. @@ -142,12 +143,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { imageServiceResponse.getData() .forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), - imageMetadata.getPosition().getY1(), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight()), - ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), - imageMetadata.isAlpha(), - imageMetadata.getPosition().getPageNumber()))); + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()), + ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), + imageMetadata.isAlpha(), + imageMetadata.getPosition().getPageNumber()))); System.out.println("object"); } @@ -159,11 +160,22 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(0); assertThat(table.getColCount()).isEqualTo(6); assertThat(table.getRowCount()).isEqualTo(13); - assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); + assertThat(table.getRows() + .stream() + .mapToInt(List::size).sum()).isEqualTo(6 * 13); } @@ -173,15 +185,37 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1); + TablePageBlock secondTable = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(2); - List> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList()); - assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Collections::singletonList) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .toList().equals(firstTableHeaderCells))).isTrue(); } @@ -191,15 +225,37 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(0); assertThat(firstTable.getColCount()).isEqualTo(9); assertThat(firstTable.getRowCount()).isEqualTo(5); - TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1); + TablePageBlock secondTable = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(1); assertThat(secondTable.getColCount()).isEqualTo(9); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList()); - assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows() + .get(firstTable.getRowCount() - 1) + .stream() + .map(Cell::getHeaderCells) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .toList().equals(firstTableHeaderCells))).isTrue(); } @@ -209,19 +265,41 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1); + TablePageBlock secondTable = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList()); - assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows() + .get(0) + .stream() + .map(Collections::singletonList) + .collect(Collectors.toList()); + assertThat(secondTable.getRows() + .stream() + .allMatch(row -> row.stream() + .map(Cell::getHeaderCells) + .toList().equals(firstTableHeaderCells))).isTrue(); } - @Test // Non-sense test + @Test public void testDoc56Page170() throws IOException { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf"); @@ -232,8 +310,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 1, 1, 0, 0); validateTable(document, 1, 2, 2, 0, 0); - validateTable(document, 2, 6, 20, 0, 0); - validateTable(document, 3, 7, 31, 0, 0); + validateTable(document, 2, 4, 19, 12, 0); + validateTable(document, 3, 2, 12, 0, 0); } @@ -267,29 +345,30 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 8, 8, 0, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", - "Author, date", - "Study title", - "Analytical method Author, date, No.", - "Technique, LOQ of the method, validated working range", - "Method meets analytical validation criteria", - "Remarks (in case validation criteria are not met)", - "Acceptability of the method"), - Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), - Arrays.asList("CA 7.1.2.1.1 DAR (2009)", - "Evans P.G. 2001 TMJ4569B, VV-323245", - "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", - "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", - "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", - "Y", - "N/A", - "Y")); + "Author, date", + "Study title", + "Analytical method Author, date, No.", + "Technique, LOQ of the method, validated working range", + "Method meets analytical validation criteria", + "Remarks (in case validation criteria are not met)", + "Acceptability of the method"), + Arrays.asList( + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), + Arrays.asList("CA 7.1.2.1.1 DAR (2009)", + "Evans P.G. 2001 TMJ4569B, VV-323245", + "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", + "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", + "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", + "Y", + "N/A", + "Y")); validateTable(document, 0, values); @@ -581,10 +660,109 @@ public class PdfSegmentationServiceTest extends AbstractTest { } + @Test + public void testT0() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T0 TableWithMergedCells.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 1); + + validateTable(document, 0, 6, 8, 0, 0); + } + + + @Test + public void testT1() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T1 MultipleNestedTable.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 4); + + validateTable(document, 0, 3, 3, 0, 0); + validateTable(document, 1, 3, 5, 2, 0); + validateTable(document, 2, 3, 3, 1, 0); + validateTable(document, 3, 3, 3, 0, 0); + + } + + + @Test + public void testT2() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T2 MultipleTables.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 6); + + validateTable(document, 0, 5, 5, 0, 0); + validateTable(document, 1, 5, 6, 0, 0); + validateTable(document, 2, 5, 5, 0, 0); + validateTable(document, 3, 5, 5, 0, 0); + validateTable(document, 4, 5, 5, 0, 0); + validateTable(document, 5, 5, 5, 0, 0); + + } + + + @Test + public void testT3() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T3 S-Meto_Page29.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 1); + + validateTable(document, 0, 6, 5, 0, 0); + + } + + + @Test + public void testT4() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T4 138 IDD0000261736_Page16.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 1); + + validateTable(document, 0, 5, 8, 1, 0); + + } + + + @Test + public void testT5() throws IOException { + + ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf"); + + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); + + validateTableSize(document, 6); + validateTable(document, 0, 1, 1, 0, 0); + validateTable(document, 1, 1, 1, 0, 0); + validateTable(document, 2, 1, 1, 0, 0); + validateTable(document, 3, 1, 1, 0, 0); + validateTable(document, 4, 1, 1, 0, 0); + validateTable(document, 5, 1, 1, 0, 0); + + } + + @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { - var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); + var tables = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList(); StringBuilder sb = new StringBuilder(); int currentPage = 1; @@ -605,9 +783,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(tableIndex); List> rows = table.getRows(); - int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size(); + int emptyCellsFoundFound = rows.stream() + .flatMap(List::stream) + .toList() + .stream() + .filter(f -> f.toString().isEmpty()) + .toList().size(); for (List row : table.getRows()) { row.forEach(r -> System.out.println(r.toString())); @@ -622,11 +810,20 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, List> values) { - TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); + TablePageBlock table = document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList() + .get(tableIndex); List> rows = table.getRows(); - List rowsFlattened = rows.stream().flatMap(List::stream).toList(); - List valuesFlattened = values.stream().flatMap(List::stream).toList(); + List rowsFlattened = rows.stream() + .flatMap(List::stream) + .toList(); + List valuesFlattened = values.stream() + .flatMap(List::stream) + .toList(); for (int i = 0; i < valuesFlattened.size(); i++) { Cell cell = rowsFlattened.get(i); @@ -639,7 +836,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTableSize(ClassificationDocument document, int tableSize) { - assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize); + assertThat(document.getSections() + .stream() + .flatMap(paragraph -> paragraph.getTables() + .stream()) + .toList().size()).isEqualTo(tableSize); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 8025534..86b3c69 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -27,10 +27,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; -import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; -import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; import lombok.SneakyThrows; @@ -58,9 +56,6 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows public void testTableExtraction() { - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); - LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); - ClassPathResource resource = new ClassPathResource("files"); List pdfFileNames = Files.walk(resource.getFile().toPath()) .filter(path -> path.getFileName().toString().endsWith(".pdf")) @@ -68,8 +63,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { .map(Path::toString) .toList(); - for (int i = 0; i < pdfFileNames.size(); i++) { - writeJsons(Path.of(pdfFileNames.get(i))); + for (String pdfFileName : pdfFileNames) { + writeJsons(Path.of(pdfFileName)); } } @@ -91,13 +86,13 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { filename.toFile().toString())); DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter); - if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) { - String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString(); + if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) { + String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString(); try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) { PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore); pdDocument.save(tmpFileNameBefore); } - String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString(); + String tmpFileNameAfter = "/tmp/after." + filename.getFileName().toString(); try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) { PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter); pdDocument.save(tmpFileNameAfter); @@ -108,9 +103,9 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows - private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) { + private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) { - List listStructure1 = structure1.streamAllEntries() + List listStructure1 = structure1.streamAllEntries() .filter(entryData -> entryData.getType().equals(NodeType.TABLE)) .map(DocumentStructure.EntryData::getProperties) .map(properties -> { @@ -120,7 +115,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { }) .toList(); - List listStructure2 = structure2.streamAllEntries() + List
listStructure2 = structure2.streamAllEntries() .filter(entryData -> entryData.getType().equals(NodeType.TABLE)) .map(DocumentStructure.EntryData::getProperties) .map(properties -> { @@ -131,8 +126,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { .toList(); for (int i = 0; i < listStructure1.size(); i++) { - Table tableNode1 = (Table) listStructure1.get(i); - Table tableNode2 = (Table) listStructure2.get(i); + Table tableNode1 = listStructure1.get(i); + Table tableNode2 = listStructure2.get(i); if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) { return false; } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T0 TableWithMergedCells.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T0 TableWithMergedCells.pdf new file mode 100644 index 0000000..d1799c6 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T0 TableWithMergedCells.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T1 MultipleNestedTable.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T1 MultipleNestedTable.pdf new file mode 100644 index 0000000..18e9d53 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T1 MultipleNestedTable.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T2 MultipleTables.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T2 MultipleTables.pdf new file mode 100644 index 0000000..c162250 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T2 MultipleTables.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T3 S-Meto_Page29.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T3 S-Meto_Page29.pdf new file mode 100644 index 0000000..ecacabc Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T3 S-Meto_Page29.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T4 138 IDD0000261736_Page16.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T4 138 IDD0000261736_Page16.pdf new file mode 100644 index 0000000..8b25470 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T4 138 IDD0000261736_Page16.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T5 VV-640252-Page16.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T5 VV-640252-Page16.pdf new file mode 100644 index 0000000..0edf419 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SinglePages/T5 VV-640252-Page16.pdf differ