Compare commits

...

24 Commits

Author SHA1 Message Date
Dominique Eifländer
c4c71efadd Merge branch 'RED-8933-4.0' into 'release/0.89.x'
RED-8933: Fixed bugs in DocumineClassificationService

See merge request fforesight/layout-parser!149
2024-05-08 13:31:25 +02:00
Dominique Eifländer
5e88cb9a2d RED-8933: Fixed bugs in DocumineClassificationService 2024-05-08 12:56:51 +02:00
Corina Olariu
45ff220d83 Merge branch 'RED-8992-bp' into 'release/0.89.x'
RED-8992: Enable to add annotation on header with line breaks

See merge request fforesight/layout-parser!142
2024-04-24 13:51:31 +02:00
Corina Olariu
f4f01644f7 RED-8992 - Enable to add annotation on header with line breaks
- don't reorder textblocks classified as headers and footers
- add unit test
2024-04-24 13:36:36 +03:00
Dominique Eifländer
9eaecdf378 RED-8932 Fixed not merged headline with identifier 2024-04-24 11:44:17 +02:00
Kilian Schüttler
59745a916c Merge branch 'RED-7384' into 'release/0.89.x'
RED-7384: add empty textBlock to Image to ensure continuous textranges across all SemanticNodes

See merge request fforesight/layout-parser!139
2024-04-23 12:03:32 +02:00
Kilian Schuettler
0dda309829 RED-7384: add empty textBlock to Image to ensure continuous textranges across all SemanticNodes 2024-04-23 11:30:13 +02:00
Kilian Schüttler
bfa90c2d79 Merge branch 'RED-8995' into 'release/0.89.x'
RED-8995: swap incremental save for save without compression to correct wrong layers in rare cases

See merge request fforesight/layout-parser!136
2024-04-23 10:44:59 +02:00
Kilian Schuettler
37f7a6a03f RED-8995: swap incremental save for save without compression to correct wrong layers in rare cases 2024-04-22 11:00:43 +02:00
Kilian Schüttler
bdbac18169 Merge branch 'RED-8995' into 'release/0.89.x'
RED-8995: unclassified text might be missing from document data

See merge request fforesight/layout-parser!133
2024-04-18 10:01:12 +02:00
Kilian Schuettler
2addf63baf RED-8995: unclassified text might be missing from document data
* treat TablePageBlock.OTHER like PARAGRAPH (no special treatment)
2024-04-17 17:40:21 +02:00
Kilian Schüttler
778bae0f7f Merge branch 'RED-8747' into 'release/0.89.x'
RED-8747 - Entities not merged properly

See merge request fforesight/layout-parser!130
2024-04-09 16:30:24 +02:00
Corina Olariu
a01958c842 RED-8747 - Entities not merged properly 2024-04-09 16:30:24 +02:00
Kilian Schüttler
fbe9a34343 Merge branch 'RED-8799' into 'release/0.89.x'
RED-8799: LayoutGrid is wrong draw for some tables

See merge request fforesight/layout-parser!127
2024-04-05 13:42:36 +02:00
Kilian Schüttler
fd7c461c8d RED-8799: LayoutGrid is wrong draw for some tables 2024-04-05 13:42:36 +02:00
Dominique Eifländer
cafbcbefc6 Merge branch 'RED-8873-bp' into 'release/0.89.x'
RED-8773 - Fix images not appearing on specific file

See merge request fforesight/layout-parser!124
2024-04-05 10:11:31 +02:00
Andrei Isvoran
34b260bb60 RED-8773 - Fix images not appearing on specific file 2024-04-03 10:21:45 +03:00
Dominique Eifländer
1ca02f72c8 Merge branch 'RED-8627-4.0' into 'release/0.89.x'
RED-8627: Fixed scrambled text after sorting

See merge request fforesight/layout-parser!121
2024-03-19 11:26:49 +01:00
Dominique Eifländer
350513a699 RED-8627: Fixed scrambled text after sorting 2024-03-19 11:16:07 +01:00
Dominique Eifländer
ab7b2cf0d5 Merge branch 'RED-7384' into 'release/0.89.x'
RED-7384: Fixes for ClassCastException

See merge request fforesight/layout-parser!110
2024-03-08 12:53:14 +01:00
Kilian Schuettler
007cbfd1ee RED-7384: Fixes for ClassCastException
* changed save -> incrementalSave
* always use origin file instead of reusing viewerdoc
* Sometimes the viewer document is corrupted after saving and missing the contentstreams on a random page, for the files we viewed it did not seem to happen with incrementalSave. Might only be a timing issue though
2024-03-08 12:42:40 +01:00
Maverick Studer
a266d98f11 Merge branch 'RED-8550-bp' into 'release/0.89.x'
RED-8550: Faulty table recognition and text duplication leads to huge sections

See merge request fforesight/layout-parser!107
2024-02-29 14:17:31 +01:00
Maverick Studer
33f726c689 RED-8550: Faulty table recognition and text duplication leads to huge sections
(cherry picked from commit 74f55a5cbf905d0f869d7aa2c12c80a6d9c42e36)
2024-02-29 13:09:50 +01:00
Maverick Studer
18a28e82d0 RED-8550: Faulty table recognition and text duplication leads to huge sections
* cherrypick
2024-02-21 14:19:48 +01:00
48 changed files with 1531 additions and 820 deletions

View File

@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -94,16 +95,23 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
// File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
// .orElse(originFile);
File viewerDocumentFile = originFile;
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
@ -142,25 +150,25 @@ public class LayoutParsingPipeline {
.numberOfPages(documentGraph.getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.build();
}
@ -170,9 +178,9 @@ public class LayoutParsingPipeline {
AtomicReference<Document> documentReference = new AtomicReference<>();
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> {
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument));
});
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
.contextualName("build-document-graph")
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
return documentReference.get();
}
@ -181,14 +189,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -243,8 +251,10 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
};
@ -319,9 +329,7 @@ public class LayoutParsingPipeline {
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) {
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
}
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());

View File

@ -6,12 +6,14 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractPageBlock {
@EqualsAndHashCode(callSuper = true)
public abstract class AbstractPageBlock extends Rectangle {
@JsonIgnore
protected float minX;

View File

@ -12,7 +12,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -36,6 +35,8 @@ public class Image implements GenericSemanticNode {
boolean transparent;
Rectangle2D position;
TextBlock leafTextBlock;
boolean redaction;
boolean ignored;
@Builder.Default
@ -66,7 +67,7 @@ public class Image implements GenericSemanticNode {
@Override
public TextBlock getTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
return leafTextBlock;
}
@ -92,4 +93,11 @@ public class Image implements GenericSemanticNode {
return bBoxPerPage;
}
@Override
public boolean isLeaf() {
// An Image carries its own leafTextBlock and has no sub-nodes, so it
// terminates the semantic-node tree; leaf-filtering traversals
// (e.g. streamAllSubNodes().filter(SemanticNode::isLeaf)) now include images.
return true;
}
}

View File

@ -84,14 +84,16 @@ public class TableCell implements GenericSemanticNode {
private TextBlock buildTextBlock() {
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
return streamAllSubNodes().filter(SemanticNode::isLeaf)
.map(SemanticNode::getLeafTextBlock)
.collect(new TextBlockCollector());
}
@Override
public String toString() {
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.buildTextBlock().buildSummary();
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
}
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@ -36,6 +37,12 @@ public class Cell extends Rectangle {
}
// Converts an AWT Rectangle2D into a table Cell.
// NOTE(review): the super call passes getY() before getX(), which suggests
// Rectangle's constructor takes (top, left, width, height) — confirm against
// the Rectangle base class before relying on this order.
public Cell(Rectangle2D r) {
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
}
public void addTextBlock(TextPageBlock textBlock) {
textBlocks.add(textBlock);

View File

@ -20,7 +20,8 @@ import lombok.extern.slf4j.Slf4j;
@SuppressWarnings("all")
public class Ruling extends Line2D.Float {
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
public Ruling(Point2D p1, Point2D p2) {
@ -110,8 +111,8 @@ public class Ruling extends Line2D.Float {
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
@ -151,7 +152,7 @@ public class Ruling extends Line2D.Float {
if (i == null) {
continue;
}
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
} catch (UnsupportedOperationException e) {
log.info("Some line are oblique, ignoring...");
continue;
@ -267,7 +268,7 @@ public class Ruling extends Line2D.Float {
}
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
public boolean nearlyIntersects(Ruling another) {
if (this.intersectsLine(another)) {
return true;
@ -276,9 +277,9 @@ public class Ruling extends Line2D.Float {
boolean rv = false;
if (this.perpendicularTo(another)) {
rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
rv = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT).intersectsLine(another);
} else {
rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
rv = this.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT).intersectsLine(another.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT));
}
return rv;
@ -319,8 +320,8 @@ public class Ruling extends Line2D.Float {
public Point2D intersectionPoint(Ruling other) {
Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling horizontal, vertical;
if (!this_l.intersectsLine(other_l)) {

View File

@ -1,18 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Getter;
import lombok.Setter;
@ -21,7 +21,8 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TablePageBlock extends AbstractPageBlock {
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
private final int rotation;
@Getter
@ -30,10 +31,14 @@ public class TablePageBlock extends AbstractPageBlock {
private int unrotatedRowCount;
private int unrotatedColCount;
private List<List<Cell>> rows;
@Getter
@Setter
private List<Cell> cells;
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
this.cells = cells;
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
@ -50,6 +55,7 @@ public class TablePageBlock extends AbstractPageBlock {
return getColCount() == 0 || getRowCount() == 0;
}
public List<List<Cell>> getRows() {
if (rows == null) {
@ -80,14 +86,17 @@ public class TablePageBlock extends AbstractPageBlock {
public int getColCount() {
return getRows().stream().mapToInt(List::size).max().orElse(0);
return getRows().stream()
.mapToInt(List::size)
.max()
.orElse(0);
}
/**
* Detect header cells (either first row or first column):
* Column is marked as header if cell text is bold and row cell text is not bold.
* Column is marked as header if cell text is bold and row cell text is not bold.
* Defaults to row.
*/
private void computeHeaders() {
@ -95,7 +104,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (rows == null) {
rows = computeRows();
}
// A bold cell is a header cell as long as every cell to the left/top is bold, too
// A bold cell is a header cell as long as every cell to the left/top is bold, too
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
@ -120,7 +129,8 @@ public class TablePageBlock extends AbstractPageBlock {
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i).get(colIndex));
cellsToTheTop.add(rows.get(i)
.get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
@ -135,7 +145,8 @@ public class TablePageBlock extends AbstractPageBlock {
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
.get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
@ -151,7 +162,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
Cell cell = cells.get(new CellPosition(j, i));
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -162,7 +173,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedRowCount; j++) { // cols
Cell cell = cells.get(new CellPosition(j, i));
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
@ -173,7 +184,7 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < unrotatedRowCount; i++) {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedColCount; j++) {
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
}
@ -187,17 +198,6 @@ public class TablePageBlock extends AbstractPageBlock {
}
// Stores one cell at its (row, col) grid position and widens the table's
// unrotated row/column counts so they always cover every stored position.
private void add(Cell chunk, int row, int col) {
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
// Index the cell by its grid coordinates for later row/column reconstruction.
CellPosition cp = new CellPosition(row, col);
cells.put(cp, chunk);
}
private void addCells(List<Cell> cells) {
if (cells.isEmpty()) {
@ -206,11 +206,12 @@ public class TablePageBlock extends AbstractPageBlock {
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
List<List<Cell>> rowsOfCells = calculateStructure(cells);
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
for (int i = 0; i < rowsOfCells.size(); i++) {
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
add(rowsOfCells.get(i).get(j), i, j);
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
.get(j), i, j);
}
}
@ -221,29 +222,36 @@ public class TablePageBlock extends AbstractPageBlock {
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
*
* @param cells The found cells
* @return TablePageBlock Structure
* @return TablePageBlock Structure as a rows of cells matrix
*/
private List<List<Cell>> calculateStructure(List<Cell> cells) {
List<List<Cell>> matrix = new ArrayList<>();
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
if (cells.isEmpty()) {
return matrix;
return new ArrayList<>();
}
Set<Float> uniqueX = new HashSet<>();
Set<Float> uniqueY = new HashSet<>();
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
});
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
});
var sortedUniqueX = uniqueX.stream().sorted().toList();
var sortedUniqueY = uniqueY.stream().sorted().toList();
var sortedUniqueX = uniqueX.stream()
.sorted()
.toList();
var sortedUniqueY = uniqueY.stream()
.sorted()
.toList();
List<List<Cell>> rowsOfCells = new ArrayList<>();
Float prevY = null;
for (Float y : sortedUniqueY) {
List<Cell> row = new ArrayList<>();
@ -252,44 +260,87 @@ public class TablePageBlock extends AbstractPageBlock {
for (Float x : sortedUniqueX) {
if (prevY != null && prevX != null) {
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst();
if (cellFromGridStructure.hasMinimumSize()) {
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
row.add(cell);
cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
.map(CellWithIntersection::originalCell)
.ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));
row.add(cellFromGridStructure);
}
}
prevX = x;
}
if (prevY != null && prevX != null && !row.isEmpty()) {
matrix.add(row);
// exclude empty rows and rows where all text blocks are empty
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
rowsOfCells.add(row);
}
prevY = y;
}
Collections.reverse(matrix);
Collections.reverse(rowsOfCells);
return matrix;
}
public boolean intersects(Cell cell1, Cell cell2) {
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
return false;
// now cells are removed which are part of a column without any text blocks
// this is done by first computing the inverse matrix which contains all columns of cells
// then the column indices that have to be removed are determined
List<List<Cell>> columnsOfCells = new ArrayList<>();
int maxRowLength = rowsOfCells.stream()
.map(List::size)
.max(java.util.Comparator.naturalOrder())
.orElse(0);
for (int i = 0; i < maxRowLength; i++) {
columnsOfCells.add(new ArrayList<>());
}
double x0 = cell1.getX() + 2;
double y0 = cell1.getY() + 2;
return (cell2.x + cell2.width > x0 &&
cell2.y + cell2.height > y0 &&
cell2.x < x0 + cell1.getWidth() -2 &&
cell2.y < y0 + cell1.getHeight() -2);
for (List<Cell> row : rowsOfCells) {
for (int j = 0; j < row.size(); j++) {
columnsOfCells.get(j).add(row.get(j));
}
}
List<Integer> columnIndicesToRemove = new ArrayList<>();
int columnIndex = 0;
for (List<Cell> col : columnsOfCells) {
if (col.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
columnIndicesToRemove.add(columnIndex);
}
columnIndex++;
}
columnIndicesToRemove.sort(Collections.reverseOrder());
// update all rows so that the values of the empty columns get removed
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
rowsOfCells = new ArrayList<>();
for (List<Cell> row : rowsOfCellsBefore) {
var updatedRow = new ArrayList<>(row);
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
rowsOfCells.add(updatedRow);
}
return rowsOfCells;
}
// Registers a single cell at its grid coordinates, growing the table's
// unrotated row and column counts whenever the new position lies outside
// the dimensions seen so far.
private void addCellToRowAndCol(Cell cell, int row, int col) {
if (row + 1 > unrotatedRowCount) {
unrotatedRowCount = row + 1;
}
if (col + 1 > unrotatedColCount) {
unrotatedColCount = col + 1;
}
cellTreeMap.put(new CellPosition(row, col), cell);
}
@Override
public String getText() {
@ -314,7 +365,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (!first) {
sb.append("\n");
}
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"');
first = false;
}
}
@ -328,8 +379,6 @@ public class TablePageBlock extends AbstractPageBlock {
}
public String getTextAsHtml() {
StringBuilder sb = new StringBuilder();
@ -363,4 +412,9 @@ public class TablePageBlock extends AbstractPageBlock {
return sb.toString();
}
// Pairs an originally parsed cell with the area it shares with a
// grid-derived cell; used to select the best-matching source cell
// (largest intersection) when transferring text blocks onto the grid.
record CellWithIntersection(Cell originalCell, double intersectedArea) {
}
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.fasterxml.jackson.annotation.JsonIgnore;
@ -50,7 +49,13 @@ public class RedTextPosition {
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition();
BeanUtils.copyProperties(textPosition, pos);
pos.setRotation(textPosition.getRotation());
pos.setPageHeight(textPosition.getPageHeight());
pos.setPageWidth(textPosition.getPageWidth());
pos.setUnicode(textPosition.getUnicode());
pos.setDir(textPosition.getDir());
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
pos.setFontSizeInPt(textPosition.getFontSizeInPt());

View File

@ -190,6 +190,12 @@ public class TextPositionSequence implements CharSequence {
}
// Height of this sequence without the HEIGHT_PADDING applied by getTextHeight().
// Uses the direction-adjusted height of the first text position; assumes the
// sequence is non-empty.
public float getTextHeightNoPadding() {
var firstPosition = textPositions.get(0);
return firstPosition.getHeightDir();
}
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;

View File

@ -1,21 +1,21 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Line2D;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -25,25 +25,145 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class RulingCleaningService {
private static final float THRESHOLD = 6;
private static final float THRESHOLD_X_VERTICAL = 1;
private static final float THRESHOLD_Y_VERTICAL = 2;
private static final float THRESHOLD_X_HORIZONTAL = 2;
private static final float THRESHOLD_Y_HORIZONTAL = 3;
// Produces cleaned, deduplicated vertical and horizontal ruling lines.
// Prefers real PDF rulings (snapped to a common grid first); when none
// exist, derives rulings from the externally parsed table cells instead.
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
Rulings rawLines;
if (rulings.isEmpty()) {
rawLines = getRulingsFromParsedCells(tableCells);
} else {
snapPoints(rulings);
rawLines = extractVerticalAndHorizontalRulingLines(rulings);
}
// Sort x-first so downstream grouping can rely on the ordering.
rawLines.verticalLines.sort(X_FIRST_RULING_COMPARATOR);
rawLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
Rulings cleaned = cleanRulings(rawLines);
return CleanRulings.builder()
.vertical(cleaned.verticalLines())
.horizontal(cleaned.horizontalLines())
.build();
}
/**
 * Collapses clusters of near-duplicate rulings into single lines.
 * Each ruling is expanded into an overlap rectangle; rectangles that touch are
 * grouped, and every group is replaced by one ruling centered in the group's
 * bounding box.
 *
 * @param rulings sorted vertical and horizontal ruling lines
 * @return merged rulings, one per overlap group
 */
private Rulings cleanRulings(Rulings rulings) {
    List<Rectangle> verticalBoxes = rulings.verticalLines().stream()
            .map(RulingCleaningService::getOverlapRectangle)
            .distinct()
            .toList();
    List<Ruling> mergedVerticals = groupOverlappingRectangles(verticalBoxes).stream()
            .map(Rectangle::boundingBoxOf)
            .map(RulingCleaningService::getXCenteredRuling)
            .toList();
    List<Rectangle> horizontalBoxes = rulings.horizontalLines().stream()
            .map(RulingCleaningService::getOverlapRectangle)
            .distinct()
            .toList();
    // NOTE: kept as a mutable collect() like the original, in case callers mutate it
    List<Ruling> mergedHorizontals = groupOverlappingRectangles(horizontalBoxes).stream()
            .map(Rectangle::boundingBoxOf)
            .map(RulingCleaningService::getYCenteredRuling)
            .collect(Collectors.toList());
    return new Rulings(mergedVerticals, mergedHorizontals);
}
/**
 * Partitions rectangles into groups of transitively overlapping rectangles
 * using a union-find over all intersecting pairs.
 *
 * @param rectangles rectangles in x-y sorted order (enables the early break)
 * @return one list per connected group of intersecting rectangles
 */
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
    UnionFind<Rectangle> components = new UnionFind<>();
    for (int outer = 0; outer < rectangles.size(); outer++) {
        Rectangle first = rectangles.get(outer);
        for (int inner = outer + 1; inner < rectangles.size(); inner++) {
            Rectangle second = rectangles.get(inner);
            // input is x-y sorted, so once a candidate lies past both the right
            // and bottom edge no later candidate can intersect — stop early
            if (first.getRight() < second.getLeft() && first.getBottom() < second.getTop()) {
                break;
            }
            if (first.intersects(second)) {
                components.union(first, second);
            }
        }
    }
    // bucket every rectangle under its component root
    Map<Rectangle, List<Rectangle>> byRoot = new HashMap<>();
    for (Rectangle rect : rectangles) {
        byRoot.computeIfAbsent(components.find(rect), key -> new ArrayList<>()).add(rect);
    }
    return new ArrayList<>(byRoot.values());
}
/**
 * Computes the normalised bounding box of a ruling, expanded by
 * orientation-specific thresholds so that nearby parallel rulings produce
 * intersecting rectangles and end up merged into one line.
 *
 * @param ruling the ruling to expand
 * @return expanded rectangle: Rectangle(top, left, width, height)
 */
private static Rectangle getOverlapRectangle(Ruling ruling) {
    // normalise endpoint order without caring which endpoint came first
    float left = Math.min(ruling.x1, ruling.x2);
    float top = Math.min(ruling.y1, ruling.y2);
    float w = Math.abs(ruling.x1 - ruling.x2);
    float h = Math.abs(ruling.y1 - ruling.y2);
    float dx = ruling.horizontal() ? THRESHOLD_X_HORIZONTAL : THRESHOLD_X_VERTICAL;
    float dy = ruling.horizontal() ? THRESHOLD_Y_HORIZONTAL : THRESHOLD_Y_VERTICAL;
    return new Rectangle(top - dy, left - dx, w + 2 * dx, h + 2 * dy);
}
/**
 * Builds a vertical ruling through the horizontal centre of the rectangle,
 * shrinking the endpoints back by the vertical expansion threshold that
 * {@code getOverlapRectangle} added.
 *
 * @param rectangle merged bounding box of a vertical ruling group
 * @return a single vertical ruling representing the group
 */
public static Ruling getXCenteredRuling(Rectangle rectangle) {
    float centerX = (float) rectangle.getCenterX();
    Point2D start = new Point2D.Float(centerX, rectangle.getTop() + THRESHOLD_Y_VERTICAL);
    Point2D end = new Point2D.Float(centerX, rectangle.getBottom() - THRESHOLD_Y_VERTICAL);
    return new Ruling(start, end);
}
/**
 * Builds a horizontal ruling through the vertical centre of the rectangle,
 * shrinking the endpoints back by the horizontal expansion threshold that
 * {@code getOverlapRectangle} added.
 *
 * @param rectangle merged bounding box of a horizontal ruling group
 * @return a single horizontal ruling representing the group
 */
public static Ruling getYCenteredRuling(Rectangle rectangle) {
    float centerY = (float) rectangle.getCenterY();
    Point2D start = new Point2D.Float(rectangle.getLeft() + THRESHOLD_X_HORIZONTAL, centerY);
    Point2D end = new Point2D.Float(rectangle.getRight() - THRESHOLD_X_HORIZONTAL, centerY);
    return new Ruling(start, end);
}
private Rulings extractVerticalAndHorizontalRulingLines(List<Ruling> rulings) {
List<Ruling> vrs = new ArrayList<>();
for (Ruling vr : rulings) {
if (vr.vertical()) {
vrs.add(vr);
}
}
if (vrs.isEmpty()) {
vrs.addAll(extractVerticalRulings(tableCells));
}
List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
List<Ruling> hrs = new ArrayList<>();
for (Ruling hr : rulings) {
@ -51,98 +171,26 @@ public class RulingCleaningService {
hrs.add(hr);
}
}
if (hrs.isEmpty()) {
hrs.addAll(extractHorizontalRulings(tableCells));
}
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
return new Rulings(vrs, hrs);
}
public void snapPoints(List<? extends Line2D.Float> rulings) {
private Rulings getRulingsFromParsedCells(List<TableCells> tableCells) {
// collect points and keep a Line -> p1,p2 map
Map<Line2D.Float, Point2D[]> linesToPoints = new HashMap<>();
List<Point2D> points = new ArrayList<>();
for (Line2D.Float r : rulings) {
Point2D p1 = r.getP1();
Point2D p2 = r.getP2();
linesToPoints.put(r, new Point2D[]{p1, p2});
points.add(p1);
points.add(p2);
}
// snap by X
points.sort(Comparator.comparingDouble(Point2D::getX));
List<List<Point2D>> groupedPoints = new ArrayList<>();
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
for (Point2D p : points.subList(1, points.size() - 1)) {
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) {
groupedPoints.get(groupedPoints.size() - 1).add(p);
} else {
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
}
}
for (List<Point2D> group : groupedPoints) {
float avgLoc = 0;
for (Point2D p : group) {
avgLoc += p.getX();
}
avgLoc /= group.size();
for (Point2D p : group) {
p.setLocation(avgLoc, p.getY());
}
}
// ---
// snap by Y
points.sort(Comparator.comparingDouble(Point2D::getY));
groupedPoints = new ArrayList<>();
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
for (Point2D p : points.subList(1, points.size() - 1)) {
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) {
groupedPoints.get(groupedPoints.size() - 1).add(p);
} else {
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
}
}
for (List<Point2D> group : groupedPoints) {
float avgLoc = 0;
for (Point2D p : group) {
avgLoc += p.getY();
}
avgLoc /= group.size();
for (Point2D p : group) {
p.setLocation(p.getX(), avgLoc);
}
}
// ---
// finally, modify lines
for (Map.Entry<Line2D.Float, Point2D[]> ltp : linesToPoints.entrySet()) {
Point2D[] p = ltp.getValue();
ltp.getKey().setLine(p[0], p[1]);
}
List<Ruling> vrs = extractVerticalRulingsFromParsedCells(tableCells);
List<Ruling> hrs = extractHorizontalRulingsFromParsedCells(tableCells);
return new Rulings(vrs, hrs);
}
private Collection<? extends Ruling> extractVerticalRulings(List<TableCells> cvParsedTableCells) {
private List<Ruling> extractVerticalRulingsFromParsedCells(List<TableCells> tableCells) {
List<Ruling> vrs = new ArrayList<>();
if (cvParsedTableCells != null) {
for (TableCells cvParsedTableCell : cvParsedTableCells) {
Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
if (tableCells != null) {
for (TableCells tableCell : tableCells) {
Ruling leftLine = createRuling(tableCell.getX0(), tableCell.getX0(), tableCell.getY0(), tableCell.getY1());
Ruling rightLine = createRuling(tableCell.getX1(), tableCell.getX1(), tableCell.getY0(), tableCell.getY1());
vrs.add(leftLine);
vrs.add(rightLine);
}
@ -151,19 +199,18 @@ public class RulingCleaningService {
}
private Collection<? extends Ruling> extractHorizontalRulings(List<TableCells> cvParsedTableCells) {
private List<Ruling> extractHorizontalRulingsFromParsedCells(List<TableCells> tableCells) {
List<Ruling> hrs = new ArrayList<>();
if (cvParsedTableCells != null) {
for (TableCells cvParsedTableCell : cvParsedTableCells) {
Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
if (tableCells != null) {
for (TableCells tableCell : tableCells) {
Ruling topLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY1(), tableCell.getY1());
Ruling baseLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY0(), tableCell.getY0());
hrs.add(topLine);
hrs.add(baseLine);
}
}
return hrs;
}
@ -189,46 +236,8 @@ public class RulingCleaningService {
}
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
private record Rulings(List<Ruling> verticalLines, List<Ruling> horizontalLines) {
int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
}
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
ArrayList<Ruling> rv = new ArrayList<>();
lines.sort((a, b) -> {
final float diff = a.getPosition() - b.getPosition();
return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
});
for (Ruling next_line : lines) {
Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
// if current line colinear with next, and are "close enough": expand current line
if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
final float lastStart = last.getStart();
final float lastEnd = last.getEnd();
final boolean lastFlipped = lastStart > lastEnd;
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
boolean differentDirections = nextFlipped != lastFlipped;
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
last.setStartEnd(newStart, newEnd);
assert !last.oblique();
} else if (next_line.length() == 0) {
continue;
} else {
rv.add(next_line);
}
}
return rv;
}
}

View File

@ -43,7 +43,6 @@ public class SectionsBuilderService {
for (ClassificationPage page : document.getPages()) {
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
List<TextPageBlock> unclassifiedText = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
if (current.getClassification() == null) {
@ -62,11 +61,6 @@ public class SectionsBuilderService {
continue;
}
if (current.getClassification().equals(PageBlockType.OTHER)) {
unclassifiedText.add((TextPageBlock) current);
continue;
}
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
@ -94,9 +88,6 @@ public class SectionsBuilderService {
if (!footer.isEmpty()) {
footers.add(new ClassificationFooter(footer));
}
if (!unclassifiedText.isEmpty()) {
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
}
}
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);

View File

@ -1,14 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Point2D;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -21,59 +20,15 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
@Service
public class TableExtractionService {
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
} else if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
}
return rv;
};
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
int rv = 0;
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
if (arg0Y > arg1Y) {
rv = 1;
} else if (arg0Y < arg1Y) {
rv = -1;
} else if (arg0X > arg1X) {
rv = 1;
} else if (arg0X < arg1X) {
rv = -1;
}
return rv;
};
/**
 * Checks whether the axis-aligned box (x, y, w, h) lies inside the cell,
 * allowing a 2-unit tolerance on every edge.
 *
 * @param cell candidate containing cell
 * @param x    left coordinate of the box
 * @param y    top coordinate of the box
 * @param w    box width; non-positive boxes are never contained
 * @param h    box height; non-positive boxes are never contained
 * @return true when the (tolerance-expanded) cell contains the box
 */
public boolean contains(Cell cell, double x, double y, double w, double h) {
    // empty cells and degenerate boxes never contain anything
    if (cell.isEmpty() || w <= 0 || h <= 0) {
        return false;
    }
    double left = cell.getX();
    double top = cell.getY();
    boolean insideHorizontally = x >= left - 2 && x + w <= left + cell.getWidth() + 2;
    boolean insideVertically = y >= top - 2 && y + h <= top + cell.getHeight() + 2;
    return insideHorizontally && insideVertically;
}
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
/**
@ -89,22 +44,18 @@ public class TableExtractionService {
* @param cleanRulings The lines used to build the table.
* @param page Page object that contains textblocks and statistics.
*/
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
List<TextPageBlock> toBeRemoved = new ArrayList<>();
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
cells.sort(CELL_SIZE_COMPARATOR);
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) {
if (cell.hasMinimumSize() && contains(cell,
textBlock.getPdfMinX(),
textBlock.getPdfMinY(),
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
cell.addTextBlock(textBlock);
toBeRemoved.add(textBlock);
break;
}
}
@ -113,245 +64,94 @@ public class TableExtractionService {
cells = new ArrayList<>(new HashSet<>(cells));
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) {
List<Cell> overlappingCells = new ArrayList<>();
List<Cell> containedCells = new ArrayList<>();
for (Cell c : cells) {
if (c.hasMinimumSize() && c.intersects(area)) {
overlappingCells.add(c);
if (c.hasMinimumSize() && area.contains(c)) {
containedCells.add(c);
}
}
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
var containedCellsWithText = containedCells.stream()
.filter(cell -> !cell.getTextBlocks().isEmpty())
.toList();
// verify if table would contain fewer cells with text than the threshold allows
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
cells.removeAll(containedCells);
}
}
for (TablePageBlock table : tables) {
int position = -1;
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
while (itty.hasNext()) {
AbstractPageBlock textBlock = itty.next();
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
position = page.getTextBlocks().indexOf(textBlock);
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
position = page.getTextBlocks().indexOf(pageBlock);
}
}
if (position != -1) {
page.getTextBlocks().add(position, table);
var toBeRemoved = table.getCells()
.stream()
.map(Cell::getTextBlocks)
.flatMap(List::stream)
.toList();
// remove text blocks from the page that were also added with the table (from its contained cells)
page.getTextBlocks().removeAll(toBeRemoved);
}
}
page.getTextBlocks().removeAll(toBeRemoved);
}
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
// Fix for 211.pdf
for (Ruling r : horizontalRulingLines) {
if (r.getX2() < r.getX1()) {
double a = r.getX2();
r.x2 = (float) r.getX1();
r.x1 = (float) a;
}
if(containedCells.size() <= 2) {
return true;
}
List<Cell> cellsFound = new ArrayList<>();
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
intersectionPointsList.sort(POINT_COMPARATOR);
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
.map(Rectangle::getWidth)
.map(size -> Math.round(size / 10.0) * 10)
.collect(Collectors.groupingBy(Long::longValue));
for (int i = 0; i < intersectionPointsList.size(); i++) {
Point2D topLeft = intersectionPointsList.get(i);
Ruling[] hv = intersectionPoints.get(topLeft);
// CrossingPointsDirectlyBelow( topLeft );
List<Point2D> xPoints = new ArrayList<>();
// CrossingPointsDirectlyToTheRight( topLeft );
List<Point2D> yPoints = new ArrayList<>();
for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
xPoints.add(p);
}
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
yPoints.add(p);
}
}
outer:
for (Point2D xPoint : xPoints) {
// is there a vertical edge b/w topLeft and xPoint?
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
continue;
}
for (Point2D yPoint : yPoints) {
// is there an horizontal edge b/w topLeft and yPoint ?
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
continue;
}
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
intersectionPoints.get(yPoint)[1])) {
cellsFound.add(new Cell(topLeft, btmRight));
break outer;
}
}
}
}
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
// that aren't connected with an horizontal ruler?
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
return cellsFound;
return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD;
}
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List<Rectangle> rectangles = new ArrayList<>();
Set<Point2D> pointSet = new HashSet<>();
Map<Point2D, Point2D> edgesH = new HashMap<>();
Map<Point2D, Point2D> edgesV = new HashMap<>();
int i = 0;
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
for (Rectangle cell : cells) {
for (Point2D pt : cell.getPoints()) {
if (pointSet.contains(pt)) { // shared vertex, remove it
pointSet.remove(pt);
} else {
pointSet.add(pt);
}
}
double x = textBlock.getPdfMinX();
double y = textBlock.getPdfMinY();
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
if (cell.isEmpty() || w <= 0 || h <= 0) {
return false;
}
// X first sort
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
// Y first sort
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
pointsSortY.sort(POINT_COMPARATOR);
while (i < pointSet.size()) {
float currY = (float) pointsSortY.get(i).getY();
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
i += 2;
}
}
i = 0;
while (i < pointSet.size()) {
float currX = (float) pointsSortX.get(i).getX();
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
i += 2;
}
}
// Get all the polygons
List<List<PolygonVertex>> polygons = new ArrayList<>();
Point2D nextVertex;
while (!edgesH.isEmpty()) {
ArrayList<PolygonVertex> polygon = new ArrayList<>();
Point2D first = edgesH.keySet().iterator().next();
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
edgesH.remove(first);
while (true) {
PolygonVertex curr = polygon.get(polygon.size() - 1);
PolygonVertex lastAddedVertex;
if (curr.direction == Direction.HORIZONTAL) {
nextVertex = edgesV.get(curr.point);
edgesV.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
} else {
nextVertex = edgesH.get(curr.point);
edgesH.remove(curr.point);
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
}
polygon.add(lastAddedVertex);
if (lastAddedVertex.equals(polygon.get(0))) {
// closed polygon
polygon.remove(polygon.size() - 1);
break;
}
}
for (PolygonVertex vertex : polygon) {
edgesH.remove(vertex.point);
edgesV.remove(vertex.point);
}
polygons.add(polygon);
}
// calculate grid-aligned minimum area rectangles for each found polygon
for (List<PolygonVertex> poly : polygons) {
float top = Float.MAX_VALUE;
float left = Float.MAX_VALUE;
float bottom = Float.MIN_VALUE;
float right = Float.MIN_VALUE;
for (PolygonVertex pt : poly) {
top = (float) Math.min(top, pt.point.getY());
left = (float) Math.min(left, pt.point.getX());
bottom = (float) Math.max(bottom, pt.point.getY());
right = (float) Math.max(right, pt.point.getX());
}
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
}
return rectangles;
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE
&& (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
}
private enum Direction {
HORIZONTAL,
VERTICAL
}
static class PolygonVertex {
Point2D point;
Direction direction;
PolygonVertex(Point2D point, Direction direction) {
this.direction = direction;
this.point = point;
}
@Override
public boolean equals(Object other) {
if (this == other) {
return true;
}
if (!(other instanceof PolygonVertex)) {
return false;
}
return this.point.equals(((PolygonVertex) other).point);
}
@Override
public int hashCode() {
return this.point.hashCode();
}
@Override
public String toString() {
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
}
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
.stream()
.map(Cell::new)
.collect(Collectors.toList());
}
}

View File

@ -61,8 +61,8 @@ public class DocuMineBlockificationService {
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
@ -13,10 +14,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
@SuppressWarnings("all")
@ -31,12 +35,12 @@ public class RedactManagerBlockificationService {
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @param textPositions The words of a page.
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> emptyCells) {
CleanRulings usedRulings = RectangleTransformations.extractRulings(emptyCells);
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
@ -54,7 +58,7 @@ public class RedactManagerBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {

View File

@ -5,7 +5,6 @@ import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -23,7 +23,7 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DocuMineClassificationService {
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
@ -64,46 +64,54 @@ public class DocuMineClassificationService {
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
} else if (textBlock.getText().length() > 5
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
.contains(":")
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|| textBlock.toString().startsWith("APPENDIX")
|| textBlock.toString().startsWith("FIGURE")
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& matcher2.reset().find()) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("bold")
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
&& textBlock.getMostPopularWordStyle().equals("italic")
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);

View File

@ -18,8 +18,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
@ -31,6 +29,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@ -49,9 +49,14 @@ public class DocumentGraphFactory {
Document documentGraph = new Document();
Context context = new Context(documentGraph);
document.getPages().forEach(context::buildAndAddPageWithCounter);
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
addSections(document, context);
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
document.getSections()
.stream()
.flatMap(section -> section.getImages()
.stream())
.forEach(image -> context.getImages().add(image));
addSections(document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size());
@ -62,9 +67,10 @@ public class DocumentGraphFactory {
}
private void addSections(ClassificationDocument document, Context context) {
private void addSections(ClassificationDocument classificationDocument, Context context, Document document) {
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context));
classificationDocument.getSections()
.forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
}
@ -74,9 +80,11 @@ public class DocumentGraphFactory {
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
node = Headline.builder().documentTree(context.getDocumentTree())
.build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
node = Paragraph.builder().documentTree(context.getDocumentTree())
.build();
}
page.getMainBody().add(node);
@ -91,7 +99,16 @@ public class DocumentGraphFactory {
}
public void addImage(Section section, ClassifiedImage image, Context context) {
public void addImage(GenericSemanticNode parent, ClassifiedImage image, Context context) {
Image imageNode = createImage(image, context);
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parent, imageNode);
imageNode.setTreeId(treeId);
imageNode.setLeafTextBlock(context.textBlockFactory.emptyTextBlock(parent, context, context.getPage(image.getPage())));
}
private Image createImage(ClassifiedImage image, Context context) {
Rectangle2D position = image.getPosition();
Page page = context.getPage(image.getPage());
@ -104,9 +121,7 @@ public class DocumentGraphFactory {
.documentTree(context.getDocumentTree())
.build();
page.getMainBody().add(imageNode);
List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
imageNode.setTreeId(tocId);
return imageNode;
}
@ -146,10 +161,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
footer,
context,
page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -161,7 +173,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
header.setLeafTextBlock(textBlock);
@ -172,7 +184,8 @@ public class DocumentGraphFactory {
private void addEmptyFooter(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
@ -184,7 +197,8 @@ public class DocumentGraphFactory {
private void addEmptyHeader(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);

View File

@ -11,6 +11,7 @@ import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -24,27 +25,46 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
public void addSection(GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
// This is for the case where we have images on a page without any text/footer/header.
// The pageBlocks list is empty, but we still need to add those images to the document.
if (!images.isEmpty() && pageBlocks.isEmpty()) {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
return;
}
if (pageBlocks.isEmpty()) {
return;
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree())
.build();
context.getSections().add(section);
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
blocksPerPage.keySet()
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
section.setTreeId(getTreeId(parentNode, context, section));
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
addFirstHeadlineDirectlyToSection(pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context, document));
} else {
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section, document);
}
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
}
@ -58,16 +78,16 @@ public class SectionNodeFactory {
}
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
if (pageBlocks.get(0).isHeadline()) {
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section, document);
pageBlocks.remove(0);
}
}
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
@ -86,7 +106,7 @@ public class SectionNodeFactory {
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
alreadyMerged.addAll(tablesToMerge);
TableNodeFactory.addTable(section, tablesToMerge, context);
TableNodeFactory.addTable(section, tablesToMerge, context, document);
} else {
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
}
@ -96,7 +116,9 @@ public class SectionNodeFactory {
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
return pageBlocks.stream()
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream()
.anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
}
@ -112,7 +134,9 @@ public class SectionNodeFactory {
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
movePrecedingHeadlineToTableList(splitList);
return splitList.stream().filter(list -> !list.isEmpty()).toList();
return splitList.stream()
.filter(list -> !list.isEmpty())
.toList();
}
@ -133,7 +157,8 @@ public class SectionNodeFactory {
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
return abstractPageBlocks.stream()
.allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
}

View File

@ -8,6 +8,7 @@ import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -27,23 +28,26 @@ public class TableNodeFactory {
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context, Document document) {
setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
Set<Page> pages = tablesToMerge.stream()
.map(AbstractPageBlock::getPage)
.map(context::getPage)
.collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream()
.map(TablePageBlock::getRows)
.flatMap(Collection::stream)
.toList();
Table table = Table.builder()
.documentTree(context.getDocumentTree())
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
.numberOfRows(mergedRows.size())
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
.build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(mergedRows, table, context);
addTableCells(mergedRows, table, context, document);
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}
@ -63,7 +67,8 @@ public class TableNodeFactory {
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
cell.getTextBlocks().stream()//
cell.getTextBlocks()
.stream()//
.filter(tb -> tb.getPage() == 0)//
.forEach(tb -> tb.setPage(table.getPage()));
}
@ -82,28 +87,32 @@ public class TableNodeFactory {
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders().findAny().isEmpty()) {
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
if (table.streamHeaders()
.findAny().isEmpty()) {
table.streamRow(0)
.forEach(tableCellNode -> tableCellNode.setHeader(true));
}
}
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
addTableCell(rows.get(rowIndex)
.get(colIndex), rowIndex, colIndex, table, context, document);
}
}
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context, Document document) {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
.build();
page.getMainBody().add(tableCell);
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
@ -113,16 +122,26 @@ public class TableNodeFactory {
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
textBlock = context.getTextBlockFactory()
.buildAtomicTextBlock(cell.getTextBlocks()
.get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
SectionNodeFactory.addSection(tableCell,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.toList(),
emptyList(),
context,
document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
cell.getTextBlocks()
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
}
}
@ -135,7 +154,8 @@ public class TableNodeFactory {
private boolean firstTextBlockIsHeadline(Cell cell) {
return cell.getTextBlocks().get(0).isHeadline();
return cell.getTextBlocks()
.get(0).isHeadline();
}
}

View File

@ -110,11 +110,13 @@ public class LayoutGridService {
return;
}
for (Page page : table.getPages()) {
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
if (optionalFirstRowOnPage.isEmpty()) {
continue;
}
int firstRowOnPage = optionalFirstRowOnPage.get();
Stream<Double> xStream = switch (page.getRotation()) {
case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
@ -123,6 +125,7 @@ public class LayoutGridService {
};
List<Double> xs = xStream.collect(Collectors.toList());
xs.remove(0);
Stream<Double> yStream = switch (page.getRotation()) {
case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
@ -132,7 +135,7 @@ public class LayoutGridService {
List<Double> ys = yStream.collect(Collectors.toList());
ys.remove(0);
Rectangle2D tableBBox = table.getBBox().get(table.getFirstPage());
Rectangle2D tableBBox = table.getBBox().get(page);
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines();
xs.forEach(x -> {
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
@ -188,14 +191,33 @@ public class LayoutGridService {
@SneakyThrows
private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) {
Point2D.Float upperLeftCorner = switch (page.getRotation()) {
case 90 -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMinY());
case 180 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMinY());
case 270 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMaxY());
default -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMaxY());
};
// translates text, such that its right edge is a bit to the left of the drawn box
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4);
Point2D upperLeftCorner;
Point2D translationVector;
switch (page.getRotation()) {
case 90 -> {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
translationVector = new Point2D.Double(FONT_SIZE, -translationAmount);
}
case 180 -> {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
translationVector = new Point2D.Double(translationAmount, FONT_SIZE);
}
case 270 -> {
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
translationVector = new Point2D.Double(-FONT_SIZE, translationAmount);
}
default -> {
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE);
}
}
upperLeftCorner = add(upperLeftCorner, translationVector);
var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts();
upperLeftCorner.setLocation(upperLeftCorner.getX() - ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), upperLeftCorner.getY() - FONT_SIZE);
placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT));
}
@ -317,4 +339,10 @@ public class LayoutGridService {
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
}
/**
 * Returns a new point that is the component-wise vector sum of {@code a} and {@code b}.
 * Neither argument is mutated.
 */
private Point2D add(Point2D a, Point2D b) {
    double sumX = a.getX() + b.getX();
    double sumY = a.getY() + b.getY();
    return new Point2D.Double(sumX, sumY);
}
}

View File

@ -0,0 +1,28 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.Color;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Immutable bundle of stroke/fill settings used by the PDF visualisation utilities
 * (see {@code PdfVisualisationUtility}, which takes this as its options parameter).
 * <p>
 * All fields are made {@code private final} via {@code @FieldDefaults}; instances are
 * created through the Lombok-generated builder or all-args constructor. Colors and the
 * stroke width fall back to the {@code @Builder.Default} values when not set explicitly.
 */
@Builder
@AllArgsConstructor
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DrawingOptions {
    // whether the shape outline should be stroked at all
    boolean stroke;
    // outline color; only relevant when stroke == true
    @Builder.Default
    Color strokeColor = Color.BLACK;
    // outline width in PDF user-space units; only relevant when stroke == true
    @Builder.Default
    float strokeWidth = 1f;
    // whether the shape interior should be filled at all
    boolean fill;
    // interior color; only relevant when fill == true
    @Builder.Default
    Color fillColor = Color.BLACK;
}

View File

@ -0,0 +1,88 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Point2D;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
/**
 * Shared geometric {@link Comparator} instances for points, rulings, and table shapes.
 * <p>
 * Coordinate-based comparators round each coordinate to {@link #COMPARATOR_ROUNDING}
 * decimal places (via {@code DoubleComparisons.round}) before comparing, so values that
 * differ only by sub-hundredth noise are treated as equal. Size comparators order by
 * raw {@code height * width} area without rounding.
 * <p>
 * Utility holder: not instantiable, not extendable.
 */
public final class GeometricComparators {

    // Decimal places coordinates are snapped to before comparison, to absorb
    // floating-point noise from PDF geometry extraction.
    private static final int COMPARATOR_ROUNDING = 2;

    /** Orders points by rounded x, breaking ties by rounded y. */
    public static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR =
            Comparator.comparing((Point2D p) -> DoubleComparisons.round(p.getX(), COMPARATOR_ROUNDING))
                    .thenComparing(p -> DoubleComparisons.round(p.getY(), COMPARATOR_ROUNDING));

    /** Orders points by rounded y, breaking ties by rounded x. */
    public static final Comparator<Point2D> Y_FIRST_POINT_COMPARATOR =
            Comparator.comparing((Point2D p) -> DoubleComparisons.round(p.getY(), COMPARATOR_ROUNDING))
                    .thenComparing(p -> DoubleComparisons.round(p.getX(), COMPARATOR_ROUNDING));

    /** Orders cells by area (height * width), smallest first. */
    public static final Comparator<Cell> CELL_SIZE_COMPARATOR =
            Comparator.comparingDouble(cell -> cell.getHeight() * cell.getWidth());

    /** Orders rectangles by area (height * width), smallest first. */
    public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR =
            Comparator.comparingDouble(rect -> rect.getHeight() * rect.getWidth());

    /**
     * Orders rulings by the rounded minimum x of their endpoints, breaking ties by the
     * rounded minimum y — i.e. by the "upper-left-most" end of the line.
     */
    public static final Comparator<Ruling> X_FIRST_RULING_COMPARATOR =
            Comparator.comparing(
                            (Ruling r) -> DoubleComparisons.round(Math.min(r.getLeft(), r.getRight()), COMPARATOR_ROUNDING))
                    .thenComparing(r -> DoubleComparisons.round(Math.min(r.getTop(), r.getBottom()), COMPARATOR_ROUNDING));

    // Utility class: prevent instantiation.
    private GeometricComparators() {
    }
}

View File

@ -21,11 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@ -41,20 +37,20 @@ public class PdfVisualisationUtility {
public void drawNode(PDDocument document, DocumentTree.Entry entry) {
Options options = buildStandardOptionsForNodes(entry);
DrawingOptions options = buildStandardOptionsForNodes(entry);
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
}
public void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
public void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) {
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
}
public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) {
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
@ -62,7 +58,7 @@ public class PdfVisualisationUtility {
@SneakyThrows
public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options) {
public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options) {
var pdPage = document.getPage(pageNumber - 1);
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
@ -80,14 +76,14 @@ public class PdfVisualisationUtility {
@SneakyThrows
public void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
public void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, DrawingOptions options) {
var pdPage = document.getPage(pageNumber - 1);
drawRectangle2DList(document, rectCollection, options, pdPage);
}
private void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
private void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, DrawingOptions options, PDPage pdPage) throws IOException {
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
@ -110,9 +106,9 @@ public class PdfVisualisationUtility {
}
private Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
private DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) {
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
@ -125,7 +121,7 @@ public class PdfVisualisationUtility {
}
private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) {
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
rectanglesPerPage.forEach((page, rectangle2D) -> {
@ -152,7 +148,7 @@ public class PdfVisualisationUtility {
@SneakyThrows
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, Options options) {
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, DrawingOptions options) {
var pdPage = pdDocument.getPage(pageNumber - 1);
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
@ -176,21 +172,4 @@ public class PdfVisualisationUtility {
contentStream.close();
}
@Builder
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public static class Options {
boolean fill;
boolean stroke;
@Builder.Default
Color strokeColor = Color.BLACK;
@Builder.Default
float strokeWidth = 1f;
@Builder.Default
Color fillColor = Color.BLACK;
}
}

View File

@ -2,8 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
@ -19,6 +22,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
@ -37,15 +42,28 @@ public class RectangleTransformations {
}
/**
 * Computes the area of the overlap between two rectangles.
 * Intersects the two shapes as {@link Area}s and measures the bounding box of the result;
 * disjoint rectangles yield an empty area and therefore 0.
 *
 * @param r1 first rectangle
 * @param r2 second rectangle
 * @return width * height of the intersection's bounding box, 0 if the rectangles do not overlap
 */
public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
    Area overlap = new Area(r1);
    overlap.intersect(new Area(r2));
    Rectangle2D bounds = overlap.getBounds2D();
    return bounds.getWidth() * bounds.getHeight();
}
/**
 * Computes the bounding box spanning every position rectangle of the given atomic text blocks,
 * by streaming all positions into a {@link Rectangle2DBBoxCollector}.
 */
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
    return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
}
/**
 * Returns a collector that folds a stream of {@link Rectangle2D} into their common bounding box.
 */
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
    return new Rectangle2DBBoxCollector();
}
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
@ -70,6 +88,7 @@ public class RectangleTransformations {
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
}
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
@ -84,6 +103,7 @@ public class RectangleTransformations {
-redactionLogRectangle.getHeight());
}
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
@ -133,7 +153,27 @@ public class RectangleTransformations {
previousRectangle = currentRectangle;
}
}
return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList();
return rectangleListsWithGaps.stream()
.map(RectangleTransformations::rectangle2DBBox)
.toList();
}
/**
 * Builds the four border rulings (top, bottom, left and right edge) of every given rectangle.
 * Underlined or strikethrough text also appears in the raw rulings, but blocks should not be
 * split on those, which is why cells/rectangles are used as the source here instead.
 *
 * @param rectangles rectangles whose borders become rulings
 * @return the collected horizontal and vertical border rulings
 */
public static CleanRulings extractRulings(List<? extends Rectangle2D.Float> rectangles) {
    List<Ruling> horizontalRulings = new ArrayList<>();
    List<Ruling> verticalRulings = new ArrayList<>();
    for (Rectangle2D.Float rect : rectangles) {
        float left = rect.x;
        float right = rect.x + rect.width;
        float top = rect.y;
        float bottom = rect.y + rect.height;
        horizontalRulings.add(new Ruling(new Point2D.Float(left, top), new Point2D.Float(right, top)));
        horizontalRulings.add(new Ruling(new Point2D.Float(left, bottom), new Point2D.Float(right, bottom)));
        verticalRulings.add(new Ruling(new Point2D.Float(left, top), new Point2D.Float(left, bottom)));
        verticalRulings.add(new Ruling(new Point2D.Float(right, top), new Point2D.Float(right, bottom)));
    }
    return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
}

View File

@ -0,0 +1,77 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
/**
 * Finds the axis-aligned rectangles (table cells) formed by the intersections of horizontal
 * and vertical ruling lines. Algorithm adapted from tabula's spreadsheet extraction
 * (see the tabula-extractor reference linked at the bottom of {@link #find}).
 */
public class RectangularIntersectionFinder {

    /**
     * Returns every minimal rectangle whose four corners are ruling intersections and whose
     * edges lie on the given rulings.
     *
     * @param horizontalRulingLines horizontal rulings; NOTE: may be normalised IN PLACE so that x1 <= x2
     * @param verticalRulingLines vertical rulings
     * @return the rectangles found, one per detected cell
     */
    public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
        // Fix for 211.pdf
        // Normalise each horizontal ruling to run left-to-right by swapping its endpoints in place.
        for (Ruling r : horizontalRulingLines) {
            if (r.getX2() < r.getX1()) {
                double a = r.getX2();
                r.x2 = (float) r.getX1();
                r.x1 = (float) a;
            }
        }
        List<Rectangle2D> foundRectangles = new ArrayList<>();
        // Each intersection point maps to the [horizontal, vertical] ruling pair crossing there.
        Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
        List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
        intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
        // Treat every intersection as a candidate top-left corner; search downward and rightward
        // for the first bottom-right intersection that closes a rectangle.
        for (int i = 0; i < intersectionPointsList.size(); i++) {
            Point2D topLeft = intersectionPointsList.get(i);
            Ruling[] hv = intersectionPoints.get(topLeft);
            // CrossingPointsDirectlyBelow( topLeft );
            List<Point2D> xPoints = new ArrayList<>();
            // CrossingPointsDirectlyToTheRight( topLeft );
            List<Point2D> yPoints = new ArrayList<>();
            // NOTE(review): exact == on coordinates — presumably safe because intersection points
            // are computed from the same ruling coordinates; TODO confirm no rounding drift.
            for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
                if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
                    xPoints.add(p);
                }
                if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
                    yPoints.add(p);
                }
            }
            outer:
            for (Point2D xPoint : xPoints) {
                // is there a vertical edge b/w topLeft and xPoint?
                if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
                    continue;
                }
                for (Point2D yPoint : yPoints) {
                    // is there a horizontal edge b/w topLeft and yPoint ?
                    if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
                        continue;
                    }
                    // Candidate bottom-right corner: closes the rectangle only if it exists as an
                    // intersection lying on the same rulings as the two candidate edges.
                    Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
                    if (intersectionPoints.containsKey(btmRight)
                            && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
                            && intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
                        foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
                        break outer;
                    }
                }
            }
        }
        // TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
        // that aren't connected with an horizontal ruler?
        // see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
        return foundRectangles;
    }
}

View File

@ -0,0 +1,172 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_POINT_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
/**
 * Merges sets of adjacent table cells into grid-aligned outer rectangles ("spreadsheets").
 * The outline of each connected cell group is reconstructed from its corner points, and
 * the bounding rectangle of each outline is returned.
 */
public class SpreadsheetFinder {

    // Polygons with more outline vertices than this are rejected as unlikely to be tables.
    private static final int MAX_OUTER_POINT_TOLERANCE = 10;
    // Small margin added around each result so border cells are safely contained.
    private static final float AREA_TOLERANCE = 0.001f;

    /**
     * Finds the bounding rectangles of connected cell groups.
     * via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
     *
     * @param cells the individual table cells
     * @return one slightly-enlarged bounding rectangle per detected cell group
     */
    public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
        List<Rectangle> rectangles = new ArrayList<>();
        Set<Point2D> pointSet = new HashSet<>();
        Map<Point2D, Point2D> edgesH = new HashMap<>();
        Map<Point2D, Point2D> edgesV = new HashMap<>();
        // Corner points shared by an even number of cells cancel out; only outline vertices remain.
        for (Rectangle cell : cells) {
            for (Point2D pt : cell.getPoints()) {
                if (pointSet.contains(pt)) { // shared vertex, remove it
                    pointSet.remove(pt);
                } else {
                    pointSet.add(pt);
                }
            }
        }
        // X first sort
        List<Point2D> pointsSortX = new ArrayList<>(pointSet);
        pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
        // Y first sort
        List<Point2D> pointsSortY = new ArrayList<>(pointSet);
        pointsSortY.sort(Y_FIRST_POINT_COMPARATOR);
        // Pair consecutive points that lie on the same horizontal line into horizontal edges
        // (assumes an even number of outline points per line).
        int i = 0;
        while (i < pointSet.size()) {
            float currY = (float) pointsSortY.get(i).getY();
            while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
                edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
                edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
                i += 2;
            }
        }
        // Same pairing for points on the same vertical line, yielding vertical edges.
        i = 0;
        while (i < pointSet.size()) {
            float currX = (float) pointsSortX.get(i).getX();
            while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
                edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
                edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
                i += 2;
            }
        }
        // Get all the polygons: walk the edge maps, alternating horizontal and vertical steps,
        // until each outline closes back on its starting vertex.
        List<List<PolygonVertex>> polygons = new ArrayList<>();
        Point2D nextVertex;
        while (!edgesH.isEmpty()) {
            ArrayList<PolygonVertex> polygon = new ArrayList<>();
            Point2D first = edgesH.keySet()
                    .iterator().next();
            polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
            edgesH.remove(first);
            while (true) {
                PolygonVertex curr = polygon.get(polygon.size() - 1);
                PolygonVertex lastAddedVertex;
                if (curr.direction == Direction.HORIZONTAL) {
                    nextVertex = edgesV.get(curr.point);
                    edgesV.remove(curr.point);
                    lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
                } else {
                    nextVertex = edgesH.get(curr.point);
                    edgesH.remove(curr.point);
                    lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
                }
                polygon.add(lastAddedVertex);
                if (lastAddedVertex.equals(polygon.get(0))) {
                    // closed polygon
                    polygon.remove(polygon.size() - 1);
                    break;
                }
            }
            // Remove any remaining edge entries that reference the consumed vertices.
            for (PolygonVertex vertex : polygon) {
                edgesH.remove(vertex.point);
                edgesV.remove(vertex.point);
            }
            polygons.add(polygon);
        }
        // calculate grid-aligned minimum area rectangles for each found polygon
        for (List<PolygonVertex> poly : polygons) {
            float top = Float.MAX_VALUE;
            float left = Float.MAX_VALUE;
            // BUGFIX: previously initialised to Float.MIN_VALUE, which is the smallest POSITIVE
            // float — the max-accumulation below would then fail for zero or negative coordinates.
            float bottom = -Float.MAX_VALUE;
            float right = -Float.MAX_VALUE;
            for (PolygonVertex pt : poly) {
                top = (float) Math.min(top, pt.point.getY());
                left = (float) Math.min(left, pt.point.getX());
                bottom = (float) Math.max(bottom, pt.point.getY());
                right = (float) Math.max(right, pt.point.getX());
            }
            // do not add polygons with too many outer points as they are unlikely to be tables
            if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
                rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
            }
        }
        return rectangles;
    }

    private enum Direction {
        HORIZONTAL,
        VERTICAL
    }

    /**
     * Vertex of a rectilinear polygon outline. Equality and hash are based on the point only
     * (not the direction) so a closed walk can detect that it reached its starting vertex.
     */
    static class PolygonVertex {

        Point2D point;
        Direction direction;

        PolygonVertex(Point2D point, Direction direction) {
            this.direction = direction;
            this.point = point;
        }

        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof PolygonVertex)) {
                return false;
            }
            return this.point.equals(((PolygonVertex) other).point);
        }

        @Override
        public int hashCode() {
            return this.point.hashCode();
        }

        @Override
        public String toString() {
            return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
        }
    }
}

View File

@ -30,8 +30,6 @@ public class TableMergingUtility {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
consecutiveTable)) {
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
} else {
break;
}
}
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();

View File

@ -23,4 +23,10 @@ public class TextPositionOperations {
return sequence;
}
/**
 * Flattens the sequences of all given text blocks into a single mutable list,
 * preserving block order and the sequence order within each block.
 */
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
    return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
}
}

View File

@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
*
* @author Ben Litchfield
*/
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
{
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
@Override
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
{
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
// only compare text that is in the same direction
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
if (cmp1 != 0)
{
if (cmp1 != 0) {
return cmp1;
}
@ -48,25 +46,19 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
float pos2YBottom = pos2.getMaxYDirAdj();
// note that the coordinates have been adjusted so 0,0 is in upper left
float pos1YTop = pos1YBottom - pos1.getTextHeight();
float pos2YTop = pos2YBottom - pos2.getTextHeight();
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
// we will do a simple tolerance comparison
if (yDifference < .1 ||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
{
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
return Float.compare(x1, x2);
}
else if (pos1YBottom < pos2YBottom)
{
} else if (pos1YBottom < pos2YBottom) {
return -1;
}
else
{
} else {
return 1;
}
}
}

View File

@ -0,0 +1,44 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.HashMap;
import java.util.Map;
// Simple disjoint-set (union-find) with path compression and union by size.
// https://en.wikipedia.org/wiki/Disjoint-set_data_structure
public class UnionFind<T> {

    Map<T, T> parents = new HashMap<>();
    Map<T, Integer> numberOfObjects = new HashMap<>();

    /**
     * Returns the representative (root) of the set containing {@code node}.
     * An unseen node is registered as its own singleton set. The walked path is
     * compressed so that subsequent lookups go straight to the root.
     */
    public T find(T node) {
        // First sighting: the node becomes a singleton set of size one.
        if (parents.putIfAbsent(node, node) == null) {
            numberOfObjects.put(node, 1);
        }
        // Walk up the parent chain to the root.
        T root = parents.get(node);
        while (!root.equals(parents.get(root))) {
            root = parents.get(root);
        }
        // Second pass: point every node on the walked path directly at the root.
        T cursor = node;
        while (!cursor.equals(root)) {
            T next = parents.get(cursor);
            parents.put(cursor, root);
            cursor = next;
        }
        return root;
    }

    /**
     * Merges the sets containing {@code node1} and {@code node2}.
     * The smaller set's root is attached under the larger set's root (ties keep node1's root).
     */
    public void union(T node1, T node2) {
        T rootA = find(node1);
        T rootB = find(node2);
        if (rootA.equals(rootB)) {
            return;
        }
        boolean firstIsSmaller = numberOfObjects.getOrDefault(rootA, 1) < numberOfObjects.getOrDefault(rootB, 1);
        T keptRoot = firstIsSmaller ? rootB : rootA;
        T mergedRoot = firstIsSmaller ? rootA : rootB;
        parents.put(mergedRoot, keptRoot);
        numberOfObjects.put(keptRoot, numberOfObjects.get(keptRoot) + numberOfObjects.get(mergedRoot));
    }
}

View File

@ -29,6 +29,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -111,7 +112,7 @@ public class BdrJsonBuildTest extends AbstractTest {
try (PDDocument pdDocument = Loader.loadPDF(file); var outputStream = new FileOutputStream(resultingFileName)) {
PdfDraw.drawDocumentGraph(pdDocument, document);
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
pdDocument.save(outputStream);
}
}

View File

@ -28,7 +28,20 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info);
}
@Test
@SneakyThrows
public void testLayoutParserEndToEnd_RED_8747() {
prepareStorage("files/SinglePages/MergedEntities.pdf");
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
Arrays.stream(finishedEvent.message().split("\n"))
.forEach(log::info);
}
}

View File

@ -13,6 +13,7 @@ import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -70,7 +71,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest {
try (PDDocument pdDocument = Loader.loadPDF(fileResource.getFile())) {
log.info("drawing document");
PdfDraw.drawDocumentGraph(pdDocument, documentGraph);
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
log.info("saving document");
pdDocument.save(tmpFile);
log.info("saved document");

View File

@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
long start = System.currentTimeMillis();
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}
@Test
@Disabled
@SneakyThrows
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
var documentFile = new ClassPathResource(fileName).getFile();
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
documentFile,
new ImageServiceResponse(),
tableResponse,
Path.of(fileName).getFileName().toFile().toString());
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
@ -60,3 +65,4 @@ public class ViewerDocumentTest extends BuildDocumentTest {
}
}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.server.segmentation;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.awt.geom.Rectangle2D;
import java.io.File;
@ -25,16 +26,20 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows;
@ -50,12 +55,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Autowired
private RedactManagerClassificationService redactManagerClassificationService;
@Autowired
private CvTableParsingAdapter cvTableParsingAdapter;
@Autowired
private ImageServiceResponseAdapter imageServiceResponseAdapter;
@Autowired
private SectionsBuilderService sectionsBuilderService;
@ -64,10 +63,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
"document");
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
"document");
redactManagerClassificationService.classifyDocument(classificationDocument);
@ -87,11 +86,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@Test
public void tablesToHtmlDebugger() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
toHtml(document, "/tmp/T5.html");
}
@ -109,6 +108,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
@ -117,8 +117,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
// Quality of the table parsing is not good, because the file is rotated at scanning.
// We only asset that the table border is not the page border.
@ -140,12 +148,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
imageServiceResponse.getData()
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()),
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
imageMetadata.isAlpha(),
imageMetadata.getPosition().getPageNumber())));
System.out.println("object");
}
@ -157,11 +165,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
assertThat(table.getRows()
.stream()
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
}
@ -171,15 +190,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
}
@ -189,15 +230,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
}
@ -207,19 +270,41 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
assertThat(document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.collect(Collectors.toList())).isNotEmpty();
TablePageBlock firstTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
TablePageBlock secondTable = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
assertThat(secondTable.getRows()
.stream()
.allMatch(row -> row.stream()
.map(Cell::getHeaderCells)
.toList().equals(firstTableHeaderCells))).isTrue();
}
@Test // Non-sense test
@Test
public void testDoc56Page170() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
@ -230,8 +315,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 2, 2, 0, 0);
validateTable(document, 2, 6, 20, 0, 0);
validateTable(document, 3, 7, 31, 0, 0);
validateTable(document, 2, 4, 19, 12, 0);
validateTable(document, 3, 2, 12, 0, 0);
}
@ -265,29 +350,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 8, 8, 0, 0);
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
"Author, date",
"Study title",
"Analytical method Author, date, No.",
"Technique, LOQ of the method, validated working range",
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList(
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
"Y"));
validateTable(document, 0, values);
@ -579,10 +665,156 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
@Test
public void testT0() throws IOException {
    // One table containing merged cells: expect a single 6x8 table with no empty cells.
    var pdf = new ClassPathResource("files/SinglePages/T0 TableWithMergedCells.pdf").getFile();
    ClassificationDocument document = buildClassificationDocument(pdf);
    validateTableSize(document, 1);
    validateTable(document, 0, 6, 8, 0, 0);
}
@Test
public void testT1() throws IOException {
    // Multiple nested tables: four tables are detected; the nested ones legitimately
    // contain correctly-empty cells (3rd and 4th validateTable args are col/row counts,
    // 5th/6th are expected correct/incorrect empty-cell counts).
    var pdf = new ClassPathResource("files/SinglePages/T1 MultipleNestedTable.pdf").getFile();
    ClassificationDocument document = buildClassificationDocument(pdf);
    validateTableSize(document, 4);
    validateTable(document, 0, 3, 3, 0, 0);
    validateTable(document, 1, 3, 6, 2, 0);
    validateTable(document, 2, 3, 3, 1, 0);
    validateTable(document, 3, 3, 3, 0, 0);
}
@Test
public void testT2() throws IOException {
    // Six independent tables on one page; all 5 columns wide, one with an extra row.
    var pdf = new ClassPathResource("files/SinglePages/T2 MultipleTables.pdf").getFile();
    ClassificationDocument document = buildClassificationDocument(pdf);
    validateTableSize(document, 6);
    validateTable(document, 0, 5, 5, 0, 0);
    validateTable(document, 1, 5, 6, 0, 0);
    validateTable(document, 2, 5, 5, 0, 0);
    validateTable(document, 3, 5, 5, 0, 0);
    validateTable(document, 4, 5, 5, 0, 0);
    validateTable(document, 5, 5, 5, 0, 0);
}
@Test
public void testT3() throws IOException {
    // Single 6x5 table, no empty cells expected.
    var pdf = new ClassPathResource("files/SinglePages/T3 S-Meto_Page29.pdf").getFile();
    ClassificationDocument document = buildClassificationDocument(pdf);
    validateTableSize(document, 1);
    validateTable(document, 0, 6, 5, 0, 0);
}
@Test
public void testT4() throws IOException {
    // Single 5x8 table with exactly one legitimately empty cell.
    var pdf = new ClassPathResource("files/SinglePages/T4 138 IDD0000261736_Page16.pdf").getFile();
    ClassificationDocument document = buildClassificationDocument(pdf);
    validateTableSize(document, 1);
    validateTable(document, 0, 5, 8, 1, 0);
}
@Test
public void testT5() throws IOException {
    // Five separate single-cell (1x1) tables must each be detected individually.
    var pdf = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf").getFile();
    ClassificationDocument document = buildClassificationDocument(pdf);
    validateTableSize(document, 5);
    for (int tableIndex = 0; tableIndex < 5; tableIndex++) {
        validateTable(document, tableIndex, 1, 1, 0, 0);
    }
}
@Test
public void testMergedEntities_Page26() throws IOException {
    // Table with merged entities: one 6x6 table, five cells expected (correctly) empty.
    var pdf = new ClassPathResource("files/SinglePages/MergedEntities.pdf").getFile();
    ClassificationDocument document = buildClassificationDocument(pdf);
    validateTableSize(document, 1);
    validateTable(document, 0, 6, 6, 5, 0);
}
@Test
public void testHeaderAndFooter() throws IOException {
    // The header text spans multiple lines in the PDF, so the raw sorted text positions
    // never contain the joined header string; only the classified header text block does.
    String fileName = "files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
    String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
    ClassPathResource pdfFileResource = new ClassPathResource(fileName);

    String joinedPositions = PageContentExtractor.getSortedPageContents(fileName)
            .stream()
            .flatMap(page -> page.getSortedTextPositionSequences()
                    .stream()
                    .map(TextPositionSequence::toString))
            .collect(Collectors.joining(" "));
    assertThat(joinedPositions).doesNotContain(textToSearch);

    ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
    var firstHeaderBlocks = classificationDocument.getHeaders().get(0).getTextBlocks();
    assertThat(firstHeaderBlocks).hasSize(3);
    assertThat(firstHeaderBlocks.get(0).getSequences()).hasSize(8);
    assertThat(firstHeaderBlocks.get(0).toString()).isEqualTo(textToSearch);

    // The document graph built from the classification must expose the same header text.
    Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
    TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
    assertTrue(leafTextBlock.getSearchText().contains(textToSearch));
}
@SneakyThrows
private void toHtml(ClassificationDocument document, String filename) {
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
var tables = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList();
StringBuilder sb = new StringBuilder();
int currentPage = 1;
@ -603,9 +835,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
int emptyCellsFoundFound = rows.stream()
.flatMap(List::stream)
.toList()
.stream()
.filter(f -> f.toString().isEmpty())
.toList().size();
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
@ -620,11 +862,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
TablePageBlock table = document.getSections()
.stream()
.flatMap(paragraph -> paragraph.getTables()
.stream())
.toList()
.get(tableIndex);
List<List<Cell>> rows = table.getRows();
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
List<Cell> rowsFlattened = rows.stream()
.flatMap(List::stream)
.toList();
List<String> valuesFlattened = values.stream()
.flatMap(List::stream)
.toList();
for (int i = 0; i < valuesFlattened.size(); i++) {
Cell cell = rowsFlattened.get(i);
@ -637,7 +888,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
/**
 * Asserts that the document contains exactly {@code tableSize} tables across all sections.
 * Fix: the block contained the same assertion twice (diff residue of an old one-line and
 * a new multi-line version); the duplicate is removed and AssertJ's hasSize is used.
 */
private void validateTableSize(ClassificationDocument document, int tableSize) {
    assertThat(document.getSections()
            .stream()
            .flatMap(paragraph -> paragraph.getTables()
                    .stream())
            .toList()).hasSize(tableSize);
}

View File

@ -1,13 +1,17 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
@ -26,29 +30,50 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import lombok.SneakyThrows;
public class RulingCleaningServiceTest extends BuildDocumentTest {
@Test
@Disabled
@SneakyThrows
public void textRectanglesFromRulingsExtraction() {
    // Manual visualization helper (hence @Disabled): derives cell rectangles from the
    // cleaned rulings of each page and strokes them in red into a /tmp PDF for inspection.
    String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
    String lineFileName = "/tmp/" + Path.of(fileName).getFileName() + "_CELLS.pdf";
    List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
    RulingCleaningService rulingCleaningService = new RulingCleaningService();
    List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
    for (PageContents pageContent : pageContents) {
        CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
        rectanglesPerPage.add(RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical()));
    }
    PdfDraw.drawRectanglesPerPage(fileName, rectanglesPerPage, lineFileName, DrawingOptions.builder().stroke(true).strokeColor(Color.RED).build());
}
@Test
@Disabled
@SneakyThrows
public void textRulingExtraction() {
    // Manual visualization helper (hence @Disabled): draws rulings into a /tmp PDF.
    // Fix: the block declared fileName twice (old and new diff lines both present),
    // which does not compile; the newer value is kept.
    String fileName = "files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf";
    String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
    List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
    RulingCleaningService rulingCleaningService = new RulingCleaningService();
    // NOTE(review): the raw-rulings drawing below is overwritten by the second
    // drawLinesPerPage call targeting the same file — presumably intentional for
    // step-by-step debugging; confirm before relying on the first output.
    PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
    List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
    for (PageContents pageContent : pageContents) {
        cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
    }
    var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
    PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
}
@ -57,9 +82,6 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
public void testTableExtraction() {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
ClassPathResource resource = new ClassPathResource("files");
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
@ -67,8 +89,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.map(Path::toString)
.toList();
for (int i = 0; i < pdfFileNames.size(); i++) {
writeJsons(Path.of(pdfFileNames.get(i)));
for (String pdfFileName : pdfFileNames) {
writeJsons(Path.of(pdfFileName));
}
}
@ -88,13 +110,13 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
filename.toFile().toString()));
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString();
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString();
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
pdDocument.save(tmpFileNameBefore);
}
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString();
String tmpFileNameAfter = "/tmp/after." + filename.getFileName().toString();
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
pdDocument.save(tmpFileNameAfter);
@ -105,9 +127,9 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) {
List listStructure1 = structure1.streamAllEntries()
List<Table> listStructure1 = structure1.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
@ -117,7 +139,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
})
.toList();
List listStructure2 = structure2.streamAllEntries()
List<Table> listStructure2 = structure2.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
@ -128,8 +150,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.toList();
for (int i = 0; i < listStructure1.size(); i++) {
Table tableNode1 = (Table) listStructure1.get(i);
Table tableNode2 = (Table) listStructure2.get(i);
Table tableNode1 = listStructure1.get(i);
Table tableNode2 = listStructure2.get(i);
if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
return false;
}

View File

@ -24,20 +24,31 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
@UtilityClass
public class PdfDraw {
/**
 * Strokes the given rectangles onto each page of the classpath PDF {@code filename}
 * using the supplied {@code options} and writes the annotated copy to {@code tmpFileName}.
 * {@code rectanglesPerPage} is indexed by zero-based page index and is assumed to cover
 * every page of the document — TODO confirm callers always pass a full list.
 */
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName, DrawingOptions options) throws IOException {
    ClassPathResource pdfResource = new ClassPathResource(filename);
    try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
        int pageCount = pdDocument.getNumberOfPages();
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesPerPage.get(pageNumber - 1), options);
        }
        pdDocument.save(out);
    }
}
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
ClassPathResource pdfResource = new ClassPathResource(filename);
@ -46,7 +57,7 @@ public class PdfDraw {
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
pageNumber,
rectanglesPerPage.get(pageNumber - 1),
PdfVisualisationUtility.Options.builder().stroke(true).build());
DrawingOptions.builder().stroke(true).build());
}
pdDocument.save(out);
}
@ -62,13 +73,13 @@ public class PdfDraw {
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, DrawingOptions.builder().stroke(true).build());
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
pdDocument,
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
pageNumber,
PdfVisualisationUtility.Options.builder().stroke(true).build());
DrawingOptions.builder().stroke(true).build());
}
}
pdDocument.save(out);
@ -99,20 +110,20 @@ public class PdfDraw {
public static void drawNode(PDDocument document, DocumentTree.Entry entry) {
Options options = buildStandardOptionsForNodes(entry);
DrawingOptions options = buildStandardOptionsForNodes(entry);
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
}
public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
public static void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) {
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
}
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) {
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
@ -120,7 +131,7 @@ public class PdfDraw {
@SneakyThrows
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options, boolean rotate) {
var pdPage = document.getPage(pageNumber - 1);
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
@ -142,14 +153,14 @@ public class PdfDraw {
@SneakyThrows
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, DrawingOptions options) {
var pdPage = document.getPage(pageNumber - 1);
drawRectangle2DList(document, rectCollection, options, pdPage);
}
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, DrawingOptions options, PDPage pdPage) throws IOException {
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
@ -181,12 +192,12 @@ public class PdfDraw {
// PdfVisualisationUtility.drawLine2DList(pdDocument,
// pageNumber,
// list.get(pageNumber - 1),
// PdfVisualisationUtility.Options.builder().stroke(true).build());
// PdfVisualisationUtility.DrawingOptions.builder().stroke(true).build());
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
pageNumber,
rectanglesPerPage.get(pageNumber - 1),
PdfVisualisationUtility.Options.builder().stroke(true).build());
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
DrawingOptions.builder().stroke(true).build());
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), DrawingOptions.builder().stroke(true).build());
}
pdDocument.save(out);
}
@ -202,35 +213,18 @@ public class PdfDraw {
PdfVisualisationUtility.drawLine2DList(pdDocument,
pageNumber,
linesPerPage.get(pageNumber - 1),
PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
DrawingOptions.builder().strokeColor(Color.RED).stroke(true).build());
}
pdDocument.save(out);
}
}
@Builder
@AllArgsConstructor
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public static class Options {
boolean stroke;
@Builder.Default
Color strokeColor = Color.BLACK;
@Builder.Default
float strokeWidth = 1f;
boolean fill;
@Builder.Default
Color fillColor = Color.BLACK;
}
private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
private static DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) {
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
case DOCUMENT -> Color.LIGHT_GRAY;
case HEADER, FOOTER -> Color.GREEN;
case PARAGRAPH -> Color.BLUE;
@ -243,7 +237,7 @@ public class PdfDraw {
}
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) {
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
for (Page page : rectanglesPerPage.keySet()) {

View File

@ -17,6 +17,7 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
@ -126,8 +127,8 @@ public class ViewerDocumentService {
pdDocument = openPDDocument(tmpFile.toFile());
}
}
observedIncrementalSave(pdDocument, destinationFile);
observedIncrementalSave(pdDocument, destinationFile);
pdDocument.close();
assert tmpFile.toFile().delete();
}
@ -282,10 +283,12 @@ public class ViewerDocumentService {
@SneakyThrows
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
/*
 * Sometimes the viewer document is corrupted after saving and is missing the content
 * streams on a random page. For the files we inspected, this did not seem to happen
 * with incrementalSave; it might only be a timing issue, though.
 */
Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> {
try (var out = new FileOutputStream(outputFile)) {
pdDocument.save(out);
pdDocument.save(out, CompressParameters.NO_COMPRESSION);
} catch (IOException e) {
throw new RuntimeException(e);
}