Compare commits
24 Commits
main
...
release/0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c4c71efadd | ||
|
|
5e88cb9a2d | ||
|
|
45ff220d83 | ||
|
|
f4f01644f7 | ||
|
|
9eaecdf378 | ||
|
|
59745a916c | ||
|
|
0dda309829 | ||
|
|
bfa90c2d79 | ||
|
|
37f7a6a03f | ||
|
|
bdbac18169 | ||
|
|
2addf63baf | ||
|
|
778bae0f7f | ||
|
|
a01958c842 | ||
|
|
fbe9a34343 | ||
|
|
fd7c461c8d | ||
|
|
cafbcbefc6 | ||
|
|
34b260bb60 | ||
|
|
1ca02f72c8 | ||
|
|
350513a699 | ||
|
|
ab7b2cf0d5 | ||
|
|
007cbfd1ee | ||
|
|
a266d98f11 | ||
|
|
33f726c689 | ||
|
|
18a28e82d0 |
@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
@ -94,16 +95,23 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
// File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
// .orElse(originFile);
|
||||
File viewerDocumentFile = originFile;
|
||||
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
if (layoutParsingRequest.imagesFileStorageId()
|
||||
.isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
if (layoutParsingRequest.tablesFileStorageId()
|
||||
.isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||
.get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||
@ -142,25 +150,25 @@ public class LayoutParsingPipeline {
|
||||
.numberOfPages(documentGraph.getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.build();
|
||||
|
||||
}
|
||||
@ -170,9 +178,9 @@ public class LayoutParsingPipeline {
|
||||
|
||||
AtomicReference<Document> documentReference = new AtomicReference<>();
|
||||
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> {
|
||||
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument));
|
||||
});
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||
.contextualName("build-document-graph")
|
||||
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
|
||||
|
||||
return documentReference.get();
|
||||
}
|
||||
@ -181,14 +189,14 @@ public class LayoutParsingPipeline {
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@ -243,8 +251,10 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
@ -319,9 +329,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
|
||||
@ -6,12 +6,14 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public abstract class AbstractPageBlock {
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public abstract class AbstractPageBlock extends Rectangle {
|
||||
|
||||
@JsonIgnore
|
||||
protected float minX;
|
||||
|
||||
@ -12,7 +12,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -36,6 +35,8 @@ public class Image implements GenericSemanticNode {
|
||||
boolean transparent;
|
||||
Rectangle2D position;
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
boolean redaction;
|
||||
boolean ignored;
|
||||
@Builder.Default
|
||||
@ -66,7 +67,7 @@ public class Image implements GenericSemanticNode {
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@ -92,4 +93,11 @@ public class Image implements GenericSemanticNode {
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Reports this node as a leaf: the method returns {@code true} unconditionally.
 *
 * @return always {@code true}
 */
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -84,14 +84,16 @@ public class TableCell implements GenericSemanticNode {
|
||||
|
||||
private TextBlock buildTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.buildTextBlock().buildSummary();
|
||||
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
@ -36,6 +37,12 @@ public class Cell extends Rectangle {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a cell from a {@link Rectangle2D}, narrowing its position and size to {@code float}.
 *
 * @param r source rectangle; its y, x, width and height (in that order) are handed to the superclass constructor
 */
public Cell(Rectangle2D r) {
|
||||
|
||||
// NOTE(review): y is passed before x — presumably the Rectangle superclass expects (top, left, width, height); confirm.
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlock(TextPageBlock textBlock) {
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
|
||||
@ -20,7 +20,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@SuppressWarnings("all")
|
||||
public class Ruling extends Line2D.Float {
|
||||
|
||||
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
|
||||
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
|
||||
|
||||
|
||||
public Ruling(Point2D p1, Point2D p2) {
|
||||
@ -110,8 +111,8 @@ public class Ruling extends Line2D.Float {
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
@ -151,7 +152,7 @@ public class Ruling extends Line2D.Float {
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
@ -267,7 +268,7 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
|
||||
|
||||
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
|
||||
public boolean nearlyIntersects(Ruling another) {
|
||||
|
||||
if (this.intersectsLine(another)) {
|
||||
return true;
|
||||
@ -276,9 +277,9 @@ public class Ruling extends Line2D.Float {
|
||||
boolean rv = false;
|
||||
|
||||
if (this.perpendicularTo(another)) {
|
||||
rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
|
||||
rv = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT).intersectsLine(another);
|
||||
} else {
|
||||
rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
|
||||
rv = this.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT).intersectsLine(another.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT));
|
||||
}
|
||||
|
||||
return rv;
|
||||
@ -319,8 +320,8 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
public Point2D intersectionPoint(Ruling other) {
|
||||
|
||||
Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||
Ruling horizontal, vertical;
|
||||
|
||||
if (!this_l.intersectsLine(other_l)) {
|
||||
|
||||
@ -1,18 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
@ -21,7 +21,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
|
||||
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@ -30,10 +31,14 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private List<List<Cell>> rows;
|
||||
@Getter
|
||||
@Setter
|
||||
private List<Cell> cells;
|
||||
|
||||
|
||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||
|
||||
this.cells = cells;
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
@ -50,6 +55,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return getColCount() == 0 || getRowCount() == 0;
|
||||
}
|
||||
|
||||
|
||||
public List<List<Cell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
@ -80,14 +86,17 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
public int getColCount() {
|
||||
|
||||
return getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
return getRows().stream()
|
||||
.mapToInt(List::size)
|
||||
.max()
|
||||
.orElse(0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Detect header cells (either first row or first column):
|
||||
* Column is marked as header if cell text is bold and row cell text is not bold.
|
||||
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
|
||||
* Defaults to row.
|
||||
*/
|
||||
private void computeHeaders() {
|
||||
@ -95,7 +104,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
}
|
||||
// A bold cell is a header cell as long as every cell to the left/top is bold, too
|
||||
// A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
@ -120,7 +129,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
cellsToTheTop.add(rows.get(i)
|
||||
.get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
@ -135,7 +145,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
|
||||
.get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
@ -151,7 +162,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -162,7 +173,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -173,7 +184,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedColCount; j++) {
|
||||
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -187,17 +198,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
private void add(Cell chunk, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cells.put(cp, chunk);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addCells(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
@ -206,11 +206,12 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
|
||||
|
||||
List<List<Cell>> rowsOfCells = calculateStructure(cells);
|
||||
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
|
||||
|
||||
for (int i = 0; i < rowsOfCells.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
|
||||
add(rowsOfCells.get(i).get(j), i, j);
|
||||
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
|
||||
.get(j), i, j);
|
||||
}
|
||||
}
|
||||
|
||||
@ -221,29 +222,36 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
*
|
||||
* @param cells The found cells
|
||||
* @return TablePageBlock Structure
|
||||
* @return TablePageBlock Structure as a rows of cells matrix
|
||||
*/
|
||||
private List<List<Cell>> calculateStructure(List<Cell> cells) {
|
||||
|
||||
List<List<Cell>> matrix = new ArrayList<>();
|
||||
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return matrix;
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
Set<Float> uniqueX = new HashSet<>();
|
||||
Set<Float> uniqueY = new HashSet<>();
|
||||
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
});
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream().sorted().toList();
|
||||
var sortedUniqueY = uniqueY.stream().sorted().toList();
|
||||
var sortedUniqueX = uniqueX.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
var sortedUniqueY = uniqueY.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
|
||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||
|
||||
Float prevY = null;
|
||||
|
||||
for (Float y : sortedUniqueY) {
|
||||
|
||||
List<Cell> row = new ArrayList<>();
|
||||
@ -252,44 +260,87 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (Float x : sortedUniqueX) {
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
|
||||
var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst();
|
||||
if (cellFromGridStructure.hasMinimumSize()) {
|
||||
|
||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||
if (cell.hasMinimumSize()) {
|
||||
row.add(cell);
|
||||
cells.stream()
|
||||
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
|
||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||
.map(CellWithIntersection::originalCell)
|
||||
.ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));
|
||||
|
||||
row.add(cellFromGridStructure);
|
||||
}
|
||||
}
|
||||
prevX = x;
|
||||
}
|
||||
|
||||
if (prevY != null && prevX != null && !row.isEmpty()) {
|
||||
matrix.add(row);
|
||||
// exclude empty rows and rows where all text blocks are empty
|
||||
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
|
||||
rowsOfCells.add(row);
|
||||
}
|
||||
prevY = y;
|
||||
}
|
||||
|
||||
Collections.reverse(matrix);
|
||||
Collections.reverse(rowsOfCells);
|
||||
|
||||
return matrix;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public boolean intersects(Cell cell1, Cell cell2) {
|
||||
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
|
||||
return false;
|
||||
// now cells are removed which are part of a column without any text blocks
|
||||
// this is done by first computing the transposed matrix, which contains all columns of cells
|
||||
// then the column indices that have to be removed are determined
|
||||
List<List<Cell>> columnsOfCells = new ArrayList<>();
|
||||
int maxRowLength = rowsOfCells.stream()
|
||||
.map(List::size)
|
||||
.max(java.util.Comparator.naturalOrder())
|
||||
.orElse(0);
|
||||
for (int i = 0; i < maxRowLength; i++) {
|
||||
columnsOfCells.add(new ArrayList<>());
|
||||
}
|
||||
double x0 = cell1.getX() + 2;
|
||||
double y0 = cell1.getY() + 2;
|
||||
return (cell2.x + cell2.width > x0 &&
|
||||
cell2.y + cell2.height > y0 &&
|
||||
cell2.x < x0 + cell1.getWidth() -2 &&
|
||||
cell2.y < y0 + cell1.getHeight() -2);
|
||||
|
||||
for (List<Cell> row : rowsOfCells) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
columnsOfCells.get(j).add(row.get(j));
|
||||
}
|
||||
}
|
||||
|
||||
List<Integer> columnIndicesToRemove = new ArrayList<>();
|
||||
int columnIndex = 0;
|
||||
for (List<Cell> col : columnsOfCells) {
|
||||
if (col.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
columnIndicesToRemove.add(columnIndex);
|
||||
}
|
||||
columnIndex++;
|
||||
}
|
||||
columnIndicesToRemove.sort(Collections.reverseOrder());
|
||||
|
||||
// update all rows so that the values of the empty columns get removed
|
||||
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
|
||||
rowsOfCells = new ArrayList<>();
|
||||
for (List<Cell> row : rowsOfCellsBefore) {
|
||||
var updatedRow = new ArrayList<>(row);
|
||||
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
|
||||
rowsOfCells.add(updatedRow);
|
||||
}
|
||||
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Registers a cell at the given grid position and grows the unrotated row/column counts to include it.
 *
 * @param cell the cell to store
 * @param row  zero-based row index of the cell
 * @param col  zero-based column index of the cell
 */
private void addCellToRowAndCol(Cell cell, int row, int col) {
|
||||
|
||||
// counts are one past the highest index seen so far
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
// an existing entry at the same position is replaced by the map's put
cellTreeMap.put(cp, cell);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
@ -314,7 +365,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"');
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
@ -328,8 +379,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public String getTextAsHtml() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
@ -363,4 +412,9 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Pairs an original cell with the area it shares with a grid cell.
 * Used in calculateTableStructure to pick the original cell with the largest intersected area.
 *
 * @param originalCell    the cell taken from the input list
 * @param intersectedArea the area returned by RectangleTransformations.calculateIntersectedArea for that cell
 */
record CellWithIntersection(Cell originalCell, double intersectedArea) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
|
||||
@ -50,7 +49,13 @@ public class RedTextPosition {
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
|
||||
var pos = new RedTextPosition();
|
||||
BeanUtils.copyProperties(textPosition, pos);
|
||||
pos.setRotation(textPosition.getRotation());
|
||||
pos.setPageHeight(textPosition.getPageHeight());
|
||||
pos.setPageWidth(textPosition.getPageWidth());
|
||||
pos.setUnicode(textPosition.getUnicode());
|
||||
pos.setDir(textPosition.getDir());
|
||||
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
|
||||
@ -190,6 +190,12 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the directional height of the first text position, without the HEIGHT_PADDING
 * that getTextHeight() adds.
 *
 * @return getHeightDir() of the first element of textPositions
 */
public float getTextHeightNoPadding() {
|
||||
|
||||
// NOTE(review): reads element 0 unconditionally — presumably callers guarantee textPositions is non-empty; confirm.
return textPositions.get(0).getHeightDir();
|
||||
}
|
||||
|
||||
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||
|
||||
@ -1,21 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -25,25 +25,145 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RulingCleaningService {
|
||||
|
||||
private static final float THRESHOLD = 6;
|
||||
private static final float THRESHOLD_X_VERTICAL = 1;
|
||||
private static final float THRESHOLD_Y_VERTICAL = 2;
|
||||
private static final float THRESHOLD_X_HORIZONTAL = 2;
|
||||
private static final float THRESHOLD_Y_HORIZONTAL = 3;
|
||||
|
||||
|
||||
/**
 * Produces the cleaned vertical and horizontal rulings for a page.
 *
 * <p>When {@code rulings} is non-empty the rulings themselves are split into vertical and
 * horizontal lines; otherwise the lines are derived from {@code tableCells}. Both lists are
 * then sorted in place with {@code X_FIRST_RULING_COMPARATOR}, passed through
 * {@code cleanRulings} and wrapped in a {@link CleanRulings}.
 *
 * @param tableCells parsed table cells, only consulted when {@code rulings} is empty
 * @param rulings    rulings found on the page; must not be {@code null} because
 *                   {@code isEmpty()} is called on it directly
 * @return the cleaned vertical and horizontal rulings
 */
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||
|
||||
Rulings verticalAndHorizontalRulingLines;
|
||||
|
||||
if (!rulings.isEmpty()) {
|
||||
// NOTE(review): the snapPoints definition further down appears to be replaced by
// getRulingsFromParsedCells in this change - confirm this call is still part of the method.
snapPoints(rulings);
|
||||
verticalAndHorizontalRulingLines = extractVerticalAndHorizontalRulingLines(rulings);
|
||||
} else {
|
||||
// no rulings at all: fall back to lines derived from the parsed table cells
verticalAndHorizontalRulingLines = getRulingsFromParsedCells(tableCells);
|
||||
}
|
||||
|
||||
// sorted before cleaning; groupOverlappingRectangles has an early exit that relies on x-y ordering
verticalAndHorizontalRulingLines.verticalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
|
||||
|
||||
return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build();
|
||||
}
|
||||
|
||||
|
||||
private Rulings cleanRulings(Rulings rulings) {
|
||||
|
||||
List<List<Rectangle>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||
.map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||
.toList();
|
||||
|
||||
List<List<Rectangle>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
|
||||
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
|
||||
.map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
|
||||
|
||||
UnionFind<Rectangle> unionFind = new UnionFind<>();
|
||||
for (int i = 0; i < rectangles.size(); i++) {
|
||||
for (int j = i + 1; j < rectangles.size(); j++) {
|
||||
Rectangle rectangle1 = rectangles.get(i);
|
||||
Rectangle rectangle2 = rectangles.get(j);
|
||||
|
||||
// we can stop early when we are too far off because of x-y-sorting
|
||||
if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (rectangle1.intersects(rectangle2)) {
|
||||
unionFind.union(rectangle1, rectangle2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Map<Rectangle, List<Rectangle>> groups = new HashMap<>();
|
||||
for (Rectangle rectangle : rectangles) {
|
||||
Rectangle root = unionFind.find(rectangle);
|
||||
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
|
||||
}
|
||||
return new ArrayList<>(groups.values());
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle getOverlapRectangle(Ruling ruling) {
|
||||
|
||||
float top;
|
||||
float left;
|
||||
float w;
|
||||
float h;
|
||||
|
||||
if (ruling.x1 < ruling.x2) {
|
||||
left = ruling.x1;
|
||||
w = ruling.x2 - ruling.x1;
|
||||
} else {
|
||||
left = ruling.x2;
|
||||
w = ruling.x1 - ruling.x2;
|
||||
}
|
||||
if (ruling.y1 < ruling.y2) {
|
||||
top = ruling.y1;
|
||||
h = ruling.y2 - ruling.y1;
|
||||
} else {
|
||||
top = ruling.y2;
|
||||
h = ruling.y1 - ruling.y2;
|
||||
}
|
||||
|
||||
if (ruling.horizontal()) {
|
||||
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
} else {
|
||||
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static Ruling getXCenteredRuling(Rectangle rectangle) {
|
||||
|
||||
float x = (float) rectangle.getCenterX();
|
||||
float y1 = rectangle.getTop();
|
||||
float y2 = rectangle.getBottom();
|
||||
|
||||
Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL);
|
||||
Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL);
|
||||
|
||||
return new Ruling(point1, point2);
|
||||
}
|
||||
|
||||
|
||||
public static Ruling getYCenteredRuling(Rectangle rectangle) {
|
||||
|
||||
float x1 = rectangle.getLeft();
|
||||
float x2 = rectangle.getRight();
|
||||
float y = (float) rectangle.getCenterY();
|
||||
|
||||
Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y);
|
||||
Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y);
|
||||
|
||||
return new Ruling(point1, point2);
|
||||
}
|
||||
|
||||
|
||||
private Rulings extractVerticalAndHorizontalRulingLines(List<Ruling> rulings) {
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
for (Ruling vr : rulings) {
|
||||
if (vr.vertical()) {
|
||||
vrs.add(vr);
|
||||
}
|
||||
}
|
||||
if (vrs.isEmpty()) {
|
||||
vrs.addAll(extractVerticalRulings(tableCells));
|
||||
}
|
||||
List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
|
||||
|
||||
List<Ruling> hrs = new ArrayList<>();
|
||||
for (Ruling hr : rulings) {
|
||||
@ -51,98 +171,26 @@ public class RulingCleaningService {
|
||||
hrs.add(hr);
|
||||
}
|
||||
}
|
||||
if (hrs.isEmpty()) {
|
||||
hrs.addAll(extractHorizontalRulings(tableCells));
|
||||
}
|
||||
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
|
||||
|
||||
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
|
||||
return new Rulings(vrs, hrs);
|
||||
}
|
||||
|
||||
|
||||
public void snapPoints(List<? extends Line2D.Float> rulings) {
|
||||
private Rulings getRulingsFromParsedCells(List<TableCells> tableCells) {
|
||||
|
||||
// collect points and keep a Line -> p1,p2 map
|
||||
Map<Line2D.Float, Point2D[]> linesToPoints = new HashMap<>();
|
||||
List<Point2D> points = new ArrayList<>();
|
||||
for (Line2D.Float r : rulings) {
|
||||
Point2D p1 = r.getP1();
|
||||
Point2D p2 = r.getP2();
|
||||
linesToPoints.put(r, new Point2D[]{p1, p2});
|
||||
points.add(p1);
|
||||
points.add(p2);
|
||||
}
|
||||
|
||||
// snap by X
|
||||
points.sort(Comparator.comparingDouble(Point2D::getX));
|
||||
|
||||
List<List<Point2D>> groupedPoints = new ArrayList<>();
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
}
|
||||
}
|
||||
|
||||
for (List<Point2D> group : groupedPoints) {
|
||||
float avgLoc = 0;
|
||||
for (Point2D p : group) {
|
||||
avgLoc += p.getX();
|
||||
}
|
||||
avgLoc /= group.size();
|
||||
for (Point2D p : group) {
|
||||
p.setLocation(avgLoc, p.getY());
|
||||
}
|
||||
}
|
||||
// ---
|
||||
|
||||
// snap by Y
|
||||
points.sort(Comparator.comparingDouble(Point2D::getY));
|
||||
|
||||
groupedPoints = new ArrayList<>();
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
||||
|
||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
||||
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) {
|
||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
||||
} else {
|
||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
||||
}
|
||||
}
|
||||
|
||||
for (List<Point2D> group : groupedPoints) {
|
||||
float avgLoc = 0;
|
||||
for (Point2D p : group) {
|
||||
avgLoc += p.getY();
|
||||
}
|
||||
avgLoc /= group.size();
|
||||
for (Point2D p : group) {
|
||||
p.setLocation(p.getX(), avgLoc);
|
||||
}
|
||||
}
|
||||
// ---
|
||||
|
||||
// finally, modify lines
|
||||
for (Map.Entry<Line2D.Float, Point2D[]> ltp : linesToPoints.entrySet()) {
|
||||
Point2D[] p = ltp.getValue();
|
||||
ltp.getKey().setLine(p[0], p[1]);
|
||||
}
|
||||
List<Ruling> vrs = extractVerticalRulingsFromParsedCells(tableCells);
|
||||
List<Ruling> hrs = extractHorizontalRulingsFromParsedCells(tableCells);
|
||||
return new Rulings(vrs, hrs);
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends Ruling> extractVerticalRulings(List<TableCells> cvParsedTableCells) {
|
||||
private List<Ruling> extractVerticalRulingsFromParsedCells(List<TableCells> tableCells) {
|
||||
|
||||
List<Ruling> vrs = new ArrayList<>();
|
||||
|
||||
if (cvParsedTableCells != null) {
|
||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
||||
Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
||||
Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
||||
if (tableCells != null) {
|
||||
for (TableCells tableCell : tableCells) {
|
||||
Ruling leftLine = createRuling(tableCell.getX0(), tableCell.getX0(), tableCell.getY0(), tableCell.getY1());
|
||||
Ruling rightLine = createRuling(tableCell.getX1(), tableCell.getX1(), tableCell.getY0(), tableCell.getY1());
|
||||
vrs.add(leftLine);
|
||||
vrs.add(rightLine);
|
||||
}
|
||||
@ -151,19 +199,18 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends Ruling> extractHorizontalRulings(List<TableCells> cvParsedTableCells) {
|
||||
private List<Ruling> extractHorizontalRulingsFromParsedCells(List<TableCells> tableCells) {
|
||||
|
||||
List<Ruling> hrs = new ArrayList<>();
|
||||
|
||||
if (cvParsedTableCells != null) {
|
||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
||||
Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
|
||||
Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
|
||||
if (tableCells != null) {
|
||||
for (TableCells tableCell : tableCells) {
|
||||
Ruling topLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY1(), tableCell.getY1());
|
||||
Ruling baseLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY0(), tableCell.getY0());
|
||||
hrs.add(topLine);
|
||||
hrs.add(baseLine);
|
||||
}
|
||||
}
|
||||
|
||||
return hrs;
|
||||
}
|
||||
|
||||
@ -189,46 +236,8 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
|
||||
private record Rulings(List<Ruling> verticalLines, List<Ruling> horizontalLines) {
|
||||
|
||||
int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
|
||||
return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
|
||||
}
|
||||
|
||||
|
||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
|
||||
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
lines.sort((a, b) -> {
|
||||
final float diff = a.getPosition() - b.getPosition();
|
||||
return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
|
||||
});
|
||||
|
||||
for (Ruling next_line : lines) {
|
||||
Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
|
||||
// if current line colinear with next, and are "close enough": expand current line
|
||||
if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
|
||||
final float lastStart = last.getStart();
|
||||
final float lastEnd = last.getEnd();
|
||||
|
||||
final boolean lastFlipped = lastStart > lastEnd;
|
||||
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
|
||||
|
||||
boolean differentDirections = nextFlipped != lastFlipped;
|
||||
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
|
||||
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
|
||||
|
||||
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
|
||||
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
|
||||
last.setStartEnd(newStart, newEnd);
|
||||
assert !last.oblique();
|
||||
} else if (next_line.length() == 0) {
|
||||
continue;
|
||||
} else {
|
||||
rv.add(next_line);
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -43,7 +43,6 @@ public class SectionsBuilderService {
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
List<TextPageBlock> header = new ArrayList<>();
|
||||
List<TextPageBlock> footer = new ArrayList<>();
|
||||
List<TextPageBlock> unclassifiedText = new ArrayList<>();
|
||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||
|
||||
if (current.getClassification() == null) {
|
||||
@ -62,11 +61,6 @@ public class SectionsBuilderService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getClassification().equals(PageBlockType.OTHER)) {
|
||||
unclassifiedText.add((TextPageBlock) current);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
|
||||
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
@ -94,9 +88,6 @@ public class SectionsBuilderService {
|
||||
if (!footer.isEmpty()) {
|
||||
footers.add(new ClassificationFooter(footer));
|
||||
}
|
||||
if (!unclassifiedText.isEmpty()) {
|
||||
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
|
||||
}
|
||||
}
|
||||
|
||||
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
|
||||
@ -1,14 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -21,59 +20,15 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
} else if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
||||
|
||||
int rv = 0;
|
||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
||||
|
||||
if (arg0Y > arg1Y) {
|
||||
rv = 1;
|
||||
} else if (arg0Y < arg1Y) {
|
||||
rv = -1;
|
||||
} else if (arg0X > arg1X) {
|
||||
rv = 1;
|
||||
} else if (arg0X < arg1X) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
|
||||
public boolean contains(Cell cell, double x, double y, double w, double h) {
|
||||
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2);
|
||||
}
|
||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
|
||||
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
/**
|
||||
@ -89,22 +44,18 @@ public class TableExtractionService {
|
||||
* @param cleanRulings The lines used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
|
||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||
cells.sort(CELL_SIZE_COMPARATOR);
|
||||
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && contains(cell,
|
||||
textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
|
||||
cell.addTextBlock(textBlock);
|
||||
toBeRemoved.add(textBlock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -113,245 +64,94 @@ public class TableExtractionService {
|
||||
cells = new ArrayList<>(new HashSet<>(cells));
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
|
||||
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> overlappingCells = new ArrayList<>();
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && c.intersects(area)) {
|
||||
overlappingCells.add(c);
|
||||
if (c.hasMinimumSize() && area.contains(c)) {
|
||||
containedCells.add(c);
|
||||
}
|
||||
}
|
||||
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
|
||||
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
|
||||
cells.removeAll(containedCells);
|
||||
}
|
||||
}
|
||||
|
||||
for (TablePageBlock table : tables) {
|
||||
int position = -1;
|
||||
|
||||
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock textBlock = itty.next();
|
||||
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(textBlock);
|
||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(pageBlock);
|
||||
}
|
||||
}
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
|
||||
var toBeRemoved = table.getCells()
|
||||
.stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
}
|
||||
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
|
||||
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
|
||||
|
||||
// Fix for 211.pdf
|
||||
for (Ruling r : horizontalRulingLines) {
|
||||
if (r.getX2() < r.getX1()) {
|
||||
double a = r.getX2();
|
||||
r.x2 = (float) r.getX1();
|
||||
r.x1 = (float) a;
|
||||
}
|
||||
if(containedCells.size() <= 2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
List<Cell> cellsFound = new ArrayList<>();
|
||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||
intersectionPointsList.sort(POINT_COMPARATOR);
|
||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||
.map(Rectangle::getWidth)
|
||||
.map(size -> Math.round(size / 10.0) * 10)
|
||||
.collect(Collectors.groupingBy(Long::longValue));
|
||||
|
||||
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
||||
Point2D topLeft = intersectionPointsList.get(i);
|
||||
Ruling[] hv = intersectionPoints.get(topLeft);
|
||||
|
||||
// CrossingPointsDirectlyBelow( topLeft );
|
||||
List<Point2D> xPoints = new ArrayList<>();
|
||||
// CrossingPointsDirectlyToTheRight( topLeft );
|
||||
List<Point2D> yPoints = new ArrayList<>();
|
||||
|
||||
for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
|
||||
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
|
||||
xPoints.add(p);
|
||||
}
|
||||
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
|
||||
yPoints.add(p);
|
||||
}
|
||||
}
|
||||
outer:
|
||||
for (Point2D xPoint : xPoints) {
|
||||
// is there a vertical edge b/w topLeft and xPoint?
|
||||
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
|
||||
continue;
|
||||
}
|
||||
for (Point2D yPoint : yPoints) {
|
||||
// is there an horizontal edge b/w topLeft and yPoint ?
|
||||
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
||||
continue;
|
||||
}
|
||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
||||
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
|
||||
intersectionPoints.get(yPoint)[1])) {
|
||||
cellsFound.add(new Cell(topLeft, btmRight));
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
|
||||
// that aren't connected with an horizontal ruler?
|
||||
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
|
||||
|
||||
return cellsFound;
|
||||
return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
Set<Point2D> pointSet = new HashSet<>();
|
||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||
int i = 0;
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
for (Rectangle cell : cells) {
|
||||
for (Point2D pt : cell.getPoints()) {
|
||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||
pointSet.remove(pt);
|
||||
} else {
|
||||
pointSet.add(pt);
|
||||
}
|
||||
}
|
||||
double x = textBlock.getPdfMinX();
|
||||
double y = textBlock.getPdfMinY();
|
||||
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
|
||||
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// X first sort
|
||||
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
|
||||
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
|
||||
// Y first sort
|
||||
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
|
||||
pointsSortY.sort(POINT_COMPARATOR);
|
||||
|
||||
while (i < pointSet.size()) {
|
||||
float currY = (float) pointsSortY.get(i).getY();
|
||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
|
||||
edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
|
||||
edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
i = 0;
|
||||
while (i < pointSet.size()) {
|
||||
float currX = (float) pointsSortX.get(i).getX();
|
||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
|
||||
edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
|
||||
edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Get all the polygons
|
||||
List<List<PolygonVertex>> polygons = new ArrayList<>();
|
||||
Point2D nextVertex;
|
||||
while (!edgesH.isEmpty()) {
|
||||
ArrayList<PolygonVertex> polygon = new ArrayList<>();
|
||||
Point2D first = edgesH.keySet().iterator().next();
|
||||
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
|
||||
edgesH.remove(first);
|
||||
|
||||
while (true) {
|
||||
PolygonVertex curr = polygon.get(polygon.size() - 1);
|
||||
PolygonVertex lastAddedVertex;
|
||||
if (curr.direction == Direction.HORIZONTAL) {
|
||||
nextVertex = edgesV.get(curr.point);
|
||||
edgesV.remove(curr.point);
|
||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
|
||||
} else {
|
||||
nextVertex = edgesH.get(curr.point);
|
||||
edgesH.remove(curr.point);
|
||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
|
||||
}
|
||||
polygon.add(lastAddedVertex);
|
||||
|
||||
if (lastAddedVertex.equals(polygon.get(0))) {
|
||||
// closed polygon
|
||||
polygon.remove(polygon.size() - 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (PolygonVertex vertex : polygon) {
|
||||
edgesH.remove(vertex.point);
|
||||
edgesV.remove(vertex.point);
|
||||
}
|
||||
polygons.add(polygon);
|
||||
}
|
||||
|
||||
// calculate grid-aligned minimum area rectangles for each found polygon
|
||||
for (List<PolygonVertex> poly : polygons) {
|
||||
float top = Float.MAX_VALUE;
|
||||
float left = Float.MAX_VALUE;
|
||||
float bottom = Float.MIN_VALUE;
|
||||
float right = Float.MIN_VALUE;
|
||||
for (PolygonVertex pt : poly) {
|
||||
top = (float) Math.min(top, pt.point.getY());
|
||||
left = (float) Math.min(left, pt.point.getX());
|
||||
bottom = (float) Math.max(bottom, pt.point.getY());
|
||||
right = (float) Math.max(right, pt.point.getX());
|
||||
}
|
||||
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
|
||||
}
|
||||
|
||||
return rectangles;
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||
&& y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||
&& (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||
&& (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Tag stored on a {@code PolygonVertex}. While tracing a polygon, findSpreadsheetsFromCells
 * looks at the tag of the latest vertex to decide which edge map to follow next: after a
 * HORIZONTAL vertex the next one is taken from the vertical edges (and tagged VERTICAL),
 * otherwise from the horizontal edges (and tagged HORIZONTAL).
 */
private enum Direction {
|
||||
HORIZONTAL,
|
||||
VERTICAL
|
||||
}
|
||||
|
||||
static class PolygonVertex {
|
||||
|
||||
Point2D point;
|
||||
Direction direction;
|
||||
|
||||
|
||||
PolygonVertex(Point2D point, Direction direction) {
|
||||
|
||||
this.direction = direction;
|
||||
this.point = point;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof PolygonVertex)) {
|
||||
return false;
|
||||
}
|
||||
return this.point.equals(((PolygonVertex) other).point);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return this.point.hashCode();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
|
||||
}
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(Cell::new)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -61,8 +61,8 @@ public class DocuMineBlockificationService {
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
@ -13,10 +14,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@ -31,12 +35,12 @@ public class RedactManagerBlockificationService {
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @param textPositions The words of a page.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> emptyCells) {
|
||||
|
||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(emptyCells);
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
@ -54,7 +58,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
@ -5,7 +5,6 @@ import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -23,7 +23,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
|
||||
@ -64,46 +64,54 @@ public class DocuMineClassificationService {
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
||||
} else if (textBlock.getText().length() > 5
|
||||
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
||||
.contains(":")
|
||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
||||
|| textBlock.toString().startsWith("APPENDIX")
|
||||
|| textBlock.toString().startsWith("FIGURE")
|
||||
|| textBlock.toString().startsWith("TABLE"))
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& matcher2.reset().find()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
||||
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
|
||||
@ -18,8 +18,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
@ -31,6 +29,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
@ -49,9 +49,14 @@ public class DocumentGraphFactory {
|
||||
Document documentGraph = new Document();
|
||||
Context context = new Context(documentGraph);
|
||||
|
||||
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
||||
addSections(document, context);
|
||||
document.getPages()
|
||||
.forEach(context::buildAndAddPageWithCounter);
|
||||
document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getImages()
|
||||
.stream())
|
||||
.forEach(image -> context.getImages().add(image));
|
||||
addSections(document, context, documentGraph);
|
||||
addHeaderAndFooterToEachPage(document, context);
|
||||
|
||||
documentGraph.setNumberOfPages(context.pages.size());
|
||||
@ -62,9 +67,10 @@ public class DocumentGraphFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addSections(ClassificationDocument document, Context context) {
|
||||
private void addSections(ClassificationDocument classificationDocument, Context context, Document document) {
|
||||
|
||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context));
|
||||
classificationDocument.getSections()
|
||||
.forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||
}
|
||||
|
||||
|
||||
@ -74,9 +80,11 @@ public class DocumentGraphFactory {
|
||||
|
||||
GenericSemanticNode node;
|
||||
if (originalTextBlock.isHeadline()) {
|
||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||
node = Headline.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
} else {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
}
|
||||
|
||||
page.getMainBody().add(node);
|
||||
@ -91,7 +99,16 @@ public class DocumentGraphFactory {
|
||||
}
|
||||
|
||||
|
||||
public void addImage(Section section, ClassifiedImage image, Context context) {
|
||||
public void addImage(GenericSemanticNode parent, ClassifiedImage image, Context context) {
|
||||
|
||||
Image imageNode = createImage(image, context);
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parent, imageNode);
|
||||
imageNode.setTreeId(treeId);
|
||||
imageNode.setLeafTextBlock(context.textBlockFactory.emptyTextBlock(parent, context, context.getPage(image.getPage())));
|
||||
}
|
||||
|
||||
|
||||
private Image createImage(ClassifiedImage image, Context context) {
|
||||
|
||||
Rectangle2D position = image.getPosition();
|
||||
Page page = context.getPage(image.getPage());
|
||||
@ -104,9 +121,7 @@ public class DocumentGraphFactory {
|
||||
.documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
page.getMainBody().add(imageNode);
|
||||
|
||||
List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
||||
imageNode.setTreeId(tocId);
|
||||
return imageNode;
|
||||
}
|
||||
|
||||
|
||||
@ -146,10 +161,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
||||
footer,
|
||||
context,
|
||||
page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
footer.setLeafTextBlock(textBlock);
|
||||
@ -161,7 +173,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
header.setLeafTextBlock(textBlock);
|
||||
@ -172,7 +184,8 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyFooter(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||
footer.setTreeId(tocId);
|
||||
@ -184,7 +197,8 @@ public class DocumentGraphFactory {
|
||||
private void addEmptyHeader(int pageIndex, Context context) {
|
||||
|
||||
Page page = context.getPage(pageIndex);
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||
header.setTreeId(tocId);
|
||||
|
||||
@ -11,6 +11,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -24,27 +25,46 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class SectionNodeFactory {
|
||||
|
||||
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
||||
public void addSection(GenericSemanticNode parentNode,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
List<ClassifiedImage> images,
|
||||
DocumentGraphFactory.Context context,
|
||||
Document document) {
|
||||
|
||||
// This is for the case where we have images on a page without any text/footer/header.
|
||||
// The pageBlocks list is empty, but we still need to add those images to the document.
|
||||
if (!images.isEmpty() && pageBlocks.isEmpty()) {
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
||||
return;
|
||||
}
|
||||
|
||||
if (pageBlocks.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||
|
||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||
Section section = Section.builder().documentTree(context.getDocumentTree())
|
||||
.build();
|
||||
|
||||
context.getSections().add(section);
|
||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||
blocksPerPage.keySet()
|
||||
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section, document);
|
||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context, document));
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section, document);
|
||||
}
|
||||
|
||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
images.stream()
|
||||
.distinct()
|
||||
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||
}
|
||||
|
||||
|
||||
@ -58,16 +78,16 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section, document);
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||
@ -86,7 +106,7 @@ public class SectionNodeFactory {
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(section, tablesToMerge, context);
|
||||
TableNodeFactory.addTable(section, tablesToMerge, context, document);
|
||||
} else {
|
||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||
}
|
||||
@ -96,7 +116,9 @@ public class SectionNodeFactory {
|
||||
|
||||
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||
return pageBlocks.stream()
|
||||
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream()
|
||||
.anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||
}
|
||||
|
||||
|
||||
@ -112,7 +134,9 @@ public class SectionNodeFactory {
|
||||
|
||||
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
||||
movePrecedingHeadlineToTableList(splitList);
|
||||
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
||||
return splitList.stream()
|
||||
.filter(list -> !list.isEmpty())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -133,7 +157,8 @@ public class SectionNodeFactory {
|
||||
|
||||
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
||||
|
||||
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||
return abstractPageBlocks.stream()
|
||||
.allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
@ -27,23 +28,26 @@ public class TableNodeFactory {
|
||||
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||
|
||||
|
||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
setPageNumberInCells(tablesToMerge);
|
||||
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
||||
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
|
||||
Set<Page> pages = tablesToMerge.stream()
|
||||
.map(AbstractPageBlock::getPage)
|
||||
.map(context::getPage)
|
||||
.collect(Collectors.toSet());
|
||||
List<List<Cell>> mergedRows = tablesToMerge.stream()
|
||||
.map(TablePageBlock::getRows)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
|
||||
Table table = Table.builder()
|
||||
.documentTree(context.getDocumentTree())
|
||||
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
|
||||
.numberOfRows(mergedRows.size())
|
||||
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
|
||||
.build();
|
||||
|
||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||
table.setTreeId(treeId);
|
||||
addTableCells(mergedRows, table, context);
|
||||
addTableCells(mergedRows, table, context, document);
|
||||
|
||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||
}
|
||||
@ -63,7 +67,8 @@ public class TableNodeFactory {
|
||||
|
||||
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
||||
|
||||
cell.getTextBlocks().stream()//
|
||||
cell.getTextBlocks()
|
||||
.stream()//
|
||||
.filter(tb -> tb.getPage() == 0)//
|
||||
.forEach(tb -> tb.setPage(table.getPage()));
|
||||
}
|
||||
@ -82,28 +87,32 @@ public class TableNodeFactory {
|
||||
|
||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||
|
||||
if (table.streamHeaders().findAny().isEmpty()) {
|
||||
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||
if (table.streamHeaders()
|
||||
.findAny().isEmpty()) {
|
||||
table.streamRow(0)
|
||||
.forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
||||
addTableCell(rows.get(rowIndex)
|
||||
.get(colIndex), rowIndex, colIndex, table, context, document);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
|
||||
.build();
|
||||
page.getMainBody().add(tableCell);
|
||||
|
||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||
@ -113,16 +122,26 @@ public class TableNodeFactory {
|
||||
if (cell.getTextBlocks().isEmpty()) {
|
||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||
textBlock = context.getTextBlockFactory()
|
||||
.buildAtomicTextBlock(cell.getTextBlocks()
|
||||
.get(0).getSequences(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
||||
SectionNodeFactory.addSection(tableCell,
|
||||
cell.getTextBlocks()
|
||||
.stream()
|
||||
.map(tb -> (AbstractPageBlock) tb)
|
||||
.toList(),
|
||||
emptyList(),
|
||||
context,
|
||||
document);
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
cell.getTextBlocks()
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -135,7 +154,8 @@ public class TableNodeFactory {
|
||||
|
||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||
|
||||
return cell.getTextBlocks().get(0).isHeadline();
|
||||
return cell.getTextBlocks()
|
||||
.get(0).isHeadline();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -110,11 +110,13 @@ public class LayoutGridService {
|
||||
return;
|
||||
}
|
||||
for (Page page : table.getPages()) {
|
||||
|
||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
|
||||
if (optionalFirstRowOnPage.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
int firstRowOnPage = optionalFirstRowOnPage.get();
|
||||
|
||||
Stream<Double> xStream = switch (page.getRotation()) {
|
||||
case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
|
||||
case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
|
||||
@ -123,6 +125,7 @@ public class LayoutGridService {
|
||||
};
|
||||
List<Double> xs = xStream.collect(Collectors.toList());
|
||||
xs.remove(0);
|
||||
|
||||
Stream<Double> yStream = switch (page.getRotation()) {
|
||||
case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
|
||||
case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
|
||||
@ -132,7 +135,7 @@ public class LayoutGridService {
|
||||
List<Double> ys = yStream.collect(Collectors.toList());
|
||||
ys.remove(0);
|
||||
|
||||
Rectangle2D tableBBox = table.getBBox().get(table.getFirstPage());
|
||||
Rectangle2D tableBBox = table.getBBox().get(page);
|
||||
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines();
|
||||
xs.forEach(x -> {
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
|
||||
@ -188,14 +191,33 @@ public class LayoutGridService {
|
||||
@SneakyThrows
|
||||
private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) {
|
||||
|
||||
Point2D.Float upperLeftCorner = switch (page.getRotation()) {
|
||||
case 90 -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMinY());
|
||||
case 180 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMinY());
|
||||
case 270 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMaxY());
|
||||
default -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMaxY());
|
||||
};
|
||||
// translates text, such that its right edge is a bit to the left of the drawn box
|
||||
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4);
|
||||
|
||||
Point2D upperLeftCorner;
|
||||
Point2D translationVector;
|
||||
switch (page.getRotation()) {
|
||||
case 90 -> {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
|
||||
translationVector = new Point2D.Double(FONT_SIZE, -translationAmount);
|
||||
}
|
||||
case 180 -> {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
|
||||
translationVector = new Point2D.Double(translationAmount, FONT_SIZE);
|
||||
}
|
||||
case 270 -> {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
|
||||
translationVector = new Point2D.Double(-FONT_SIZE, translationAmount);
|
||||
}
|
||||
default -> {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
|
||||
translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
upperLeftCorner = add(upperLeftCorner, translationVector);
|
||||
|
||||
var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts();
|
||||
upperLeftCorner.setLocation(upperLeftCorner.getX() - ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), upperLeftCorner.getY() - FONT_SIZE);
|
||||
placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT));
|
||||
}
|
||||
|
||||
@ -317,4 +339,10 @@ public class LayoutGridService {
|
||||
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
|
||||
private Point2D add(Point2D a, Point2D b) {
|
||||
|
||||
return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,28 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.Color;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Value object bundling the options passed to the drawing helpers: two boolean switches (stroke, fill),
 * a stroke color and width, and a fill color.
 * <p>
 * Lombok generates a builder, an all-args constructor and getters. {@code @FieldDefaults} makes every
 * field {@code private final}, so instances cannot be changed after construction.
 * The all-args constructor takes its parameters in field declaration order, so reordering fields
 * changes that constructor's signature.
 */
@Builder
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DrawingOptions {
|
||||
|
||||
// Stroke switch; has no @Builder.Default, so the builder leaves it false unless set.
// NOTE(review): presumably enables drawing the outline - the consumers are not visible here.
boolean stroke;
|
||||
// Stroke color; the builder falls back to black.
@Builder.Default
|
||||
Color strokeColor = Color.BLACK;
|
||||
// Stroke width; the builder falls back to 1.
@Builder.Default
|
||||
float strokeWidth = 1f;
|
||||
|
||||
// Fill switch; has no @Builder.Default, so the builder leaves it false unless set.
// NOTE(review): presumably enables filling the shape - the consumers are not visible here.
boolean fill;
|
||||
// Fill color; the builder falls back to black.
@Builder.Default
|
||||
Color fillColor = Color.BLACK;
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,88 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class GeometricComparators {
|
||||
|
||||
private static final int COMPARATOR_ROUNDING = 2;
|
||||
|
||||
public static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||
|
||||
int rv = 0;
|
||||
float point1X = DoubleComparisons.round(point1.getX(), COMPARATOR_ROUNDING);
|
||||
float point1Y = DoubleComparisons.round(point1.getY(), COMPARATOR_ROUNDING);
|
||||
float point2X = DoubleComparisons.round(point2.getX(), COMPARATOR_ROUNDING);
|
||||
float point2Y = DoubleComparisons.round(point2.getY(), COMPARATOR_ROUNDING);
|
||||
|
||||
if (point1X > point2X) {
|
||||
rv = 1;
|
||||
} else if (point1X < point2X) {
|
||||
rv = -1;
|
||||
} else if (point1Y > point2Y) {
|
||||
rv = 1;
|
||||
} else if (point1Y < point2Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
public static final Comparator<Point2D> Y_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||
|
||||
int rv = 0;
|
||||
float point1X = DoubleComparisons.round(point1.getX(), COMPARATOR_ROUNDING);
|
||||
float point1Y = DoubleComparisons.round(point1.getY(), COMPARATOR_ROUNDING);
|
||||
float point2X = DoubleComparisons.round(point2.getX(), COMPARATOR_ROUNDING);
|
||||
float point2Y = DoubleComparisons.round(point2.getY(), COMPARATOR_ROUNDING);
|
||||
|
||||
if (point1Y > point2Y) {
|
||||
rv = 1;
|
||||
} else if (point1Y < point2Y) {
|
||||
rv = -1;
|
||||
} else if (point1X > point2X) {
|
||||
rv = 1;
|
||||
} else if (point1X < point2X) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
public static final Comparator<Cell> CELL_SIZE_COMPARATOR = (cell1, cell2) -> {
|
||||
|
||||
Double cell1Size = cell1.getHeight() * cell1.getWidth();
|
||||
Double cell2Size = cell2.getHeight() * cell2.getWidth();
|
||||
return cell1Size.compareTo(cell2Size);
|
||||
};
|
||||
|
||||
public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
|
||||
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||
Double rect2Size = rect2.getHeight() * rect2.getWidth();
|
||||
return rect1Size.compareTo(rect2Size);
|
||||
};
|
||||
|
||||
public static final Comparator<Ruling> X_FIRST_RULING_COMPARATOR = (ruling1, ruling2) -> {
|
||||
|
||||
int rv = 0;
|
||||
float point1X = DoubleComparisons.round(Math.min(ruling1.getLeft(), ruling1.getRight()), COMPARATOR_ROUNDING);
|
||||
float point1Y = DoubleComparisons.round(Math.min(ruling1.getTop(), ruling1.getBottom()), COMPARATOR_ROUNDING);
|
||||
float point2X = DoubleComparisons.round(Math.min(ruling2.getLeft(), ruling2.getRight()), COMPARATOR_ROUNDING);
|
||||
float point2Y = DoubleComparisons.round(Math.min(ruling2.getTop(), ruling2.getBottom()), COMPARATOR_ROUNDING);
|
||||
|
||||
if (point1X > point2X) {
|
||||
rv = 1;
|
||||
} else if (point1X < point2X) {
|
||||
rv = -1;
|
||||
} else if (point1Y > point2Y) {
|
||||
rv = 1;
|
||||
} else if (point1Y < point2Y) {
|
||||
rv = -1;
|
||||
}
|
||||
return rv;
|
||||
};
|
||||
|
||||
}
|
||||
@ -21,11 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -41,20 +37,20 @@ public class PdfVisualisationUtility {
|
||||
|
||||
public void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
||||
|
||||
Options options = buildStandardOptionsForNodes(entry);
|
||||
DrawingOptions options = buildStandardOptionsForNodes(entry);
|
||||
|
||||
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
|
||||
public void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) {
|
||||
|
||||
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
||||
}
|
||||
|
||||
|
||||
public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
|
||||
public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) {
|
||||
|
||||
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
||||
|
||||
@ -62,7 +58,7 @@ public class PdfVisualisationUtility {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options) {
|
||||
public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
@ -80,14 +76,14 @@ public class PdfVisualisationUtility {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
|
||||
public void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, DrawingOptions options) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
drawRectangle2DList(document, rectCollection, options, pdPage);
|
||||
}
|
||||
|
||||
|
||||
private void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
|
||||
private void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, DrawingOptions options, PDPage pdPage) throws IOException {
|
||||
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
@ -110,9 +106,9 @@ public class PdfVisualisationUtility {
|
||||
}
|
||||
|
||||
|
||||
private Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||
private DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||
|
||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
@ -125,7 +121,7 @@ public class PdfVisualisationUtility {
|
||||
}
|
||||
|
||||
|
||||
private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
|
||||
private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) {
|
||||
|
||||
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||
rectanglesPerPage.forEach((page, rectangle2D) -> {
|
||||
@ -152,7 +148,7 @@ public class PdfVisualisationUtility {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, Options options) {
|
||||
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, DrawingOptions options) {
|
||||
|
||||
var pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
@ -176,21 +172,4 @@ public class PdfVisualisationUtility {
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Options {
|
||||
|
||||
boolean fill;
|
||||
boolean stroke;
|
||||
@Builder.Default
|
||||
Color strokeColor = Color.BLACK;
|
||||
@Builder.Default
|
||||
float strokeWidth = 1f;
|
||||
@Builder.Default
|
||||
Color fillColor = Color.BLACK;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,8 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -19,6 +22,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -37,15 +42,28 @@ public class RectangleTransformations {
|
||||
}
|
||||
|
||||
|
||||
public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
|
||||
|
||||
Area a1 = new Area(r1);
|
||||
Area a2 = new Area(r2);
|
||||
a1.intersect(a2);
|
||||
Rectangle2D intersection = a1.getBounds2D();
|
||||
return intersection.getWidth() * intersection.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
|
||||
|
||||
/**
 * Factory for a fresh {@code Rectangle2DBBoxCollector}, usable as the argument of
 * {@code Stream.collect} on a stream of {@link Rectangle2D}; the collector's result type is {@link Rectangle2D}.
 * NOTE(review): presumably the result is the bounding box of all collected rectangles - the
 * collector's implementation is not visible here.
 *
 * @return a new collector instance on every call
 */
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
|
||||
|
||||
return new Rectangle2DBBoxCollector();
|
||||
}
|
||||
|
||||
|
||||
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
|
||||
|
||||
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
|
||||
@ -70,6 +88,7 @@ public class RectangleTransformations {
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
||||
|
||||
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
||||
@ -84,6 +103,7 @@ public class RectangleTransformations {
|
||||
-redactionLogRectangle.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
||||
@ -133,7 +153,27 @@ public class RectangleTransformations {
|
||||
previousRectangle = currentRectangle;
|
||||
}
|
||||
}
|
||||
return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList();
|
||||
return rectangleListsWithGaps.stream()
|
||||
.map(RectangleTransformations::rectangle2DBBox)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public static CleanRulings extractRulings(List<? extends Rectangle2D.Float> rectangles) {
|
||||
|
||||
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
||||
List<Ruling> horizontalRulings = new ArrayList<>();
|
||||
List<Ruling> verticalRulings = new ArrayList<>();
|
||||
|
||||
rectangles.forEach(rectangle -> {
|
||||
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y)));
|
||||
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y + rectangle.height),
|
||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x, rectangle.y + rectangle.height)));
|
||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||
});
|
||||
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,77 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class RectangularIntersectionFinder {
|
||||
|
||||
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
// Fix for 211.pdf
|
||||
for (Ruling r : horizontalRulingLines) {
|
||||
if (r.getX2() < r.getX1()) {
|
||||
double a = r.getX2();
|
||||
r.x2 = (float) r.getX1();
|
||||
r.x1 = (float) a;
|
||||
}
|
||||
}
|
||||
|
||||
List<Rectangle2D> foundRectangles = new ArrayList<>();
|
||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
|
||||
|
||||
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
||||
Point2D topLeft = intersectionPointsList.get(i);
|
||||
Ruling[] hv = intersectionPoints.get(topLeft);
|
||||
|
||||
// CrossingPointsDirectlyBelow( topLeft );
|
||||
List<Point2D> xPoints = new ArrayList<>();
|
||||
// CrossingPointsDirectlyToTheRight( topLeft );
|
||||
List<Point2D> yPoints = new ArrayList<>();
|
||||
|
||||
for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
|
||||
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
|
||||
xPoints.add(p);
|
||||
}
|
||||
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
|
||||
yPoints.add(p);
|
||||
}
|
||||
}
|
||||
outer:
|
||||
for (Point2D xPoint : xPoints) {
|
||||
// is there a vertical edge b/w topLeft and xPoint?
|
||||
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
|
||||
continue;
|
||||
}
|
||||
for (Point2D yPoint : yPoints) {
|
||||
// is there a horizontal edge b/w topLeft and yPoint ?
|
||||
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
||||
continue;
|
||||
}
|
||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
||||
if (intersectionPoints.containsKey(btmRight)
|
||||
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
|
||||
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
|
||||
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
|
||||
// that aren't connected with an horizontal ruler?
|
||||
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
|
||||
|
||||
return foundRectangles;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,172 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_POINT_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
|
||||
public class SpreadsheetFinder {
|
||||
|
||||
private static final int MAX_OUTER_POINT_TOLERANCE = 10;
|
||||
private static final float AREA_TOLERANCE = 0.001f;
|
||||
|
||||
|
||||
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
Set<Point2D> pointSet = new HashSet<>();
|
||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||
|
||||
for (Rectangle cell : cells) {
|
||||
for (Point2D pt : cell.getPoints()) {
|
||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||
pointSet.remove(pt);
|
||||
} else {
|
||||
pointSet.add(pt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// X first sort
|
||||
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
|
||||
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
|
||||
// Y first sort
|
||||
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
|
||||
pointsSortY.sort(Y_FIRST_POINT_COMPARATOR);
|
||||
|
||||
int i = 0;
|
||||
while (i < pointSet.size()) {
|
||||
float currY = (float) pointsSortY.get(i).getY();
|
||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
|
||||
edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
|
||||
edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
i = 0;
|
||||
while (i < pointSet.size()) {
|
||||
float currX = (float) pointsSortX.get(i).getX();
|
||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
|
||||
edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
|
||||
edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
|
||||
i += 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Get all the polygons
|
||||
List<List<PolygonVertex>> polygons = new ArrayList<>();
|
||||
Point2D nextVertex;
|
||||
while (!edgesH.isEmpty()) {
|
||||
ArrayList<PolygonVertex> polygon = new ArrayList<>();
|
||||
Point2D first = edgesH.keySet()
|
||||
.iterator().next();
|
||||
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
|
||||
edgesH.remove(first);
|
||||
|
||||
while (true) {
|
||||
PolygonVertex curr = polygon.get(polygon.size() - 1);
|
||||
PolygonVertex lastAddedVertex;
|
||||
if (curr.direction == Direction.HORIZONTAL) {
|
||||
nextVertex = edgesV.get(curr.point);
|
||||
edgesV.remove(curr.point);
|
||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
|
||||
} else {
|
||||
nextVertex = edgesH.get(curr.point);
|
||||
edgesH.remove(curr.point);
|
||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
|
||||
}
|
||||
polygon.add(lastAddedVertex);
|
||||
|
||||
if (lastAddedVertex.equals(polygon.get(0))) {
|
||||
// closed polygon
|
||||
polygon.remove(polygon.size() - 1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (PolygonVertex vertex : polygon) {
|
||||
edgesH.remove(vertex.point);
|
||||
edgesV.remove(vertex.point);
|
||||
}
|
||||
polygons.add(polygon);
|
||||
}
|
||||
|
||||
// calculate grid-aligned minimum area rectangles for each found polygon
|
||||
for (List<PolygonVertex> poly : polygons) {
|
||||
float top = Float.MAX_VALUE;
|
||||
float left = Float.MAX_VALUE;
|
||||
float bottom = Float.MIN_VALUE;
|
||||
float right = Float.MIN_VALUE;
|
||||
for (PolygonVertex pt : poly) {
|
||||
top = (float) Math.min(top, pt.point.getY());
|
||||
left = (float) Math.min(left, pt.point.getX());
|
||||
bottom = (float) Math.max(bottom, pt.point.getY());
|
||||
right = (float) Math.max(right, pt.point.getX());
|
||||
}
|
||||
|
||||
// do not add polygons with too many outer points as they are unlikely to be tables
|
||||
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
|
||||
rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
|
||||
}
|
||||
}
|
||||
return rectangles;
|
||||
}
|
||||
|
||||
|
||||
private enum Direction {
|
||||
HORIZONTAL,
|
||||
VERTICAL
|
||||
}
|
||||
|
||||
static class PolygonVertex {
|
||||
|
||||
Point2D point;
|
||||
Direction direction;
|
||||
|
||||
|
||||
PolygonVertex(Point2D point, Direction direction) {
|
||||
|
||||
this.direction = direction;
|
||||
this.point = point;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
if (!(other instanceof PolygonVertex)) {
|
||||
return false;
|
||||
}
|
||||
return this.point.equals(((PolygonVertex) other).point);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return this.point.hashCode();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -30,8 +30,6 @@ public class TableMergingUtility {
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
||||
consecutiveTable)) {
|
||||
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
|
||||
|
||||
@ -23,4 +23,10 @@ public class TextPositionOperations {
|
||||
return sequence;
|
||||
}
|
||||
|
||||
|
||||
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
|
||||
|
||||
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
|
||||
*
|
||||
* @author Ben Litchfield
|
||||
*/
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
||||
{
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||
|
||||
@Override
|
||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
||||
{
|
||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
|
||||
// only compare text that is in the same direction
|
||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||
if (cmp1 != 0)
|
||||
{
|
||||
if (cmp1 != 0) {
|
||||
return cmp1;
|
||||
}
|
||||
|
||||
@ -48,25 +46,19 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
||||
|
||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||
float pos1YTop = pos1YBottom - pos1.getTextHeight();
|
||||
float pos2YTop = pos2YBottom - pos2.getTextHeight();
|
||||
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
|
||||
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
|
||||
|
||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
// we will do a simple tolerance comparison
|
||||
if (yDifference < .1 ||
|
||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
||||
{
|
||||
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||
return Float.compare(x1, x2);
|
||||
}
|
||||
else if (pos1YBottom < pos2YBottom)
|
||||
{
|
||||
} else if (pos1YBottom < pos2YBottom) {
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,44 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Simple implementation of a disjoint-set data structure, see
 * https://en.wikipedia.org/wiki/Disjoint-set_data_structure
 * <p>
 * Elements are registered lazily: the first {@link #find(Object)} of an unknown element creates a
 * singleton set for it. Elements are compared with {@code equals}/{@code hashCode}.
 *
 * @param <T> element type
 */
public class UnionFind<T> {

    // element -> parent; a set's representative maps to itself
    Map<T, T> parents = new HashMap<>();
    // representative -> size of its set; written on registration and on union
    Map<T, Integer> numberOfObjects = new HashMap<>();


    /**
     * Returns the representative of the set containing {@code node}, registering {@code node} as a
     * new singleton set if it was unknown. Every element visited on the way to the representative
     * is re-linked directly to it (path compression).
     *
     * @param node the element to look up
     * @return the representative of the element's set
     */
    public T find(T node) {

        if (!parents.containsKey(node)) {
            parents.put(node, node);
            numberOfObjects.put(node, 1);
        }

        // first pass: climb until an element that is its own parent is reached
        T cursor = node;
        T parentOfCursor = parents.get(cursor);
        while (!cursor.equals(parentOfCursor)) {
            cursor = parentOfCursor;
            parentOfCursor = parents.get(cursor);
        }
        T representative = parentOfCursor;

        // second pass: point every element of the climbed path straight at the representative
        T walker = node;
        while (true) {
            T formerParent = parents.get(walker);
            if (walker.equals(formerParent)) {
                break;
            }
            parents.put(walker, representative);
            walker = formerParent;
        }

        return parents.get(node);
    }


    /**
     * Merges the sets containing the two elements. The smaller set is attached below the
     * representative of the larger one; on equal sizes the representative of {@code node1} wins.
     * Does nothing if both elements already share a set.
     *
     * @param node1 an element of the first set
     * @param node2 an element of the second set
     */
    public void union(T node1, T node2) {

        T firstRoot = find(node1);
        T secondRoot = find(node2);

        if (firstRoot.equals(secondRoot)) {
            return;
        }

        boolean firstIsSmaller = numberOfObjects.getOrDefault(firstRoot, 1) < numberOfObjects.getOrDefault(secondRoot, 1);
        if (firstIsSmaller) {
            parents.put(firstRoot, secondRoot);
            numberOfObjects.put(secondRoot, numberOfObjects.get(secondRoot) + numberOfObjects.get(firstRoot));
        } else {
            parents.put(secondRoot, firstRoot);
            numberOfObjects.put(firstRoot, numberOfObjects.get(firstRoot) + numberOfObjects.get(secondRoot));
        }
    }

}
|
||||
|
||||
@ -29,6 +29,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
@ -111,7 +112,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
|
||||
try (PDDocument pdDocument = Loader.loadPDF(file); var outputStream = new FileOutputStream(resultingFileName)) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, document);
|
||||
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||
PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||
pdDocument.save(outputStream);
|
||||
}
|
||||
}
|
||||
|
||||
@ -28,7 +28,20 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||
Arrays.stream(finishedEvent.message().split("\n"))
|
||||
.forEach(log::info);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd_RED_8747() {
|
||||
|
||||
prepareStorage("files/SinglePages/MergedEntities.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n"))
|
||||
.forEach(log::info);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -13,6 +13,7 @@ import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
@ -70,7 +71,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest {
|
||||
try (PDDocument pdDocument = Loader.loadPDF(fileResource.getFile())) {
|
||||
log.info("drawing document");
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraph);
|
||||
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||
PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||
log.info("saving document");
|
||||
pdDocument.save(tmpFile);
|
||||
log.info("saved document");
|
||||
|
||||
@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
long start = System.currentTimeMillis();
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
|
||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
Path.of(fileName).getFileName().toFile().toString());
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
@ -60,3 +65,4 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.segmentation;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
@ -25,16 +26,20 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -50,12 +55,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Autowired
|
||||
private RedactManagerClassificationService redactManagerClassificationService;
|
||||
|
||||
@Autowired
|
||||
private CvTableParsingAdapter cvTableParsingAdapter;
|
||||
|
||||
@Autowired
|
||||
private ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
|
||||
@Autowired
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
@ -64,10 +63,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
"document");
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
"document");
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
@ -87,11 +86,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@Test
|
||||
public void tablesToHtmlDebugger() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
|
||||
toHtml(document, "/tmp/T5.html");
|
||||
|
||||
}
|
||||
|
||||
@ -109,6 +108,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
|
||||
@ -117,8 +117,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
|
||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||
// We only assert that the table border is not the page border.
|
||||
@ -140,12 +148,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()),
|
||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber())));
|
||||
|
||||
System.out.println("object");
|
||||
}
|
||||
@ -157,11 +165,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
assertThat(table.getRows()
|
||||
.stream()
|
||||
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||
}
|
||||
|
||||
|
||||
@ -171,15 +190,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -189,15 +230,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@ -207,19 +270,41 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.collect(Collectors.toList())).isNotEmpty();
|
||||
TablePageBlock firstTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
||||
TablePageBlock secondTable = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
assertThat(secondTable.getRows()
|
||||
.stream()
|
||||
.allMatch(row -> row.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||
}
|
||||
|
||||
|
||||
@Test // Non-sense test
|
||||
@Test
|
||||
public void testDoc56Page170() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||
@ -230,8 +315,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 2, 0, 0);
|
||||
validateTable(document, 2, 6, 20, 0, 0);
|
||||
validateTable(document, 3, 7, 31, 0, 0);
|
||||
validateTable(document, 2, 4, 19, 12, 0);
|
||||
validateTable(document, 3, 2, 12, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -265,29 +350,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
validateTable(document, 0, 8, 8, 0, 0);
|
||||
|
||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
"Author, date",
|
||||
"Study title",
|
||||
"Analytical method Author, date, No.",
|
||||
"Technique, LOQ of the method, validated working range",
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList(
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
"Y"));
|
||||
|
||||
validateTable(document, 0, values);
|
||||
|
||||
@ -579,10 +665,156 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testT0() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T0 TableWithMergedCells.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 6, 8, 0, 0);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testT1() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T1 MultipleNestedTable.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 3, 3, 0, 0);
|
||||
validateTable(document, 1, 3, 6, 2, 0);
|
||||
validateTable(document, 2, 3, 3, 1, 0);
|
||||
validateTable(document, 3, 3, 3, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testT2() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T2 MultipleTables.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
validateTable(document, 0, 5, 5, 0, 0);
|
||||
validateTable(document, 1, 5, 6, 0, 0);
|
||||
validateTable(document, 2, 5, 5, 0, 0);
|
||||
validateTable(document, 3, 5, 5, 0, 0);
|
||||
validateTable(document, 4, 5, 5, 0, 0);
|
||||
validateTable(document, 5, 5, 5, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testT3() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T3 S-Meto_Page29.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 6, 5, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testT4() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T4 138 IDD0000261736_Page16.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 5, 8, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testT5() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 5);
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
validateTable(document, 2, 1, 1, 0, 0);
|
||||
validateTable(document, 3, 1, 1, 0, 0);
|
||||
validateTable(document, 4, 1, 1, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testMergedEntities_Page26() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/MergedEntities.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 6, 6, 5, 0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testHeaderAndFooter() throws IOException {
|
||||
|
||||
String fileName = "files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
|
||||
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
var textPositions = textPositionPerPage.stream()
|
||||
.flatMap(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::toString))
|
||||
.collect(Collectors.joining(" "));
|
||||
assertThat(textPositions.contains(textToSearch)).isFalse();
|
||||
|
||||
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks().size()).isEqualTo(3);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).getSequences().size()).isEqualTo(8);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).toString()).isEqualTo(textToSearch);
|
||||
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
||||
assertTrue(leafTextBlock.getSearchText().contains(textToSearch));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void toHtml(ClassificationDocument document, String filename) {
|
||||
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
var tables = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
int currentPage = 1;
|
||||
@ -603,9 +835,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||
int emptyCellsFoundFound = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList()
|
||||
.stream()
|
||||
.filter(f -> f.toString().isEmpty())
|
||||
.toList().size();
|
||||
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
row.forEach(r -> System.out.println(r.toString()));
|
||||
@ -620,11 +862,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
||||
TablePageBlock table = document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
|
||||
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
|
||||
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
|
||||
List<Cell> rowsFlattened = rows.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
List<String> valuesFlattened = values.stream()
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < valuesFlattened.size(); i++) {
|
||||
Cell cell = rowsFlattened.get(i);
|
||||
@ -637,7 +888,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
||||
assertThat(document.getSections()
|
||||
.stream()
|
||||
.flatMap(paragraph -> paragraph.getTables()
|
||||
.stream())
|
||||
.toList().size()).isEqualTo(tableSize);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,13 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
@ -26,29 +30,50 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
// @Disabled
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void textRectanglesFromRulingsExtraction() {
|
||||
|
||||
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_CELLS.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
rectanglesPerPage.add(rects);
|
||||
}
|
||||
|
||||
PdfDraw.drawRectanglesPerPage(fileName, rectanglesPerPage, lineFileName, DrawingOptions.builder().stroke(true).strokeColor(Color.RED).build());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void textRulingExtraction() {
|
||||
|
||||
String fileName = "files/211.pdf";
|
||||
String fileName = "files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
}
|
||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
|
||||
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||
|
||||
}
|
||||
|
||||
@ -57,9 +82,6 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testTableExtraction() {
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
ClassPathResource resource = new ClassPathResource("files");
|
||||
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
@ -67,8 +89,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
.map(Path::toString)
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < pdfFileNames.size(); i++) {
|
||||
writeJsons(Path.of(pdfFileNames.get(i)));
|
||||
for (String pdfFileName : pdfFileNames) {
|
||||
writeJsons(Path.of(pdfFileName));
|
||||
}
|
||||
}
|
||||
|
||||
@ -88,13 +110,13 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
filename.toFile().toString()));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString();
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
|
||||
String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString();
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
||||
pdDocument.save(tmpFileNameBefore);
|
||||
}
|
||||
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString();
|
||||
String tmpFileNameAfter = "/tmp/after." + filename.getFileName().toString();
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
|
||||
pdDocument.save(tmpFileNameAfter);
|
||||
@ -105,9 +127,9 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
|
||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) {
|
||||
|
||||
List listStructure1 = structure1.streamAllEntries()
|
||||
List<Table> listStructure1 = structure1.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
@ -117,7 +139,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
})
|
||||
.toList();
|
||||
|
||||
List listStructure2 = structure2.streamAllEntries()
|
||||
List<Table> listStructure2 = structure2.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
@ -128,8 +150,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < listStructure1.size(); i++) {
|
||||
Table tableNode1 = (Table) listStructure1.get(i);
|
||||
Table tableNode2 = (Table) listStructure2.get(i);
|
||||
Table tableNode1 = listStructure1.get(i);
|
||||
Table tableNode2 = listStructure2.get(i);
|
||||
if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -24,20 +24,31 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfDraw {
|
||||
|
||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName, DrawingOptions options) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
options);
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
@ -46,7 +57,7 @@ public class PdfDraw {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
DrawingOptions.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
@ -62,13 +73,13 @@ public class PdfDraw {
|
||||
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
||||
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, DrawingOptions.builder().stroke(true).build());
|
||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||
pdDocument,
|
||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
DrawingOptions.builder().stroke(true).build());
|
||||
}
|
||||
}
|
||||
pdDocument.save(out);
|
||||
@ -99,20 +110,20 @@ public class PdfDraw {
|
||||
|
||||
public static void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
||||
|
||||
Options options = buildStandardOptionsForNodes(entry);
|
||||
DrawingOptions options = buildStandardOptionsForNodes(entry);
|
||||
|
||||
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
|
||||
public static void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) {
|
||||
|
||||
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
||||
}
|
||||
|
||||
|
||||
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
|
||||
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) {
|
||||
|
||||
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
||||
|
||||
@ -120,7 +131,7 @@ public class PdfDraw {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {
|
||||
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options, boolean rotate) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
@ -142,14 +153,14 @@ public class PdfDraw {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
|
||||
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, DrawingOptions options) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
drawRectangle2DList(document, rectCollection, options, pdPage);
|
||||
}
|
||||
|
||||
|
||||
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
|
||||
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, DrawingOptions options, PDPage pdPage) throws IOException {
|
||||
|
||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
@ -181,12 +192,12 @@ public class PdfDraw {
|
||||
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
// pageNumber,
|
||||
// list.get(pageNumber - 1),
|
||||
// PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
// PdfVisualisationUtility.DrawingOptions.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
DrawingOptions.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), DrawingOptions.builder().stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
@ -202,35 +213,18 @@ public class PdfDraw {
|
||||
PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
pageNumber,
|
||||
linesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
DrawingOptions.builder().strokeColor(Color.RED).stroke(true).build());
|
||||
}
|
||||
pdDocument.save(out);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Options {
|
||||
|
||||
boolean stroke;
|
||||
@Builder.Default
|
||||
Color strokeColor = Color.BLACK;
|
||||
@Builder.Default
|
||||
float strokeWidth = 1f;
|
||||
|
||||
boolean fill;
|
||||
@Builder.Default
|
||||
Color fillColor = Color.BLACK;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||
private static DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||
|
||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
@ -243,7 +237,7 @@ public class PdfDraw {
|
||||
}
|
||||
|
||||
|
||||
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
|
||||
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) {
|
||||
|
||||
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||
for (Page page : rectanglesPerPage.keySet()) {
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -17,6 +17,7 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
@ -126,8 +127,8 @@ public class ViewerDocumentService {
|
||||
pdDocument = openPDDocument(tmpFile.toFile());
|
||||
}
|
||||
}
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
pdDocument.close();
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
@ -282,10 +283,12 @@ public class ViewerDocumentService {
|
||||
|
||||
@SneakyThrows
|
||||
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
||||
|
||||
/*
|
||||
Sometimes the viewer document is corrupted after saving and missing the content streams on a random page, for the files we viewed it did not seem to happen with incrementalSave. It might only be a timing issue though
|
||||
*/
|
||||
Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> {
|
||||
try (var out = new FileOutputStream(outputFile)) {
|
||||
pdDocument.save(out);
|
||||
pdDocument.save(out, CompressParameters.NO_COMPRESSION);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user