Compare commits
24 Commits
main
...
release/0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c4c71efadd | ||
|
|
5e88cb9a2d | ||
|
|
45ff220d83 | ||
|
|
f4f01644f7 | ||
|
|
9eaecdf378 | ||
|
|
59745a916c | ||
|
|
0dda309829 | ||
|
|
bfa90c2d79 | ||
|
|
37f7a6a03f | ||
|
|
bdbac18169 | ||
|
|
2addf63baf | ||
|
|
778bae0f7f | ||
|
|
a01958c842 | ||
|
|
fbe9a34343 | ||
|
|
fd7c461c8d | ||
|
|
cafbcbefc6 | ||
|
|
34b260bb60 | ||
|
|
1ca02f72c8 | ||
|
|
350513a699 | ||
|
|
ab7b2cf0d5 | ||
|
|
007cbfd1ee | ||
|
|
a266d98f11 | ||
|
|
33f726c689 | ||
|
|
18a28e82d0 |
@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
@ -94,16 +95,23 @@ public class LayoutParsingPipeline {
|
|||||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
// File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||||
|
// .orElse(originFile);
|
||||||
|
File viewerDocumentFile = originFile;
|
||||||
|
|
||||||
|
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.imagesFileStorageId()
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.tablesFileStorageId()
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||||
@ -142,25 +150,25 @@ public class LayoutParsingPipeline {
|
|||||||
.numberOfPages(documentGraph.getNumberOfPages())
|
.numberOfPages(documentGraph.getNumberOfPages())
|
||||||
.duration(System.currentTimeMillis() - start)
|
.duration(System.currentTimeMillis() - start)
|
||||||
.message(format("""
|
.message(format("""
|
||||||
Layout parsing has finished in %.02f s.
|
Layout parsing has finished in %.02f s.
|
||||||
identifiers: %s
|
identifiers: %s
|
||||||
%s
|
%s
|
||||||
Files have been saved with Ids:
|
Files have been saved with Ids:
|
||||||
Structure: %s
|
Structure: %s
|
||||||
Text: %s
|
Text: %s
|
||||||
Positions: %s
|
Positions: %s
|
||||||
PageData: %s
|
PageData: %s
|
||||||
Simplified Text: %s
|
Simplified Text: %s
|
||||||
Viewer Doc: %s""",
|
Viewer Doc: %s""",
|
||||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||||
layoutParsingRequest.identifier(),
|
layoutParsingRequest.identifier(),
|
||||||
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
layoutParsingRequest.pageFileStorageId(),
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
layoutParsingRequest.simplifiedTextStorageId(),
|
layoutParsingRequest.simplifiedTextStorageId(),
|
||||||
layoutParsingRequest.viewerDocumentStorageId()))
|
layoutParsingRequest.viewerDocumentStorageId()))
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -170,9 +178,9 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
AtomicReference<Document> documentReference = new AtomicReference<>();
|
AtomicReference<Document> documentReference = new AtomicReference<>();
|
||||||
|
|
||||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> {
|
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||||
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument));
|
.contextualName("build-document-graph")
|
||||||
});
|
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
|
||||||
|
|
||||||
return documentReference.get();
|
return documentReference.get();
|
||||||
}
|
}
|
||||||
@ -181,14 +189,14 @@ public class LayoutParsingPipeline {
|
|||||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
numberOfPages,
|
numberOfPages,
|
||||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -243,8 +251,10 @@ public class LayoutParsingPipeline {
|
|||||||
PDRectangle cropbox = pdPage.getCropBox();
|
PDRectangle cropbox = pdPage.getCropBox();
|
||||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||||
|
|
||||||
|
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
};
|
};
|
||||||
@ -319,9 +329,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
|
||||||
}
|
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||||
|
|||||||
@ -6,12 +6,14 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
|||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public abstract class AbstractPageBlock {
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
public abstract class AbstractPageBlock extends Rectangle {
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
protected float minX;
|
protected float minX;
|
||||||
|
|||||||
@ -12,7 +12,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -36,6 +35,8 @@ public class Image implements GenericSemanticNode {
|
|||||||
boolean transparent;
|
boolean transparent;
|
||||||
Rectangle2D position;
|
Rectangle2D position;
|
||||||
|
|
||||||
|
TextBlock leafTextBlock;
|
||||||
|
|
||||||
boolean redaction;
|
boolean redaction;
|
||||||
boolean ignored;
|
boolean ignored;
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
@ -66,7 +67,7 @@ public class Image implements GenericSemanticNode {
|
|||||||
@Override
|
@Override
|
||||||
public TextBlock getTextBlock() {
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
return leafTextBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -92,4 +93,11 @@ public class Image implements GenericSemanticNode {
|
|||||||
return bBoxPerPage;
|
return bBoxPerPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isLeaf() {
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -84,14 +84,16 @@ public class TableCell implements GenericSemanticNode {
|
|||||||
|
|
||||||
private TextBlock buildTextBlock() {
|
private TextBlock buildTextBlock() {
|
||||||
|
|
||||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
return streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||||
|
.map(SemanticNode::getLeafTextBlock)
|
||||||
|
.collect(new TextBlockCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.buildTextBlock().buildSummary();
|
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -36,6 +37,12 @@ public class Cell extends Rectangle {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Cell(Rectangle2D r) {
|
||||||
|
|
||||||
|
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addTextBlock(TextPageBlock textBlock) {
|
public void addTextBlock(TextPageBlock textBlock) {
|
||||||
|
|
||||||
textBlocks.add(textBlock);
|
textBlocks.add(textBlock);
|
||||||
|
|||||||
@ -20,7 +20,8 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
public class Ruling extends Line2D.Float {
|
public class Ruling extends Line2D.Float {
|
||||||
|
|
||||||
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
|
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||||
|
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
|
||||||
|
|
||||||
|
|
||||||
public Ruling(Point2D p1, Point2D p2) {
|
public Ruling(Point2D p1, Point2D p2) {
|
||||||
@ -110,8 +111,8 @@ public class Ruling extends Line2D.Float {
|
|||||||
});
|
});
|
||||||
|
|
||||||
for (Ruling h : horizontals) {
|
for (Ruling h : horizontals) {
|
||||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
||||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Ruling v : verticals) {
|
for (Ruling v : verticals) {
|
||||||
@ -151,7 +152,7 @@ public class Ruling extends Line2D.Float {
|
|||||||
if (i == null) {
|
if (i == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
|
||||||
} catch (UnsupportedOperationException e) {
|
} catch (UnsupportedOperationException e) {
|
||||||
log.info("Some line are oblique, ignoring...");
|
log.info("Some line are oblique, ignoring...");
|
||||||
continue;
|
continue;
|
||||||
@ -267,7 +268,7 @@ public class Ruling extends Line2D.Float {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
|
public boolean nearlyIntersects(Ruling another) {
|
||||||
|
|
||||||
if (this.intersectsLine(another)) {
|
if (this.intersectsLine(another)) {
|
||||||
return true;
|
return true;
|
||||||
@ -276,9 +277,9 @@ public class Ruling extends Line2D.Float {
|
|||||||
boolean rv = false;
|
boolean rv = false;
|
||||||
|
|
||||||
if (this.perpendicularTo(another)) {
|
if (this.perpendicularTo(another)) {
|
||||||
rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
|
rv = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT).intersectsLine(another);
|
||||||
} else {
|
} else {
|
||||||
rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
|
rv = this.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT).intersectsLine(another.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT));
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv;
|
return rv;
|
||||||
@ -319,8 +320,8 @@ public class Ruling extends Line2D.Float {
|
|||||||
|
|
||||||
public Point2D intersectionPoint(Ruling other) {
|
public Point2D intersectionPoint(Ruling other) {
|
||||||
|
|
||||||
Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||||
Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||||
Ruling horizontal, vertical;
|
Ruling horizontal, vertical;
|
||||||
|
|
||||||
if (!this_l.intersectsLine(other_l)) {
|
if (!this_l.intersectsLine(other_l)) {
|
||||||
|
|||||||
@ -1,18 +1,18 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.Setter;
|
import lombok.Setter;
|
||||||
@ -21,7 +21,8 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class TablePageBlock extends AbstractPageBlock {
|
public class TablePageBlock extends AbstractPageBlock {
|
||||||
|
|
||||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
|
||||||
|
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
|
||||||
|
|
||||||
private final int rotation;
|
private final int rotation;
|
||||||
@Getter
|
@Getter
|
||||||
@ -30,10 +31,14 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
private int unrotatedRowCount;
|
private int unrotatedRowCount;
|
||||||
private int unrotatedColCount;
|
private int unrotatedColCount;
|
||||||
private List<List<Cell>> rows;
|
private List<List<Cell>> rows;
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
private List<Cell> cells;
|
||||||
|
|
||||||
|
|
||||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||||
|
|
||||||
|
this.cells = cells;
|
||||||
addCells(cells);
|
addCells(cells);
|
||||||
minX = area.getLeft();
|
minX = area.getLeft();
|
||||||
minY = area.getBottom();
|
minY = area.getBottom();
|
||||||
@ -50,6 +55,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
return getColCount() == 0 || getRowCount() == 0;
|
return getColCount() == 0 || getRowCount() == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<List<Cell>> getRows() {
|
public List<List<Cell>> getRows() {
|
||||||
|
|
||||||
if (rows == null) {
|
if (rows == null) {
|
||||||
@ -80,14 +86,17 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
public int getColCount() {
|
public int getColCount() {
|
||||||
|
|
||||||
return getRows().stream().mapToInt(List::size).max().orElse(0);
|
return getRows().stream()
|
||||||
|
.mapToInt(List::size)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Detect header cells (either first row or first column):
|
* Detect header cells (either first row or first column):
|
||||||
* Column is marked as header if cell text is bold and row cell text is not bold.
|
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
|
||||||
* Defaults to row.
|
* Defaults to row.
|
||||||
*/
|
*/
|
||||||
private void computeHeaders() {
|
private void computeHeaders() {
|
||||||
@ -95,7 +104,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
if (rows == null) {
|
if (rows == null) {
|
||||||
rows = computeRows();
|
rows = computeRows();
|
||||||
}
|
}
|
||||||
// A bold cell is a header cell as long as every cell to the left/top is bold, too
|
// A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too
|
||||||
// we move from left to right and top to bottom
|
// we move from left to right and top to bottom
|
||||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||||
List<Cell> rowCells = rows.get(rowIndex);
|
List<Cell> rowCells = rows.get(rowIndex);
|
||||||
@ -120,7 +129,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||||
for (int i = 0; i < rowIndex; i++) {
|
for (int i = 0; i < rowIndex; i++) {
|
||||||
try {
|
try {
|
||||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
cellsToTheTop.add(rows.get(i)
|
||||||
|
.get(colIndex));
|
||||||
} catch (IndexOutOfBoundsException e) {
|
} catch (IndexOutOfBoundsException e) {
|
||||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||||
}
|
}
|
||||||
@ -135,7 +145,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
if (lastHeaderCell != null) {
|
if (lastHeaderCell != null) {
|
||||||
cell.getHeaderCells().add(lastHeaderCell);
|
cell.getHeaderCells().add(lastHeaderCell);
|
||||||
}
|
}
|
||||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
|
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
|
||||||
|
.get(0).getMostPopularWordStyle().equals("bold")) {
|
||||||
cell.setHeaderCell(true);
|
cell.setHeaderCell(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -151,7 +162,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||||
List<Cell> lastRow = new ArrayList<>();
|
List<Cell> lastRow = new ArrayList<>();
|
||||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||||
Cell cell = cells.get(new CellPosition(j, i));
|
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||||
if (cell != null) {
|
if (cell != null) {
|
||||||
lastRow.add(cell);
|
lastRow.add(cell);
|
||||||
}
|
}
|
||||||
@ -162,7 +173,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||||
List<Cell> lastRow = new ArrayList<>();
|
List<Cell> lastRow = new ArrayList<>();
|
||||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||||
Cell cell = cells.get(new CellPosition(j, i));
|
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||||
if (cell != null) {
|
if (cell != null) {
|
||||||
lastRow.add(cell);
|
lastRow.add(cell);
|
||||||
}
|
}
|
||||||
@ -173,7 +184,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||||
List<Cell> lastRow = new ArrayList<>();
|
List<Cell> lastRow = new ArrayList<>();
|
||||||
for (int j = 0; j < unrotatedColCount; j++) {
|
for (int j = 0; j < unrotatedColCount; j++) {
|
||||||
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||||
if (cell != null) {
|
if (cell != null) {
|
||||||
lastRow.add(cell);
|
lastRow.add(cell);
|
||||||
}
|
}
|
||||||
@ -187,17 +198,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void add(Cell chunk, int row, int col) {
|
|
||||||
|
|
||||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
|
||||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
|
||||||
|
|
||||||
CellPosition cp = new CellPosition(row, col);
|
|
||||||
cells.put(cp, chunk);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void addCells(List<Cell> cells) {
|
private void addCells(List<Cell> cells) {
|
||||||
|
|
||||||
if (cells.isEmpty()) {
|
if (cells.isEmpty()) {
|
||||||
@ -206,11 +206,12 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
|
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
|
||||||
|
|
||||||
List<List<Cell>> rowsOfCells = calculateStructure(cells);
|
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
|
||||||
|
|
||||||
for (int i = 0; i < rowsOfCells.size(); i++) {
|
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
|
||||||
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
|
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
|
||||||
add(rowsOfCells.get(i).get(j), i, j);
|
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
|
||||||
|
.get(j), i, j);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -221,29 +222,36 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||||
*
|
*
|
||||||
* @param cells The found cells
|
* @param cells The found cells
|
||||||
* @return TablePageBlock Structure
|
* @return TablePageBlock Structure as a rows of cells matrix
|
||||||
*/
|
*/
|
||||||
private List<List<Cell>> calculateStructure(List<Cell> cells) {
|
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
|
||||||
|
|
||||||
List<List<Cell>> matrix = new ArrayList<>();
|
|
||||||
|
|
||||||
if (cells.isEmpty()) {
|
if (cells.isEmpty()) {
|
||||||
return matrix;
|
return new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<Float> uniqueX = new HashSet<>();
|
Set<Float> uniqueX = new HashSet<>();
|
||||||
Set<Float> uniqueY = new HashSet<>();
|
Set<Float> uniqueY = new HashSet<>();
|
||||||
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
|
cells.stream()
|
||||||
uniqueX.add(c.getLeft());
|
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||||
uniqueX.add(c.getRight());
|
.forEach(c -> {
|
||||||
uniqueY.add(c.getBottom());
|
uniqueX.add(c.getLeft());
|
||||||
uniqueY.add(c.getTop());
|
uniqueX.add(c.getRight());
|
||||||
});
|
uniqueY.add(c.getBottom());
|
||||||
|
uniqueY.add(c.getTop());
|
||||||
|
});
|
||||||
|
|
||||||
var sortedUniqueX = uniqueX.stream().sorted().toList();
|
var sortedUniqueX = uniqueX.stream()
|
||||||
var sortedUniqueY = uniqueY.stream().sorted().toList();
|
.sorted()
|
||||||
|
.toList();
|
||||||
|
var sortedUniqueY = uniqueY.stream()
|
||||||
|
.sorted()
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||||
|
|
||||||
Float prevY = null;
|
Float prevY = null;
|
||||||
|
|
||||||
for (Float y : sortedUniqueY) {
|
for (Float y : sortedUniqueY) {
|
||||||
|
|
||||||
List<Cell> row = new ArrayList<>();
|
List<Cell> row = new ArrayList<>();
|
||||||
@ -252,44 +260,87 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
for (Float x : sortedUniqueX) {
|
for (Float x : sortedUniqueX) {
|
||||||
|
|
||||||
if (prevY != null && prevX != null) {
|
if (prevY != null && prevX != null) {
|
||||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||||
|
|
||||||
var intersectionCell = cells.stream().filter(c -> intersects(cell, c)).findFirst();
|
if (cellFromGridStructure.hasMinimumSize()) {
|
||||||
|
|
||||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
cells.stream()
|
||||||
if (cell.hasMinimumSize()) {
|
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
|
||||||
row.add(cell);
|
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||||
|
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||||
|
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||||
|
.map(CellWithIntersection::originalCell)
|
||||||
|
.ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));
|
||||||
|
|
||||||
|
row.add(cellFromGridStructure);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
prevX = x;
|
prevX = x;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prevY != null && prevX != null && !row.isEmpty()) {
|
// exclude empty rows and rows where all text blocks are empty
|
||||||
matrix.add(row);
|
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
|
||||||
|
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||||
|
|
||||||
|
rowsOfCells.add(row);
|
||||||
}
|
}
|
||||||
prevY = y;
|
prevY = y;
|
||||||
}
|
}
|
||||||
|
|
||||||
Collections.reverse(matrix);
|
Collections.reverse(rowsOfCells);
|
||||||
|
|
||||||
return matrix;
|
// now remove the cells that belong to a column without any text blocks
|
||||||
}
|
// this is done by first computing the transposed matrix, which contains all columns of cells
|
||||||
|
// then the column indices that have to be removed are determined
|
||||||
|
List<List<Cell>> columnsOfCells = new ArrayList<>();
|
||||||
|
int maxRowLength = rowsOfCells.stream()
|
||||||
public boolean intersects(Cell cell1, Cell cell2) {
|
.map(List::size)
|
||||||
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
|
.max(java.util.Comparator.naturalOrder())
|
||||||
return false;
|
.orElse(0);
|
||||||
|
for (int i = 0; i < maxRowLength; i++) {
|
||||||
|
columnsOfCells.add(new ArrayList<>());
|
||||||
}
|
}
|
||||||
double x0 = cell1.getX() + 2;
|
|
||||||
double y0 = cell1.getY() + 2;
|
for (List<Cell> row : rowsOfCells) {
|
||||||
return (cell2.x + cell2.width > x0 &&
|
for (int j = 0; j < row.size(); j++) {
|
||||||
cell2.y + cell2.height > y0 &&
|
columnsOfCells.get(j).add(row.get(j));
|
||||||
cell2.x < x0 + cell1.getWidth() -2 &&
|
}
|
||||||
cell2.y < y0 + cell1.getHeight() -2);
|
}
|
||||||
|
|
||||||
|
List<Integer> columnIndicesToRemove = new ArrayList<>();
|
||||||
|
int columnIndex = 0;
|
||||||
|
for (List<Cell> col : columnsOfCells) {
|
||||||
|
if (col.stream()
|
||||||
|
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||||
|
columnIndicesToRemove.add(columnIndex);
|
||||||
|
}
|
||||||
|
columnIndex++;
|
||||||
|
}
|
||||||
|
columnIndicesToRemove.sort(Collections.reverseOrder());
|
||||||
|
|
||||||
|
// update all rows so that the values of the empty columns get removed
|
||||||
|
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
|
||||||
|
rowsOfCells = new ArrayList<>();
|
||||||
|
for (List<Cell> row : rowsOfCellsBefore) {
|
||||||
|
var updatedRow = new ArrayList<>(row);
|
||||||
|
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
|
||||||
|
rowsOfCells.add(updatedRow);
|
||||||
|
}
|
||||||
|
|
||||||
|
return rowsOfCells;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addCellToRowAndCol(Cell cell, int row, int col) {
|
||||||
|
|
||||||
|
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||||
|
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||||
|
|
||||||
|
CellPosition cp = new CellPosition(row, col);
|
||||||
|
cellTreeMap.put(cp, cell);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getText() {
|
public String getText() {
|
||||||
@ -314,7 +365,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
if (!first) {
|
if (!first) {
|
||||||
sb.append("\n");
|
sb.append("\n");
|
||||||
}
|
}
|
||||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
|
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"');
|
||||||
first = false;
|
first = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -328,8 +379,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public String getTextAsHtml() {
|
public String getTextAsHtml() {
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
@ -363,4 +412,9 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
record CellWithIntersection(Cell originalCell, double intersectedArea) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -1,7 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
import org.springframework.beans.BeanUtils;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
|
|
||||||
@ -50,7 +49,13 @@ public class RedTextPosition {
|
|||||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||||
|
|
||||||
var pos = new RedTextPosition();
|
var pos = new RedTextPosition();
|
||||||
BeanUtils.copyProperties(textPosition, pos);
|
pos.setRotation(textPosition.getRotation());
|
||||||
|
pos.setPageHeight(textPosition.getPageHeight());
|
||||||
|
pos.setPageWidth(textPosition.getPageWidth());
|
||||||
|
pos.setUnicode(textPosition.getUnicode());
|
||||||
|
pos.setDir(textPosition.getDir());
|
||||||
|
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
|
||||||
|
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||||
pos.setFontName(textPosition.getFont().getName());
|
pos.setFontName(textPosition.getFont().getName());
|
||||||
|
|
||||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||||
|
|||||||
@ -190,6 +190,12 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public float getTextHeightNoPadding() {
|
||||||
|
|
||||||
|
return textPositions.get(0).getHeightDir();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public float getTextHeight() {
|
public float getTextHeight() {
|
||||||
|
|
||||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||||
|
|||||||
@ -1,21 +1,21 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
import java.awt.geom.Line2D;
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@ -25,25 +25,145 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RulingCleaningService {
|
public class RulingCleaningService {
|
||||||
|
|
||||||
private static final float THRESHOLD = 6;
|
private static final float THRESHOLD_X_VERTICAL = 1;
|
||||||
|
private static final float THRESHOLD_Y_VERTICAL = 2;
|
||||||
|
private static final float THRESHOLD_X_HORIZONTAL = 2;
|
||||||
|
private static final float THRESHOLD_Y_HORIZONTAL = 3;
|
||||||
|
|
||||||
|
|
||||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||||
|
|
||||||
|
Rulings verticalAndHorizontalRulingLines;
|
||||||
|
|
||||||
if (!rulings.isEmpty()) {
|
if (!rulings.isEmpty()) {
|
||||||
snapPoints(rulings);
|
verticalAndHorizontalRulingLines = extractVerticalAndHorizontalRulingLines(rulings);
|
||||||
|
} else {
|
||||||
|
verticalAndHorizontalRulingLines = getRulingsFromParsedCells(tableCells);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
verticalAndHorizontalRulingLines.verticalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||||
|
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||||
|
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
|
||||||
|
|
||||||
|
return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Rulings cleanRulings(Rulings rulings) {
|
||||||
|
|
||||||
|
List<List<Rectangle>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||||
|
.map(RulingCleaningService::getOverlapRectangle)
|
||||||
|
.distinct()
|
||||||
|
.toList());
|
||||||
|
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||||
|
.map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
List<List<Rectangle>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||||
|
.map(RulingCleaningService::getOverlapRectangle)
|
||||||
|
.distinct()
|
||||||
|
.toList());
|
||||||
|
|
||||||
|
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
|
||||||
|
.map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
|
||||||
|
|
||||||
|
UnionFind<Rectangle> unionFind = new UnionFind<>();
|
||||||
|
for (int i = 0; i < rectangles.size(); i++) {
|
||||||
|
for (int j = i + 1; j < rectangles.size(); j++) {
|
||||||
|
Rectangle rectangle1 = rectangles.get(i);
|
||||||
|
Rectangle rectangle2 = rectangles.get(j);
|
||||||
|
|
||||||
|
// we can stop early when we are too far off because of x-y-sorting
|
||||||
|
if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rectangle1.intersects(rectangle2)) {
|
||||||
|
unionFind.union(rectangle1, rectangle2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<Rectangle, List<Rectangle>> groups = new HashMap<>();
|
||||||
|
for (Rectangle rectangle : rectangles) {
|
||||||
|
Rectangle root = unionFind.find(rectangle);
|
||||||
|
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
|
||||||
|
}
|
||||||
|
return new ArrayList<>(groups.values());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Rectangle getOverlapRectangle(Ruling ruling) {
|
||||||
|
|
||||||
|
float top;
|
||||||
|
float left;
|
||||||
|
float w;
|
||||||
|
float h;
|
||||||
|
|
||||||
|
if (ruling.x1 < ruling.x2) {
|
||||||
|
left = ruling.x1;
|
||||||
|
w = ruling.x2 - ruling.x1;
|
||||||
|
} else {
|
||||||
|
left = ruling.x2;
|
||||||
|
w = ruling.x1 - ruling.x2;
|
||||||
|
}
|
||||||
|
if (ruling.y1 < ruling.y2) {
|
||||||
|
top = ruling.y1;
|
||||||
|
h = ruling.y2 - ruling.y1;
|
||||||
|
} else {
|
||||||
|
top = ruling.y2;
|
||||||
|
h = ruling.y1 - ruling.y2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ruling.horizontal()) {
|
||||||
|
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||||
|
} else {
|
||||||
|
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Ruling getXCenteredRuling(Rectangle rectangle) {
|
||||||
|
|
||||||
|
float x = (float) rectangle.getCenterX();
|
||||||
|
float y1 = rectangle.getTop();
|
||||||
|
float y2 = rectangle.getBottom();
|
||||||
|
|
||||||
|
Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL);
|
||||||
|
Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL);
|
||||||
|
|
||||||
|
return new Ruling(point1, point2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Ruling getYCenteredRuling(Rectangle rectangle) {
|
||||||
|
|
||||||
|
float x1 = rectangle.getLeft();
|
||||||
|
float x2 = rectangle.getRight();
|
||||||
|
float y = (float) rectangle.getCenterY();
|
||||||
|
|
||||||
|
Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y);
|
||||||
|
Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y);
|
||||||
|
|
||||||
|
return new Ruling(point1, point2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Rulings extractVerticalAndHorizontalRulingLines(List<Ruling> rulings) {
|
||||||
|
|
||||||
List<Ruling> vrs = new ArrayList<>();
|
List<Ruling> vrs = new ArrayList<>();
|
||||||
for (Ruling vr : rulings) {
|
for (Ruling vr : rulings) {
|
||||||
if (vr.vertical()) {
|
if (vr.vertical()) {
|
||||||
vrs.add(vr);
|
vrs.add(vr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (vrs.isEmpty()) {
|
|
||||||
vrs.addAll(extractVerticalRulings(tableCells));
|
|
||||||
}
|
|
||||||
List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
|
|
||||||
|
|
||||||
List<Ruling> hrs = new ArrayList<>();
|
List<Ruling> hrs = new ArrayList<>();
|
||||||
for (Ruling hr : rulings) {
|
for (Ruling hr : rulings) {
|
||||||
@ -51,98 +171,26 @@ public class RulingCleaningService {
|
|||||||
hrs.add(hr);
|
hrs.add(hr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (hrs.isEmpty()) {
|
return new Rulings(vrs, hrs);
|
||||||
hrs.addAll(extractHorizontalRulings(tableCells));
|
|
||||||
}
|
|
||||||
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
|
|
||||||
|
|
||||||
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void snapPoints(List<? extends Line2D.Float> rulings) {
|
private Rulings getRulingsFromParsedCells(List<TableCells> tableCells) {
|
||||||
|
|
||||||
// collect points and keep a Line -> p1,p2 map
|
List<Ruling> vrs = extractVerticalRulingsFromParsedCells(tableCells);
|
||||||
Map<Line2D.Float, Point2D[]> linesToPoints = new HashMap<>();
|
List<Ruling> hrs = extractHorizontalRulingsFromParsedCells(tableCells);
|
||||||
List<Point2D> points = new ArrayList<>();
|
return new Rulings(vrs, hrs);
|
||||||
for (Line2D.Float r : rulings) {
|
|
||||||
Point2D p1 = r.getP1();
|
|
||||||
Point2D p2 = r.getP2();
|
|
||||||
linesToPoints.put(r, new Point2D[]{p1, p2});
|
|
||||||
points.add(p1);
|
|
||||||
points.add(p2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// snap by X
|
|
||||||
points.sort(Comparator.comparingDouble(Point2D::getX));
|
|
||||||
|
|
||||||
List<List<Point2D>> groupedPoints = new ArrayList<>();
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
|
||||||
|
|
||||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
|
||||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
|
||||||
if (Math.abs(p.getX() - last.get(0).getX()) < THRESHOLD) {
|
|
||||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
|
||||||
} else {
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (List<Point2D> group : groupedPoints) {
|
|
||||||
float avgLoc = 0;
|
|
||||||
for (Point2D p : group) {
|
|
||||||
avgLoc += p.getX();
|
|
||||||
}
|
|
||||||
avgLoc /= group.size();
|
|
||||||
for (Point2D p : group) {
|
|
||||||
p.setLocation(avgLoc, p.getY());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// ---
|
|
||||||
|
|
||||||
// snap by Y
|
|
||||||
points.sort(Comparator.comparingDouble(Point2D::getY));
|
|
||||||
|
|
||||||
groupedPoints = new ArrayList<>();
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
|
||||||
|
|
||||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
|
||||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
|
||||||
if (Math.abs(p.getY() - last.get(0).getY()) < THRESHOLD) {
|
|
||||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
|
||||||
} else {
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (List<Point2D> group : groupedPoints) {
|
|
||||||
float avgLoc = 0;
|
|
||||||
for (Point2D p : group) {
|
|
||||||
avgLoc += p.getY();
|
|
||||||
}
|
|
||||||
avgLoc /= group.size();
|
|
||||||
for (Point2D p : group) {
|
|
||||||
p.setLocation(p.getX(), avgLoc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// ---
|
|
||||||
|
|
||||||
// finally, modify lines
|
|
||||||
for (Map.Entry<Line2D.Float, Point2D[]> ltp : linesToPoints.entrySet()) {
|
|
||||||
Point2D[] p = ltp.getValue();
|
|
||||||
ltp.getKey().setLine(p[0], p[1]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Collection<? extends Ruling> extractVerticalRulings(List<TableCells> cvParsedTableCells) {
|
private List<Ruling> extractVerticalRulingsFromParsedCells(List<TableCells> tableCells) {
|
||||||
|
|
||||||
List<Ruling> vrs = new ArrayList<>();
|
List<Ruling> vrs = new ArrayList<>();
|
||||||
|
|
||||||
if (cvParsedTableCells != null) {
|
if (tableCells != null) {
|
||||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
for (TableCells tableCell : tableCells) {
|
||||||
Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
Ruling leftLine = createRuling(tableCell.getX0(), tableCell.getX0(), tableCell.getY0(), tableCell.getY1());
|
||||||
Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
Ruling rightLine = createRuling(tableCell.getX1(), tableCell.getX1(), tableCell.getY0(), tableCell.getY1());
|
||||||
vrs.add(leftLine);
|
vrs.add(leftLine);
|
||||||
vrs.add(rightLine);
|
vrs.add(rightLine);
|
||||||
}
|
}
|
||||||
@ -151,19 +199,18 @@ public class RulingCleaningService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Collection<? extends Ruling> extractHorizontalRulings(List<TableCells> cvParsedTableCells) {
|
private List<Ruling> extractHorizontalRulingsFromParsedCells(List<TableCells> tableCells) {
|
||||||
|
|
||||||
List<Ruling> hrs = new ArrayList<>();
|
List<Ruling> hrs = new ArrayList<>();
|
||||||
|
|
||||||
if (cvParsedTableCells != null) {
|
if (tableCells != null) {
|
||||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
for (TableCells tableCell : tableCells) {
|
||||||
Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
|
Ruling topLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY1(), tableCell.getY1());
|
||||||
Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
|
Ruling baseLine = createRuling(tableCell.getX0(), tableCell.getX1(), tableCell.getY0(), tableCell.getY0());
|
||||||
hrs.add(topLine);
|
hrs.add(topLine);
|
||||||
hrs.add(baseLine);
|
hrs.add(baseLine);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return hrs;
|
return hrs;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -189,46 +236,8 @@ public class RulingCleaningService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
|
private record Rulings(List<Ruling> verticalLines, List<Ruling> horizontalLines) {
|
||||||
|
|
||||||
int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
|
|
||||||
return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
|
|
||||||
|
|
||||||
ArrayList<Ruling> rv = new ArrayList<>();
|
|
||||||
lines.sort((a, b) -> {
|
|
||||||
final float diff = a.getPosition() - b.getPosition();
|
|
||||||
return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
|
|
||||||
});
|
|
||||||
|
|
||||||
for (Ruling next_line : lines) {
|
|
||||||
Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
|
|
||||||
// if current line colinear with next, and are "close enough": expand current line
|
|
||||||
if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
|
|
||||||
final float lastStart = last.getStart();
|
|
||||||
final float lastEnd = last.getEnd();
|
|
||||||
|
|
||||||
final boolean lastFlipped = lastStart > lastEnd;
|
|
||||||
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
|
|
||||||
|
|
||||||
boolean differentDirections = nextFlipped != lastFlipped;
|
|
||||||
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
|
|
||||||
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
|
|
||||||
|
|
||||||
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
|
|
||||||
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
|
|
||||||
last.setStartEnd(newStart, newEnd);
|
|
||||||
assert !last.oblique();
|
|
||||||
} else if (next_line.length() == 0) {
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
rv.add(next_line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -43,7 +43,6 @@ public class SectionsBuilderService {
|
|||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
List<TextPageBlock> header = new ArrayList<>();
|
List<TextPageBlock> header = new ArrayList<>();
|
||||||
List<TextPageBlock> footer = new ArrayList<>();
|
List<TextPageBlock> footer = new ArrayList<>();
|
||||||
List<TextPageBlock> unclassifiedText = new ArrayList<>();
|
|
||||||
for (AbstractPageBlock current : page.getTextBlocks()) {
|
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||||
|
|
||||||
if (current.getClassification() == null) {
|
if (current.getClassification() == null) {
|
||||||
@ -62,11 +61,6 @@ public class SectionsBuilderService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.getClassification().equals(PageBlockType.OTHER)) {
|
|
||||||
unclassifiedText.add((TextPageBlock) current);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
|
if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) {
|
||||||
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||||
chunkBlock.setHeadline(lastHeadline);
|
chunkBlock.setHeadline(lastHeadline);
|
||||||
@ -94,9 +88,6 @@ public class SectionsBuilderService {
|
|||||||
if (!footer.isEmpty()) {
|
if (!footer.isEmpty()) {
|
||||||
footers.add(new ClassificationFooter(footer));
|
footers.add(new ClassificationFooter(footer));
|
||||||
}
|
}
|
||||||
if (!unclassifiedText.isEmpty()) {
|
|
||||||
unclassifiedTexts.add(new UnclassifiedText(unclassifiedText));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||||
|
|||||||
@ -1,14 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
@ -21,59 +20,15 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class TableExtractionService {
|
public class TableExtractionService {
|
||||||
|
|
||||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||||
|
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
|
||||||
int rv = 0;
|
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
|
||||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
|
||||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
|
||||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
|
||||||
|
|
||||||
if (arg0X > arg1X) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0X < arg1X) {
|
|
||||||
rv = -1;
|
|
||||||
} else if (arg0Y > arg1Y) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0Y < arg1Y) {
|
|
||||||
rv = -1;
|
|
||||||
}
|
|
||||||
return rv;
|
|
||||||
};
|
|
||||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
|
||||||
|
|
||||||
int rv = 0;
|
|
||||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
|
||||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
|
||||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
|
||||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
|
||||||
|
|
||||||
if (arg0Y > arg1Y) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0Y < arg1Y) {
|
|
||||||
rv = -1;
|
|
||||||
} else if (arg0X > arg1X) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0X < arg1X) {
|
|
||||||
rv = -1;
|
|
||||||
}
|
|
||||||
return rv;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(Cell cell, double x, double y, double w, double h) {
|
|
||||||
|
|
||||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
double x0 = cell.getX();
|
|
||||||
double y0 = cell.getY();
|
|
||||||
return (x >= x0 - 2 && y >= y0 - 2 && (x + w) <= x0 + cell.getWidth() + 2 && (y + h) <= y0 + cell.getHeight() + 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -89,22 +44,18 @@ public class TableExtractionService {
|
|||||||
* @param cleanRulings The lines used to build the table.
|
* @param cleanRulings The lines used to build the table.
|
||||||
* @param page Page object that contains textblocks and statistics.
|
* @param page Page object that contains textblocks and statistics.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
||||||
|
|
||||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
cells.sort(CELL_SIZE_COMPARATOR);
|
||||||
|
|
||||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||||
for (Cell cell : cells) {
|
for (Cell cell : cells) {
|
||||||
if (cell.hasMinimumSize() && contains(cell,
|
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
|
||||||
textBlock.getPdfMinX(),
|
|
||||||
textBlock.getPdfMinY(),
|
|
||||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
|
||||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
|
||||||
cell.addTextBlock(textBlock);
|
cell.addTextBlock(textBlock);
|
||||||
toBeRemoved.add(textBlock);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -113,245 +64,94 @@ public class TableExtractionService {
|
|||||||
cells = new ArrayList<>(new HashSet<>(cells));
|
cells = new ArrayList<>(new HashSet<>(cells));
|
||||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||||
|
|
||||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
|
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||||
|
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||||
|
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||||
|
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||||
|
|
||||||
List<TablePageBlock> tables = new ArrayList<>();
|
List<TablePageBlock> tables = new ArrayList<>();
|
||||||
for (Rectangle area : spreadsheetAreas) {
|
for (Rectangle area : spreadsheetAreas) {
|
||||||
|
|
||||||
List<Cell> overlappingCells = new ArrayList<>();
|
List<Cell> containedCells = new ArrayList<>();
|
||||||
for (Cell c : cells) {
|
for (Cell c : cells) {
|
||||||
if (c.hasMinimumSize() && c.intersects(area)) {
|
if (c.hasMinimumSize() && area.contains(c)) {
|
||||||
overlappingCells.add(c);
|
containedCells.add(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
|
|
||||||
|
var containedCellsWithText = containedCells.stream()
|
||||||
|
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
// verify if table would contain fewer cells with text than the threshold allows
|
||||||
|
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||||
|
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
|
||||||
|
cells.removeAll(containedCells);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (TablePageBlock table : tables) {
|
for (TablePageBlock table : tables) {
|
||||||
int position = -1;
|
int position = -1;
|
||||||
|
|
||||||
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
|
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||||
while (itty.hasNext()) {
|
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||||
AbstractPageBlock textBlock = itty.next();
|
position = page.getTextBlocks().indexOf(pageBlock);
|
||||||
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
|
||||||
position = page.getTextBlocks().indexOf(textBlock);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (position != -1) {
|
if (position != -1) {
|
||||||
page.getTextBlocks().add(position, table);
|
page.getTextBlocks().add(position, table);
|
||||||
|
|
||||||
|
var toBeRemoved = table.getCells()
|
||||||
|
.stream()
|
||||||
|
.map(Cell::getTextBlocks)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
|
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||||
|
page.getTextBlocks().removeAll(toBeRemoved);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
page.getTextBlocks().removeAll(toBeRemoved);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
|
||||||
|
|
||||||
// Fix for 211.pdf
|
if(containedCells.size() <= 2) {
|
||||||
for (Ruling r : horizontalRulingLines) {
|
return true;
|
||||||
if (r.getX2() < r.getX1()) {
|
|
||||||
double a = r.getX2();
|
|
||||||
r.x2 = (float) r.getX1();
|
|
||||||
r.x1 = (float) a;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Cell> cellsFound = new ArrayList<>();
|
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
.map(Rectangle::getWidth)
|
||||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
.map(size -> Math.round(size / 10.0) * 10)
|
||||||
intersectionPointsList.sort(POINT_COMPARATOR);
|
.collect(Collectors.groupingBy(Long::longValue));
|
||||||
|
|
||||||
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD;
|
||||||
Point2D topLeft = intersectionPointsList.get(i);
|
|
||||||
Ruling[] hv = intersectionPoints.get(topLeft);
|
|
||||||
|
|
||||||
// CrossingPointsDirectlyBelow( topLeft );
|
|
||||||
List<Point2D> xPoints = new ArrayList<>();
|
|
||||||
// CrossingPointsDirectlyToTheRight( topLeft );
|
|
||||||
List<Point2D> yPoints = new ArrayList<>();
|
|
||||||
|
|
||||||
for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
|
|
||||||
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
|
|
||||||
xPoints.add(p);
|
|
||||||
}
|
|
||||||
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
|
|
||||||
yPoints.add(p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
outer:
|
|
||||||
for (Point2D xPoint : xPoints) {
|
|
||||||
// is there a vertical edge b/w topLeft and xPoint?
|
|
||||||
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (Point2D yPoint : yPoints) {
|
|
||||||
// is there an horizontal edge b/w topLeft and yPoint ?
|
|
||||||
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
|
||||||
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
|
|
||||||
intersectionPoints.get(yPoint)[1])) {
|
|
||||||
cellsFound.add(new Cell(topLeft, btmRight));
|
|
||||||
break outer;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
|
|
||||||
// that aren't connected with an horizontal ruler?
|
|
||||||
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
|
|
||||||
|
|
||||||
return cellsFound;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
|
||||||
List<Rectangle> rectangles = new ArrayList<>();
|
|
||||||
Set<Point2D> pointSet = new HashSet<>();
|
|
||||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
|
||||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
|
||||||
int i = 0;
|
|
||||||
|
|
||||||
for (Rectangle cell : cells) {
|
double x = textBlock.getPdfMinX();
|
||||||
for (Point2D pt : cell.getPoints()) {
|
double y = textBlock.getPdfMinY();
|
||||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
|
||||||
pointSet.remove(pt);
|
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
|
||||||
} else {
|
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||||
pointSet.add(pt);
|
return false;
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
double x0 = cell.getX();
|
||||||
// X first sort
|
double y0 = cell.getY();
|
||||||
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
|
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||||
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
|
&& y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||||
// Y first sort
|
&& (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE
|
||||||
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
|
&& (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
|
||||||
pointsSortY.sort(POINT_COMPARATOR);
|
|
||||||
|
|
||||||
while (i < pointSet.size()) {
|
|
||||||
float currY = (float) pointsSortY.get(i).getY();
|
|
||||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
|
|
||||||
edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
|
|
||||||
edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
i = 0;
|
|
||||||
while (i < pointSet.size()) {
|
|
||||||
float currX = (float) pointsSortX.get(i).getX();
|
|
||||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
|
|
||||||
edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
|
|
||||||
edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get all the polygons
|
|
||||||
List<List<PolygonVertex>> polygons = new ArrayList<>();
|
|
||||||
Point2D nextVertex;
|
|
||||||
while (!edgesH.isEmpty()) {
|
|
||||||
ArrayList<PolygonVertex> polygon = new ArrayList<>();
|
|
||||||
Point2D first = edgesH.keySet().iterator().next();
|
|
||||||
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
|
|
||||||
edgesH.remove(first);
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
PolygonVertex curr = polygon.get(polygon.size() - 1);
|
|
||||||
PolygonVertex lastAddedVertex;
|
|
||||||
if (curr.direction == Direction.HORIZONTAL) {
|
|
||||||
nextVertex = edgesV.get(curr.point);
|
|
||||||
edgesV.remove(curr.point);
|
|
||||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
|
|
||||||
} else {
|
|
||||||
nextVertex = edgesH.get(curr.point);
|
|
||||||
edgesH.remove(curr.point);
|
|
||||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
|
|
||||||
}
|
|
||||||
polygon.add(lastAddedVertex);
|
|
||||||
|
|
||||||
if (lastAddedVertex.equals(polygon.get(0))) {
|
|
||||||
// closed polygon
|
|
||||||
polygon.remove(polygon.size() - 1);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (PolygonVertex vertex : polygon) {
|
|
||||||
edgesH.remove(vertex.point);
|
|
||||||
edgesV.remove(vertex.point);
|
|
||||||
}
|
|
||||||
polygons.add(polygon);
|
|
||||||
}
|
|
||||||
|
|
||||||
// calculate grid-aligned minimum area rectangles for each found polygon
|
|
||||||
for (List<PolygonVertex> poly : polygons) {
|
|
||||||
float top = Float.MAX_VALUE;
|
|
||||||
float left = Float.MAX_VALUE;
|
|
||||||
float bottom = Float.MIN_VALUE;
|
|
||||||
float right = Float.MIN_VALUE;
|
|
||||||
for (PolygonVertex pt : poly) {
|
|
||||||
top = (float) Math.min(top, pt.point.getY());
|
|
||||||
left = (float) Math.min(left, pt.point.getX());
|
|
||||||
bottom = (float) Math.max(bottom, pt.point.getY());
|
|
||||||
right = (float) Math.max(right, pt.point.getX());
|
|
||||||
}
|
|
||||||
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
|
|
||||||
}
|
|
||||||
|
|
||||||
return rectangles;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private enum Direction {
|
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
HORIZONTAL,
|
|
||||||
VERTICAL
|
|
||||||
}
|
|
||||||
|
|
||||||
static class PolygonVertex {
|
|
||||||
|
|
||||||
Point2D point;
|
|
||||||
Direction direction;
|
|
||||||
|
|
||||||
|
|
||||||
PolygonVertex(Point2D point, Direction direction) {
|
|
||||||
|
|
||||||
this.direction = direction;
|
|
||||||
this.point = point;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object other) {
|
|
||||||
|
|
||||||
if (this == other) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (!(other instanceof PolygonVertex)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return this.point.equals(((PolygonVertex) other).point);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
|
|
||||||
return this.point.hashCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||||
|
.stream()
|
||||||
|
.map(Cell::new)
|
||||||
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -61,8 +61,8 @@ public class DocuMineBlockificationService {
|
|||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||||
|
|
||||||
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
||||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
|
|||||||
|
|
||||||
import static java.util.stream.Collectors.toSet;
|
import static java.util.stream.Collectors.toSet;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
@ -13,10 +14,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
|
|
||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
@ -31,12 +35,12 @@ public class RedactManagerBlockificationService {
|
|||||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
*
|
*
|
||||||
* @param textPositions The words of a page.
|
* @param textPositions The words of a page.
|
||||||
* @param horizontalRulingLines Horizontal table lines.
|
|
||||||
* @param verticalRulingLines Vertical table lines.
|
|
||||||
* @return Page object that contains the Textblock and text statistics.
|
* @return Page object that contains the Textblock and text statistics.
|
||||||
*/
|
*/
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> emptyCells) {
|
||||||
|
|
||||||
|
CleanRulings usedRulings = RectangleTransformations.extractRulings(emptyCells);
|
||||||
|
|
||||||
int indexOnPage = 0;
|
int indexOnPage = 0;
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
@ -54,7 +58,7 @@ public class RedactManagerBlockificationService {
|
|||||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
|
|
||||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||||
|
|||||||
@ -5,7 +5,6 @@ import java.util.Locale;
|
|||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -23,7 +23,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
|
|
||||||
@ -64,46 +64,54 @@ public class DocuMineClassificationService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
.size() == 1)) {
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
} else if (textBlock.getText().length() > 5
|
||||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||||
|
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
||||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
.contains(":")
|
||||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
||||||
|
|| textBlock.toString().startsWith("APPENDIX")
|
||||||
|
|| textBlock.toString().startsWith("FIGURE")
|
||||||
|
|| textBlock.toString().startsWith("TABLE"))
|
||||||
|
&& !textBlock.toString().endsWith(":")
|
||||||
|
&& matcher2.reset().find()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|
||||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
|
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||||
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
.getMostPopular()
|
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||||
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||||
|
|||||||
@ -18,8 +18,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||||
@ -31,6 +29,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
|
|
||||||
@ -49,9 +49,14 @@ public class DocumentGraphFactory {
|
|||||||
Document documentGraph = new Document();
|
Document documentGraph = new Document();
|
||||||
Context context = new Context(documentGraph);
|
Context context = new Context(documentGraph);
|
||||||
|
|
||||||
document.getPages().forEach(context::buildAndAddPageWithCounter);
|
document.getPages()
|
||||||
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
|
.forEach(context::buildAndAddPageWithCounter);
|
||||||
addSections(document, context);
|
document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(section -> section.getImages()
|
||||||
|
.stream())
|
||||||
|
.forEach(image -> context.getImages().add(image));
|
||||||
|
addSections(document, context, documentGraph);
|
||||||
addHeaderAndFooterToEachPage(document, context);
|
addHeaderAndFooterToEachPage(document, context);
|
||||||
|
|
||||||
documentGraph.setNumberOfPages(context.pages.size());
|
documentGraph.setNumberOfPages(context.pages.size());
|
||||||
@ -62,9 +67,10 @@ public class DocumentGraphFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addSections(ClassificationDocument document, Context context) {
|
private void addSections(ClassificationDocument classificationDocument, Context context, Document document) {
|
||||||
|
|
||||||
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context));
|
classificationDocument.getSections()
|
||||||
|
.forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -74,9 +80,11 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
GenericSemanticNode node;
|
GenericSemanticNode node;
|
||||||
if (originalTextBlock.isHeadline()) {
|
if (originalTextBlock.isHeadline()) {
|
||||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
node = Headline.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
} else {
|
} else {
|
||||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
node = Paragraph.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
page.getMainBody().add(node);
|
page.getMainBody().add(node);
|
||||||
@ -91,7 +99,16 @@ public class DocumentGraphFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addImage(Section section, ClassifiedImage image, Context context) {
|
public void addImage(GenericSemanticNode parent, ClassifiedImage image, Context context) {
|
||||||
|
|
||||||
|
Image imageNode = createImage(image, context);
|
||||||
|
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parent, imageNode);
|
||||||
|
imageNode.setTreeId(treeId);
|
||||||
|
imageNode.setLeafTextBlock(context.textBlockFactory.emptyTextBlock(parent, context, context.getPage(image.getPage())));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Image createImage(ClassifiedImage image, Context context) {
|
||||||
|
|
||||||
Rectangle2D position = image.getPosition();
|
Rectangle2D position = image.getPosition();
|
||||||
Page page = context.getPage(image.getPage());
|
Page page = context.getPage(image.getPage());
|
||||||
@ -104,9 +121,7 @@ public class DocumentGraphFactory {
|
|||||||
.documentTree(context.getDocumentTree())
|
.documentTree(context.getDocumentTree())
|
||||||
.build();
|
.build();
|
||||||
page.getMainBody().add(imageNode);
|
page.getMainBody().add(imageNode);
|
||||||
|
return imageNode;
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
|
|
||||||
imageNode.setTreeId(tocId);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -146,10 +161,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||||
footer,
|
|
||||||
context,
|
|
||||||
page);
|
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
footer.setTreeId(tocId);
|
footer.setTreeId(tocId);
|
||||||
footer.setLeafTextBlock(textBlock);
|
footer.setLeafTextBlock(textBlock);
|
||||||
@ -161,7 +173,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), header, 0, page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||||
header.setTreeId(tocId);
|
header.setTreeId(tocId);
|
||||||
header.setLeafTextBlock(textBlock);
|
header.setLeafTextBlock(textBlock);
|
||||||
@ -172,7 +184,8 @@ public class DocumentGraphFactory {
|
|||||||
private void addEmptyFooter(int pageIndex, Context context) {
|
private void addEmptyFooter(int pageIndex, Context context) {
|
||||||
|
|
||||||
Page page = context.getPage(pageIndex);
|
Page page = context.getPage(pageIndex);
|
||||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
footer.setTreeId(tocId);
|
footer.setTreeId(tocId);
|
||||||
@ -184,7 +197,8 @@ public class DocumentGraphFactory {
|
|||||||
private void addEmptyHeader(int pageIndex, Context context) {
|
private void addEmptyHeader(int pageIndex, Context context) {
|
||||||
|
|
||||||
Page page = context.getPage(pageIndex);
|
Page page = context.getPage(pageIndex);
|
||||||
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
|
Header header = Header.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
|
||||||
header.setTreeId(tocId);
|
header.setTreeId(tocId);
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -24,27 +25,46 @@ import lombok.experimental.UtilityClass;
|
|||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class SectionNodeFactory {
|
public class SectionNodeFactory {
|
||||||
|
|
||||||
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
public void addSection(GenericSemanticNode parentNode,
|
||||||
|
List<AbstractPageBlock> pageBlocks,
|
||||||
|
List<ClassifiedImage> images,
|
||||||
|
DocumentGraphFactory.Context context,
|
||||||
|
Document document) {
|
||||||
|
|
||||||
|
// This is for the case where we have images on a page without any text/footer/header.
|
||||||
|
// The pageBlocks list is empty, but we still need to add those images to the document.
|
||||||
|
if (!images.isEmpty() && pageBlocks.isEmpty()) {
|
||||||
|
images.stream()
|
||||||
|
.distinct()
|
||||||
|
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (pageBlocks.isEmpty()) {
|
if (pageBlocks.isEmpty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
|
||||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||||
|
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||||
|
Section section = Section.builder().documentTree(context.getDocumentTree())
|
||||||
|
.build();
|
||||||
|
|
||||||
context.getSections().add(section);
|
context.getSections().add(section);
|
||||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
blocksPerPage.keySet()
|
||||||
|
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
||||||
|
|
||||||
section.setTreeId(getTreeId(parentNode, context, section));
|
section.setTreeId(getTreeId(parentNode, context, section));
|
||||||
|
|
||||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
addFirstHeadlineDirectlyToSection(pageBlocks, context, section, document);
|
||||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context, document));
|
||||||
} else {
|
} else {
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section, document);
|
||||||
}
|
}
|
||||||
|
|
||||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
images.stream()
|
||||||
|
.distinct()
|
||||||
|
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -58,16 +78,16 @@ public class SectionNodeFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
|
||||||
|
|
||||||
if (pageBlocks.get(0).isHeadline()) {
|
if (pageBlocks.get(0).isHeadline()) {
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section, document);
|
||||||
pageBlocks.remove(0);
|
pageBlocks.remove(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
|
||||||
|
|
||||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||||
@ -86,7 +106,7 @@ public class SectionNodeFactory {
|
|||||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||||
alreadyMerged.addAll(tablesToMerge);
|
alreadyMerged.addAll(tablesToMerge);
|
||||||
TableNodeFactory.addTable(section, tablesToMerge, context);
|
TableNodeFactory.addTable(section, tablesToMerge, context, document);
|
||||||
} else {
|
} else {
|
||||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||||
}
|
}
|
||||||
@ -96,7 +116,9 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
||||||
|
|
||||||
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
return pageBlocks.stream()
|
||||||
|
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream()
|
||||||
|
.anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -112,7 +134,9 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
||||||
movePrecedingHeadlineToTableList(splitList);
|
movePrecedingHeadlineToTableList(splitList);
|
||||||
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
return splitList.stream()
|
||||||
|
.filter(list -> !list.isEmpty())
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -133,7 +157,8 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
||||||
|
|
||||||
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
return abstractPageBlocks.stream()
|
||||||
|
.allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import java.util.Set;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
@ -27,23 +28,26 @@ public class TableNodeFactory {
|
|||||||
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
|
||||||
|
|
||||||
|
|
||||||
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
|
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context, Document document) {
|
||||||
|
|
||||||
setPageNumberInCells(tablesToMerge);
|
setPageNumberInCells(tablesToMerge);
|
||||||
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
|
Set<Page> pages = tablesToMerge.stream()
|
||||||
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
|
.map(AbstractPageBlock::getPage)
|
||||||
|
.map(context::getPage)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
List<List<Cell>> mergedRows = tablesToMerge.stream()
|
||||||
|
.map(TablePageBlock::getRows)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList();
|
||||||
|
|
||||||
Table table = Table.builder()
|
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
|
||||||
.documentTree(context.getDocumentTree())
|
|
||||||
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
|
|
||||||
.numberOfRows(mergedRows.size())
|
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||||
|
|
||||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||||
table.setTreeId(treeId);
|
table.setTreeId(treeId);
|
||||||
addTableCells(mergedRows, table, context);
|
addTableCells(mergedRows, table, context, document);
|
||||||
|
|
||||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||||
}
|
}
|
||||||
@ -63,7 +67,8 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
|
||||||
|
|
||||||
cell.getTextBlocks().stream()//
|
cell.getTextBlocks()
|
||||||
|
.stream()//
|
||||||
.filter(tb -> tb.getPage() == 0)//
|
.filter(tb -> tb.getPage() == 0)//
|
||||||
.forEach(tb -> tb.setPage(table.getPage()));
|
.forEach(tb -> tb.setPage(table.getPage()));
|
||||||
}
|
}
|
||||||
@ -82,28 +87,32 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||||
|
|
||||||
if (table.streamHeaders().findAny().isEmpty()) {
|
if (table.streamHeaders()
|
||||||
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
|
.findAny().isEmpty()) {
|
||||||
|
table.streamRow(0)
|
||||||
|
.forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
|
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
|
||||||
|
|
||||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||||
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
|
addTableCell(rows.get(rowIndex)
|
||||||
|
.get(colIndex), rowIndex, colIndex, table, context, document);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||||
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
|
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context, Document document) {
|
||||||
|
|
||||||
Page page = context.getPage(cell.getPageNumber());
|
Page page = context.getPage(cell.getPageNumber());
|
||||||
|
|
||||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
|
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
|
||||||
|
.build();
|
||||||
page.getMainBody().add(tableCell);
|
page.getMainBody().add(tableCell);
|
||||||
|
|
||||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||||
@ -113,16 +122,26 @@ public class TableNodeFactory {
|
|||||||
if (cell.getTextBlocks().isEmpty()) {
|
if (cell.getTextBlocks().isEmpty()) {
|
||||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||||
} else if (cell.getTextBlocks().size() == 1) {
|
} else if (cell.getTextBlocks().size() == 1) {
|
||||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
textBlock = context.getTextBlockFactory()
|
||||||
|
.buildAtomicTextBlock(cell.getTextBlocks()
|
||||||
|
.get(0).getSequences(), tableCell, context, page);
|
||||||
tableCell.setLeafTextBlock(textBlock);
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
} else if (firstTextBlockIsHeadline(cell)) {
|
} else if (firstTextBlockIsHeadline(cell)) {
|
||||||
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
|
SectionNodeFactory.addSection(tableCell,
|
||||||
|
cell.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.map(tb -> (AbstractPageBlock) tb)
|
||||||
|
.toList(),
|
||||||
|
emptyList(),
|
||||||
|
context,
|
||||||
|
document);
|
||||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||||
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
|
||||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||||
tableCell.setLeafTextBlock(textBlock);
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
} else {
|
} else {
|
||||||
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
cell.getTextBlocks()
|
||||||
|
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -135,7 +154,8 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||||
|
|
||||||
return cell.getTextBlocks().get(0).isHeadline();
|
return cell.getTextBlocks()
|
||||||
|
.get(0).isHeadline();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -110,11 +110,13 @@ public class LayoutGridService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
for (Page page : table.getPages()) {
|
for (Page page : table.getPages()) {
|
||||||
|
|
||||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
|
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
|
||||||
if (optionalFirstRowOnPage.isEmpty()) {
|
if (optionalFirstRowOnPage.isEmpty()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
int firstRowOnPage = optionalFirstRowOnPage.get();
|
int firstRowOnPage = optionalFirstRowOnPage.get();
|
||||||
|
|
||||||
Stream<Double> xStream = switch (page.getRotation()) {
|
Stream<Double> xStream = switch (page.getRotation()) {
|
||||||
case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
|
case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
|
||||||
case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
|
case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
|
||||||
@ -123,6 +125,7 @@ public class LayoutGridService {
|
|||||||
};
|
};
|
||||||
List<Double> xs = xStream.collect(Collectors.toList());
|
List<Double> xs = xStream.collect(Collectors.toList());
|
||||||
xs.remove(0);
|
xs.remove(0);
|
||||||
|
|
||||||
Stream<Double> yStream = switch (page.getRotation()) {
|
Stream<Double> yStream = switch (page.getRotation()) {
|
||||||
case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
|
case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
|
||||||
case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
|
case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
|
||||||
@ -132,7 +135,7 @@ public class LayoutGridService {
|
|||||||
List<Double> ys = yStream.collect(Collectors.toList());
|
List<Double> ys = yStream.collect(Collectors.toList());
|
||||||
ys.remove(0);
|
ys.remove(0);
|
||||||
|
|
||||||
Rectangle2D tableBBox = table.getBBox().get(table.getFirstPage());
|
Rectangle2D tableBBox = table.getBBox().get(page);
|
||||||
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines();
|
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getColoredLines();
|
||||||
xs.forEach(x -> {
|
xs.forEach(x -> {
|
||||||
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
|
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
|
||||||
@ -188,14 +191,33 @@ public class LayoutGridService {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) {
|
private void addPlacedText(Page page, Rectangle2D textBBox, String s, LayoutGrid layoutGrid) {
|
||||||
|
|
||||||
Point2D.Float upperLeftCorner = switch (page.getRotation()) {
|
// translates text, such that its right edge is a bit to the left of the drawn box
|
||||||
case 90 -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMinY());
|
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4);
|
||||||
case 180 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMinY());
|
|
||||||
case 270 -> new Point2D.Float((float) (textBBox.getMaxX()), (float) textBBox.getMaxY());
|
Point2D upperLeftCorner;
|
||||||
default -> new Point2D.Float((float) (textBBox.getMinX()), (float) textBBox.getMaxY());
|
Point2D translationVector;
|
||||||
};
|
switch (page.getRotation()) {
|
||||||
|
case 90 -> {
|
||||||
|
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
|
||||||
|
translationVector = new Point2D.Double(FONT_SIZE, -translationAmount);
|
||||||
|
}
|
||||||
|
case 180 -> {
|
||||||
|
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
|
||||||
|
translationVector = new Point2D.Double(translationAmount, FONT_SIZE);
|
||||||
|
}
|
||||||
|
case 270 -> {
|
||||||
|
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
|
||||||
|
translationVector = new Point2D.Double(-FONT_SIZE, translationAmount);
|
||||||
|
}
|
||||||
|
default -> {
|
||||||
|
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
|
||||||
|
translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
upperLeftCorner = add(upperLeftCorner, translationVector);
|
||||||
|
|
||||||
var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts();
|
var placedTexts = layoutGrid.getVisualizationsPerPages().get(page.getNumber() - 1).getPlacedTexts();
|
||||||
upperLeftCorner.setLocation(upperLeftCorner.getX() - ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), upperLeftCorner.getY() - FONT_SIZE);
|
|
||||||
placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT));
|
placedTexts.add(PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -317,4 +339,10 @@ public class LayoutGridService {
|
|||||||
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
|
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Point2D add(Point2D a, Point2D b) {
|
||||||
|
|
||||||
|
return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,28 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Builder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Getter
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class DrawingOptions {
|
||||||
|
|
||||||
|
boolean stroke;
|
||||||
|
@Builder.Default
|
||||||
|
Color strokeColor = Color.BLACK;
|
||||||
|
@Builder.Default
|
||||||
|
float strokeWidth = 1f;
|
||||||
|
|
||||||
|
boolean fill;
|
||||||
|
@Builder.Default
|
||||||
|
Color fillColor = Color.BLACK;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@ -0,0 +1,88 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
|
public class GeometricComparators {
|
||||||
|
|
||||||
|
private static final int COMPARATOR_ROUNDING = 2;
|
||||||
|
|
||||||
|
public static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||||
|
|
||||||
|
int rv = 0;
|
||||||
|
float point1X = DoubleComparisons.round(point1.getX(), COMPARATOR_ROUNDING);
|
||||||
|
float point1Y = DoubleComparisons.round(point1.getY(), COMPARATOR_ROUNDING);
|
||||||
|
float point2X = DoubleComparisons.round(point2.getX(), COMPARATOR_ROUNDING);
|
||||||
|
float point2Y = DoubleComparisons.round(point2.getY(), COMPARATOR_ROUNDING);
|
||||||
|
|
||||||
|
if (point1X > point2X) {
|
||||||
|
rv = 1;
|
||||||
|
} else if (point1X < point2X) {
|
||||||
|
rv = -1;
|
||||||
|
} else if (point1Y > point2Y) {
|
||||||
|
rv = 1;
|
||||||
|
} else if (point1Y < point2Y) {
|
||||||
|
rv = -1;
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Comparator<Point2D> Y_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||||
|
|
||||||
|
int rv = 0;
|
||||||
|
float point1X = DoubleComparisons.round(point1.getX(), COMPARATOR_ROUNDING);
|
||||||
|
float point1Y = DoubleComparisons.round(point1.getY(), COMPARATOR_ROUNDING);
|
||||||
|
float point2X = DoubleComparisons.round(point2.getX(), COMPARATOR_ROUNDING);
|
||||||
|
float point2Y = DoubleComparisons.round(point2.getY(), COMPARATOR_ROUNDING);
|
||||||
|
|
||||||
|
if (point1Y > point2Y) {
|
||||||
|
rv = 1;
|
||||||
|
} else if (point1Y < point2Y) {
|
||||||
|
rv = -1;
|
||||||
|
} else if (point1X > point2X) {
|
||||||
|
rv = 1;
|
||||||
|
} else if (point1X < point2X) {
|
||||||
|
rv = -1;
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Comparator<Cell> CELL_SIZE_COMPARATOR = (cell1, cell2) -> {
|
||||||
|
|
||||||
|
Double cell1Size = cell1.getHeight() * cell1.getWidth();
|
||||||
|
Double cell2Size = cell2.getHeight() * cell2.getWidth();
|
||||||
|
return cell1Size.compareTo(cell2Size);
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||||
|
|
||||||
|
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||||
|
Double rect2Size = rect2.getHeight() * rect2.getWidth();
|
||||||
|
return rect1Size.compareTo(rect2Size);
|
||||||
|
};
|
||||||
|
|
||||||
|
public static final Comparator<Ruling> X_FIRST_RULING_COMPARATOR = (ruling1, ruling2) -> {
|
||||||
|
|
||||||
|
int rv = 0;
|
||||||
|
float point1X = DoubleComparisons.round(Math.min(ruling1.getLeft(), ruling1.getRight()), COMPARATOR_ROUNDING);
|
||||||
|
float point1Y = DoubleComparisons.round(Math.min(ruling1.getTop(), ruling1.getBottom()), COMPARATOR_ROUNDING);
|
||||||
|
float point2X = DoubleComparisons.round(Math.min(ruling2.getLeft(), ruling2.getRight()), COMPARATOR_ROUNDING);
|
||||||
|
float point2Y = DoubleComparisons.round(Math.min(ruling2.getTop(), ruling2.getBottom()), COMPARATOR_ROUNDING);
|
||||||
|
|
||||||
|
if (point1X > point2X) {
|
||||||
|
rv = 1;
|
||||||
|
} else if (point1X < point2X) {
|
||||||
|
rv = -1;
|
||||||
|
} else if (point1Y > point2Y) {
|
||||||
|
rv = 1;
|
||||||
|
} else if (point1Y < point2Y) {
|
||||||
|
rv = -1;
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
@ -21,11 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@ -41,20 +37,20 @@ public class PdfVisualisationUtility {
|
|||||||
|
|
||||||
public void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
public void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
||||||
|
|
||||||
Options options = buildStandardOptionsForNodes(entry);
|
DrawingOptions options = buildStandardOptionsForNodes(entry);
|
||||||
|
|
||||||
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
|
public void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) {
|
||||||
|
|
||||||
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
|
public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) {
|
||||||
|
|
||||||
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
||||||
|
|
||||||
@ -62,7 +58,7 @@ public class PdfVisualisationUtility {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options) {
|
public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options) {
|
||||||
|
|
||||||
var pdPage = document.getPage(pageNumber - 1);
|
var pdPage = document.getPage(pageNumber - 1);
|
||||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||||
@ -80,14 +76,14 @@ public class PdfVisualisationUtility {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
|
public void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, DrawingOptions options) {
|
||||||
|
|
||||||
var pdPage = document.getPage(pageNumber - 1);
|
var pdPage = document.getPage(pageNumber - 1);
|
||||||
drawRectangle2DList(document, rectCollection, options, pdPage);
|
drawRectangle2DList(document, rectCollection, options, pdPage);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
|
private void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, DrawingOptions options, PDPage pdPage) throws IOException {
|
||||||
|
|
||||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||||
|
|
||||||
@ -110,9 +106,9 @@ public class PdfVisualisationUtility {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
private DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||||
|
|
||||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||||
case HEADER, FOOTER -> Color.GREEN;
|
case HEADER, FOOTER -> Color.GREEN;
|
||||||
case PARAGRAPH -> Color.BLUE;
|
case PARAGRAPH -> Color.BLUE;
|
||||||
@ -125,7 +121,7 @@ public class PdfVisualisationUtility {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
|
private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) {
|
||||||
|
|
||||||
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||||
rectanglesPerPage.forEach((page, rectangle2D) -> {
|
rectanglesPerPage.forEach((page, rectangle2D) -> {
|
||||||
@ -152,7 +148,7 @@ public class PdfVisualisationUtility {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, Options options) {
|
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<? extends Line2D> line2DS, DrawingOptions options) {
|
||||||
|
|
||||||
var pdPage = pdDocument.getPage(pageNumber - 1);
|
var pdPage = pdDocument.getPage(pageNumber - 1);
|
||||||
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||||
@ -176,21 +172,4 @@ public class PdfVisualisationUtility {
|
|||||||
contentStream.close();
|
contentStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Builder
|
|
||||||
@Getter
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public static class Options {
|
|
||||||
|
|
||||||
boolean fill;
|
|
||||||
boolean stroke;
|
|
||||||
@Builder.Default
|
|
||||||
Color strokeColor = Color.BLACK;
|
|
||||||
@Builder.Default
|
|
||||||
float strokeWidth = 1f;
|
|
||||||
@Builder.Default
|
|
||||||
Color fillColor = Color.BLACK;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,8 +2,11 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
|||||||
|
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.Area;
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.awt.geom.RectangularShape;
|
import java.awt.geom.RectangularShape;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -19,6 +22,8 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
@ -37,15 +42,28 @@ public class RectangleTransformations {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
|
||||||
|
|
||||||
|
Area a1 = new Area(r1);
|
||||||
|
Area a2 = new Area(r2);
|
||||||
|
a1.intersect(a2);
|
||||||
|
Rectangle2D intersection = a1.getBounds2D();
|
||||||
|
return intersection.getWidth() * intersection.getHeight();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
|
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
|
||||||
|
|
||||||
return new Rectangle2DBBoxCollector();
|
return new Rectangle2DBBoxCollector();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
|
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
|
||||||
|
|
||||||
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
|
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
|
||||||
@ -70,6 +88,7 @@ public class RectangleTransformations {
|
|||||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
||||||
|
|
||||||
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
||||||
@ -84,6 +103,7 @@ public class RectangleTransformations {
|
|||||||
-redactionLogRectangle.getHeight());
|
-redactionLogRectangle.getHeight());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
||||||
|
|
||||||
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
||||||
@ -133,7 +153,27 @@ public class RectangleTransformations {
|
|||||||
previousRectangle = currentRectangle;
|
previousRectangle = currentRectangle;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList();
|
return rectangleListsWithGaps.stream()
|
||||||
|
.map(RectangleTransformations::rectangle2DBBox)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static CleanRulings extractRulings(List<? extends Rectangle2D.Float> rectangles) {
|
||||||
|
|
||||||
|
// Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells.
|
||||||
|
List<Ruling> horizontalRulings = new ArrayList<>();
|
||||||
|
List<Ruling> verticalRulings = new ArrayList<>();
|
||||||
|
|
||||||
|
rectangles.forEach(rectangle -> {
|
||||||
|
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x + rectangle.width, rectangle.y)));
|
||||||
|
horizontalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y + rectangle.height),
|
||||||
|
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||||
|
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x, rectangle.y), new Point2D.Float(rectangle.x, rectangle.y + rectangle.height)));
|
||||||
|
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||||
|
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||||
|
});
|
||||||
|
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,77 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
|
public class RectangularIntersectionFinder {
|
||||||
|
|
||||||
|
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
|
// Fix for 211.pdf
|
||||||
|
for (Ruling r : horizontalRulingLines) {
|
||||||
|
if (r.getX2() < r.getX1()) {
|
||||||
|
double a = r.getX2();
|
||||||
|
r.x2 = (float) r.getX1();
|
||||||
|
r.x1 = (float) a;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Rectangle2D> foundRectangles = new ArrayList<>();
|
||||||
|
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||||
|
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||||
|
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
|
||||||
|
|
||||||
|
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
||||||
|
Point2D topLeft = intersectionPointsList.get(i);
|
||||||
|
Ruling[] hv = intersectionPoints.get(topLeft);
|
||||||
|
|
||||||
|
// CrossingPointsDirectlyBelow( topLeft );
|
||||||
|
List<Point2D> xPoints = new ArrayList<>();
|
||||||
|
// CrossingPointsDirectlyToTheRight( topLeft );
|
||||||
|
List<Point2D> yPoints = new ArrayList<>();
|
||||||
|
|
||||||
|
for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
|
||||||
|
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
|
||||||
|
xPoints.add(p);
|
||||||
|
}
|
||||||
|
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
|
||||||
|
yPoints.add(p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
outer:
|
||||||
|
for (Point2D xPoint : xPoints) {
|
||||||
|
// is there a vertical edge b/w topLeft and xPoint?
|
||||||
|
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (Point2D yPoint : yPoints) {
|
||||||
|
// is there a horizontal edge b/w topLeft and yPoint ?
|
||||||
|
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
||||||
|
if (intersectionPoints.containsKey(btmRight)
|
||||||
|
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
|
||||||
|
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
|
||||||
|
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
|
||||||
|
break outer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
|
||||||
|
// that aren't connected with an horizontal ruler?
|
||||||
|
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
|
||||||
|
|
||||||
|
return foundRectangles;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,172 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_POINT_COMPARATOR;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||||
|
|
||||||
|
public class SpreadsheetFinder {
|
||||||
|
|
||||||
|
private static final int MAX_OUTER_POINT_TOLERANCE = 10;
|
||||||
|
private static final float AREA_TOLERANCE = 0.001f;
|
||||||
|
|
||||||
|
|
||||||
|
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||||
|
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||||
|
List<Rectangle> rectangles = new ArrayList<>();
|
||||||
|
Set<Point2D> pointSet = new HashSet<>();
|
||||||
|
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||||
|
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||||
|
|
||||||
|
for (Rectangle cell : cells) {
|
||||||
|
for (Point2D pt : cell.getPoints()) {
|
||||||
|
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||||
|
pointSet.remove(pt);
|
||||||
|
} else {
|
||||||
|
pointSet.add(pt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// X first sort
|
||||||
|
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
|
||||||
|
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
|
||||||
|
// Y first sort
|
||||||
|
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
|
||||||
|
pointsSortY.sort(Y_FIRST_POINT_COMPARATOR);
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
while (i < pointSet.size()) {
|
||||||
|
float currY = (float) pointsSortY.get(i).getY();
|
||||||
|
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
|
||||||
|
edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
|
||||||
|
edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
|
||||||
|
i += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
i = 0;
|
||||||
|
while (i < pointSet.size()) {
|
||||||
|
float currX = (float) pointsSortX.get(i).getX();
|
||||||
|
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
|
||||||
|
edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
|
||||||
|
edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
|
||||||
|
i += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get all the polygons
|
||||||
|
List<List<PolygonVertex>> polygons = new ArrayList<>();
|
||||||
|
Point2D nextVertex;
|
||||||
|
while (!edgesH.isEmpty()) {
|
||||||
|
ArrayList<PolygonVertex> polygon = new ArrayList<>();
|
||||||
|
Point2D first = edgesH.keySet()
|
||||||
|
.iterator().next();
|
||||||
|
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
|
||||||
|
edgesH.remove(first);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
PolygonVertex curr = polygon.get(polygon.size() - 1);
|
||||||
|
PolygonVertex lastAddedVertex;
|
||||||
|
if (curr.direction == Direction.HORIZONTAL) {
|
||||||
|
nextVertex = edgesV.get(curr.point);
|
||||||
|
edgesV.remove(curr.point);
|
||||||
|
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
|
||||||
|
} else {
|
||||||
|
nextVertex = edgesH.get(curr.point);
|
||||||
|
edgesH.remove(curr.point);
|
||||||
|
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
|
||||||
|
}
|
||||||
|
polygon.add(lastAddedVertex);
|
||||||
|
|
||||||
|
if (lastAddedVertex.equals(polygon.get(0))) {
|
||||||
|
// closed polygon
|
||||||
|
polygon.remove(polygon.size() - 1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (PolygonVertex vertex : polygon) {
|
||||||
|
edgesH.remove(vertex.point);
|
||||||
|
edgesV.remove(vertex.point);
|
||||||
|
}
|
||||||
|
polygons.add(polygon);
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate grid-aligned minimum area rectangles for each found polygon
|
||||||
|
for (List<PolygonVertex> poly : polygons) {
|
||||||
|
float top = Float.MAX_VALUE;
|
||||||
|
float left = Float.MAX_VALUE;
|
||||||
|
float bottom = Float.MIN_VALUE;
|
||||||
|
float right = Float.MIN_VALUE;
|
||||||
|
for (PolygonVertex pt : poly) {
|
||||||
|
top = (float) Math.min(top, pt.point.getY());
|
||||||
|
left = (float) Math.min(left, pt.point.getX());
|
||||||
|
bottom = (float) Math.max(bottom, pt.point.getY());
|
||||||
|
right = (float) Math.max(right, pt.point.getX());
|
||||||
|
}
|
||||||
|
|
||||||
|
// do not add polygons with too many outer points as they are unlikely to be tables
|
||||||
|
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
|
||||||
|
rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rectangles;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private enum Direction {
|
||||||
|
HORIZONTAL,
|
||||||
|
VERTICAL
|
||||||
|
}
|
||||||
|
|
||||||
|
static class PolygonVertex {
|
||||||
|
|
||||||
|
Point2D point;
|
||||||
|
Direction direction;
|
||||||
|
|
||||||
|
|
||||||
|
PolygonVertex(Point2D point, Direction direction) {
|
||||||
|
|
||||||
|
this.direction = direction;
|
||||||
|
this.point = point;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
|
||||||
|
if (this == other) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (!(other instanceof PolygonVertex)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return this.point.equals(((PolygonVertex) other).point);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
|
||||||
|
return this.point.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -30,8 +30,6 @@ public class TableMergingUtility {
|
|||||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
||||||
consecutiveTable)) {
|
consecutiveTable)) {
|
||||||
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
|
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
|
||||||
|
|||||||
@ -23,4 +23,10 @@ public class TextPositionOperations {
|
|||||||
return sequence;
|
return sequence;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static List<TextPositionSequence> mergeTextPositionSequence(List<TextPageBlock> textBlocks) {
|
||||||
|
|
||||||
|
return textBlocks.stream().flatMap(tb -> tb.getSequences().stream()).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
|
|||||||
*
|
*
|
||||||
* @author Ben Litchfield
|
* @author Ben Litchfield
|
||||||
*/
|
*/
|
||||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||||
{
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
|
||||||
{
|
|
||||||
// only compare text that is in the same direction
|
// only compare text that is in the same direction
|
||||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||||
if (cmp1 != 0)
|
if (cmp1 != 0) {
|
||||||
{
|
|
||||||
return cmp1;
|
return cmp1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -48,25 +46,19 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
|||||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
float pos2YBottom = pos2.getMaxYDirAdj();
|
||||||
|
|
||||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||||
float pos1YTop = pos1YBottom - pos1.getTextHeight();
|
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
|
||||||
float pos2YTop = pos2YBottom - pos2.getTextHeight();
|
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
|
||||||
|
|
||||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||||
|
|
||||||
// we will do a simple tolerance comparison
|
// we will do a simple tolerance comparison
|
||||||
if (yDifference < .1 ||
|
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
|
||||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
|
||||||
{
|
|
||||||
return Float.compare(x1, x2);
|
return Float.compare(x1, x2);
|
||||||
}
|
} else if (pos1YBottom < pos2YBottom) {
|
||||||
else if (pos1YBottom < pos2YBottom)
|
|
||||||
{
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,44 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
// simple implementation of a disjoint-set data structure
|
||||||
|
// https://en.wikipedia.org/wiki/Disjoint-set_data_structure
|
||||||
|
public class UnionFind<T> {
|
||||||
|
|
||||||
|
Map<T, T> parents = new HashMap<>();
|
||||||
|
Map<T, Integer> numberOfObjects = new HashMap<>();
|
||||||
|
|
||||||
|
|
||||||
|
public T find(T node) {
|
||||||
|
|
||||||
|
if (!parents.containsKey(node)) {
|
||||||
|
parents.put(node, node);
|
||||||
|
numberOfObjects.put(node, 1);
|
||||||
|
}
|
||||||
|
if (!node.equals(parents.get(node))) {
|
||||||
|
parents.put(node, find(parents.get(node)));
|
||||||
|
}
|
||||||
|
return parents.get(node);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void union(T node1, T node2) {
|
||||||
|
|
||||||
|
T root1 = find(node1);
|
||||||
|
T root2 = find(node2);
|
||||||
|
|
||||||
|
if (!root1.equals(root2)) {
|
||||||
|
if (numberOfObjects.getOrDefault(root1, 1) < numberOfObjects.getOrDefault(root2, 1)) {
|
||||||
|
parents.put(root1, root2);
|
||||||
|
numberOfObjects.put(root2, numberOfObjects.get(root2) + numberOfObjects.get(root1));
|
||||||
|
} else {
|
||||||
|
parents.put(root2, root1);
|
||||||
|
numberOfObjects.put(root1, numberOfObjects.get(root1) + numberOfObjects.get(root2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@ -29,6 +29,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
@ -111,7 +112,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
|||||||
|
|
||||||
try (PDDocument pdDocument = Loader.loadPDF(file); var outputStream = new FileOutputStream(resultingFileName)) {
|
try (PDDocument pdDocument = Loader.loadPDF(file); var outputStream = new FileOutputStream(resultingFileName)) {
|
||||||
PdfDraw.drawDocumentGraph(pdDocument, document);
|
PdfDraw.drawDocumentGraph(pdDocument, document);
|
||||||
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||||
pdDocument.save(outputStream);
|
pdDocument.save(outputStream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -28,7 +28,20 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
Arrays.stream(finishedEvent.message().split("\n"))
|
||||||
|
.forEach(log::info);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testLayoutParserEndToEnd_RED_8747() {
|
||||||
|
|
||||||
|
prepareStorage("files/SinglePages/MergedEntities.pdf");
|
||||||
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
|
Arrays.stream(finishedEvent.message().split("\n"))
|
||||||
|
.forEach(log::info);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,6 +13,7 @@ import org.springframework.core.io.ClassPathResource;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
@ -70,7 +71,7 @@ public class DocumentGraphVisualizationTest extends BuildDocumentTest {
|
|||||||
try (PDDocument pdDocument = Loader.loadPDF(fileResource.getFile())) {
|
try (PDDocument pdDocument = Loader.loadPDF(fileResource.getFile())) {
|
||||||
log.info("drawing document");
|
log.info("drawing document");
|
||||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraph);
|
PdfDraw.drawDocumentGraph(pdDocument, documentGraph);
|
||||||
PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
PdfDraw.drawTextBlock(pdDocument, textBlock, DrawingOptions.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
|
||||||
log.info("saving document");
|
log.info("saving document");
|
||||||
pdDocument.save(tmpFile);
|
pdDocument.save(tmpFile);
|
||||||
log.info("saved document");
|
log.info("saved document");
|
||||||
|
|||||||
@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
@ -35,9 +35,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
@Disabled
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -51,7 +52,11 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, documentFile, new ImageServiceResponse(), tableResponse, Path.of(fileName).getFileName().toFile().toString());
|
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||||
|
documentFile,
|
||||||
|
new ImageServiceResponse(),
|
||||||
|
tableResponse,
|
||||||
|
Path.of(fileName).getFileName().toFile().toString());
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
@ -60,3 +65,4 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.segmentation;
|
package com.knecon.fforesight.service.layoutparser.server.segmentation;
|
||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
@ -25,16 +26,20 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -50,12 +55,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private RedactManagerClassificationService redactManagerClassificationService;
|
private RedactManagerClassificationService redactManagerClassificationService;
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private CvTableParsingAdapter cvTableParsingAdapter;
|
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private ImageServiceResponseAdapter imageServiceResponseAdapter;
|
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private SectionsBuilderService sectionsBuilderService;
|
private SectionsBuilderService sectionsBuilderService;
|
||||||
|
|
||||||
@ -64,10 +63,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableServiceResponse,
|
tableServiceResponse,
|
||||||
"document");
|
"document");
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
@ -87,11 +86,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@Test
|
@Test
|
||||||
public void tablesToHtmlDebugger() throws IOException {
|
public void tablesToHtmlDebugger() throws IOException {
|
||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html");
|
toHtml(document, "/tmp/T5.html");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,6 +108,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Disabled
|
||||||
@Test
|
@Test
|
||||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||||
|
|
||||||
@ -117,8 +117,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections()
|
||||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
|
var tables = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList();
|
||||||
|
|
||||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||||
// We only asset that the table border is not the page border.
|
// We only asset that the table border is not the page border.
|
||||||
@ -140,12 +148,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
imageServiceResponse.getData()
|
imageServiceResponse.getData()
|
||||||
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||||
imageMetadata.getPosition().getY1(),
|
imageMetadata.getPosition().getY1(),
|
||||||
imageMetadata.getGeometry().getWidth(),
|
imageMetadata.getGeometry().getWidth(),
|
||||||
imageMetadata.getGeometry().getHeight()),
|
imageMetadata.getGeometry().getHeight()),
|
||||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||||
imageMetadata.isAlpha(),
|
imageMetadata.isAlpha(),
|
||||||
imageMetadata.getPosition().getPageNumber())));
|
imageMetadata.getPosition().getPageNumber())));
|
||||||
|
|
||||||
System.out.println("object");
|
System.out.println("object");
|
||||||
}
|
}
|
||||||
@ -157,11 +165,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections()
|
||||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
|
TablePageBlock table = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(0);
|
||||||
assertThat(table.getColCount()).isEqualTo(6);
|
assertThat(table.getColCount()).isEqualTo(6);
|
||||||
assertThat(table.getRowCount()).isEqualTo(13);
|
assertThat(table.getRowCount()).isEqualTo(13);
|
||||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
assertThat(table.getRows()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -171,15 +190,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections()
|
||||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
|
TablePageBlock firstTable = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
TablePageBlock secondTable = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(1);
|
||||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
.get(0)
|
||||||
|
.stream()
|
||||||
|
.map(Collections::singletonList)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
assertThat(secondTable.getRows()
|
||||||
|
.stream()
|
||||||
|
.allMatch(row -> row.stream()
|
||||||
|
.map(Cell::getHeaderCells)
|
||||||
|
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -189,15 +230,37 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections()
|
||||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
|
TablePageBlock firstTable = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
TablePageBlock secondTable = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(1);
|
||||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList());
|
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
.get(firstTable.getRowCount() - 1)
|
||||||
|
.stream()
|
||||||
|
.map(Cell::getHeaderCells)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
assertThat(secondTable.getRows()
|
||||||
|
.stream()
|
||||||
|
.allMatch(row -> row.stream()
|
||||||
|
.map(Cell::getHeaderCells)
|
||||||
|
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -207,19 +270,41 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections()
|
||||||
TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0);
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
|
TablePageBlock firstTable = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||||
TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(1);
|
TablePageBlock secondTable = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(1);
|
||||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList());
|
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||||
assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue();
|
.get(0)
|
||||||
|
.stream()
|
||||||
|
.map(Collections::singletonList)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
assertThat(secondTable.getRows()
|
||||||
|
.stream()
|
||||||
|
.allMatch(row -> row.stream()
|
||||||
|
.map(Cell::getHeaderCells)
|
||||||
|
.toList().equals(firstTableHeaderCells))).isTrue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test // Non-sense test
|
@Test
|
||||||
public void testDoc56Page170() throws IOException {
|
public void testDoc56Page170() throws IOException {
|
||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf");
|
||||||
@ -230,8 +315,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
validateTable(document, 0, 1, 1, 0, 0);
|
validateTable(document, 0, 1, 1, 0, 0);
|
||||||
validateTable(document, 1, 2, 2, 0, 0);
|
validateTable(document, 1, 2, 2, 0, 0);
|
||||||
validateTable(document, 2, 6, 20, 0, 0);
|
validateTable(document, 2, 4, 19, 12, 0);
|
||||||
validateTable(document, 3, 7, 31, 0, 0);
|
validateTable(document, 3, 2, 12, 0, 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -265,29 +350,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
validateTable(document, 0, 8, 8, 0, 0);
|
validateTable(document, 0, 8, 8, 0, 0);
|
||||||
|
|
||||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||||
"Author, date",
|
"Author, date",
|
||||||
"Study title",
|
"Study title",
|
||||||
"Analytical method Author, date, No.",
|
"Analytical method Author, date, No.",
|
||||||
"Technique, LOQ of the method, validated working range",
|
"Technique, LOQ of the method, validated working range",
|
||||||
"Method meets analytical validation criteria",
|
"Method meets analytical validation criteria",
|
||||||
"Remarks (in case validation criteria are not met)",
|
"Remarks (in case validation criteria are not met)",
|
||||||
"Acceptability of the method"),
|
"Acceptability of the method"),
|
||||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
Arrays.asList(
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||||
"Y",
|
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||||
"N/A",
|
"Y",
|
||||||
"Y"));
|
"N/A",
|
||||||
|
"Y"));
|
||||||
|
|
||||||
validateTable(document, 0, values);
|
validateTable(document, 0, values);
|
||||||
|
|
||||||
@ -579,10 +665,156 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testT0() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T0 TableWithMergedCells.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
|
validateTable(document, 0, 6, 8, 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testT1() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T1 MultipleNestedTable.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 4);
|
||||||
|
|
||||||
|
validateTable(document, 0, 3, 3, 0, 0);
|
||||||
|
validateTable(document, 1, 3, 6, 2, 0);
|
||||||
|
validateTable(document, 2, 3, 3, 1, 0);
|
||||||
|
validateTable(document, 3, 3, 3, 0, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testT2() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T2 MultipleTables.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 6);
|
||||||
|
|
||||||
|
validateTable(document, 0, 5, 5, 0, 0);
|
||||||
|
validateTable(document, 1, 5, 6, 0, 0);
|
||||||
|
validateTable(document, 2, 5, 5, 0, 0);
|
||||||
|
validateTable(document, 3, 5, 5, 0, 0);
|
||||||
|
validateTable(document, 4, 5, 5, 0, 0);
|
||||||
|
validateTable(document, 5, 5, 5, 0, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testT3() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T3 S-Meto_Page29.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
|
validateTable(document, 0, 6, 5, 0, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testT4() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T4 138 IDD0000261736_Page16.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
|
validateTable(document, 0, 5, 8, 1, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testT5() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/T5 VV-640252-Page16.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 5);
|
||||||
|
validateTable(document, 0, 1, 1, 0, 0);
|
||||||
|
validateTable(document, 1, 1, 1, 0, 0);
|
||||||
|
validateTable(document, 2, 1, 1, 0, 0);
|
||||||
|
validateTable(document, 3, 1, 1, 0, 0);
|
||||||
|
validateTable(document, 4, 1, 1, 0, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMergedEntities_Page26() throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/MergedEntities.pdf");
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
|
validateTable(document, 0, 6, 6, 5, 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHeaderAndFooter() throws IOException {
|
||||||
|
|
||||||
|
String fileName = "files/SinglePages/Page1_54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
|
||||||
|
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||||
|
|
||||||
|
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
var textPositions = textPositionPerPage.stream()
|
||||||
|
.flatMap(t -> t.getSortedTextPositionSequences()
|
||||||
|
.stream()
|
||||||
|
.map(TextPositionSequence::toString))
|
||||||
|
.collect(Collectors.joining(" "));
|
||||||
|
assertThat(textPositions.contains(textToSearch)).isFalse();
|
||||||
|
|
||||||
|
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
|
||||||
|
assertThat(classificationDocument.getHeaders()
|
||||||
|
.get(0).getTextBlocks().size()).isEqualTo(3);
|
||||||
|
assertThat(classificationDocument.getHeaders()
|
||||||
|
.get(0).getTextBlocks()
|
||||||
|
.get(0).getSequences().size()).isEqualTo(8);
|
||||||
|
assertThat(classificationDocument.getHeaders()
|
||||||
|
.get(0).getTextBlocks()
|
||||||
|
.get(0).toString()).isEqualTo(textToSearch);
|
||||||
|
|
||||||
|
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
|
|
||||||
|
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
||||||
|
assertTrue(leafTextBlock.getSearchText().contains(textToSearch));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void toHtml(ClassificationDocument document, String filename) {
|
private void toHtml(ClassificationDocument document, String filename) {
|
||||||
|
|
||||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
var tables = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList();
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
int currentPage = 1;
|
int currentPage = 1;
|
||||||
@ -603,9 +835,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||||
|
|
||||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
TablePageBlock table = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(tableIndex);
|
||||||
List<List<Cell>> rows = table.getRows();
|
List<List<Cell>> rows = table.getRows();
|
||||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
int emptyCellsFoundFound = rows.stream()
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList()
|
||||||
|
.stream()
|
||||||
|
.filter(f -> f.toString().isEmpty())
|
||||||
|
.toList().size();
|
||||||
|
|
||||||
for (List<Cell> row : table.getRows()) {
|
for (List<Cell> row : table.getRows()) {
|
||||||
row.forEach(r -> System.out.println(r.toString()));
|
row.forEach(r -> System.out.println(r.toString()));
|
||||||
@ -620,11 +862,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||||
|
|
||||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
TablePageBlock table = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(tableIndex);
|
||||||
List<List<Cell>> rows = table.getRows();
|
List<List<Cell>> rows = table.getRows();
|
||||||
|
|
||||||
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
|
List<Cell> rowsFlattened = rows.stream()
|
||||||
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
|
List<String> valuesFlattened = values.stream()
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
|
|
||||||
for (int i = 0; i < valuesFlattened.size(); i++) {
|
for (int i = 0; i < valuesFlattened.size(); i++) {
|
||||||
Cell cell = rowsFlattened.get(i);
|
Cell cell = rowsFlattened.get(i);
|
||||||
@ -637,7 +888,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||||
|
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
assertThat(document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList().size()).isEqualTo(tableSize);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,13 +1,17 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
@ -26,29 +30,50 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
public class RulingCleaningServiceTest extends BuildDocumentTest {
|
public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
// @Disabled
|
@Disabled
|
||||||
|
@SneakyThrows
|
||||||
|
public void textRectanglesFromRulingsExtraction() {
|
||||||
|
|
||||||
|
String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf";
|
||||||
|
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_CELLS.pdf";
|
||||||
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
|
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
||||||
|
for (PageContents pageContent : pageContents) {
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
|
||||||
|
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
|
rectanglesPerPage.add(rects);
|
||||||
|
}
|
||||||
|
|
||||||
|
PdfDraw.drawRectanglesPerPage(fileName, rectanglesPerPage, lineFileName, DrawingOptions.builder().stroke(true).strokeColor(Color.RED).build());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void textRulingExtraction() {
|
public void textRulingExtraction() {
|
||||||
|
|
||||||
String fileName = "files/211.pdf";
|
String fileName = "files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf";
|
||||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
|
||||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||||
for (PageContents pageContent : pageContents) {
|
for (PageContents pageContent : pageContents) {
|
||||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||||
}
|
}
|
||||||
|
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
|
||||||
|
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,9 +82,6 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testTableExtraction() {
|
public void testTableExtraction() {
|
||||||
|
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
|
||||||
|
|
||||||
ClassPathResource resource = new ClassPathResource("files");
|
ClassPathResource resource = new ClassPathResource("files");
|
||||||
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||||
@ -67,8 +89,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
.map(Path::toString)
|
.map(Path::toString)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
for (int i = 0; i < pdfFileNames.size(); i++) {
|
for (String pdfFileName : pdfFileNames) {
|
||||||
writeJsons(Path.of(pdfFileNames.get(i)));
|
writeJsons(Path.of(pdfFileName));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,13 +110,13 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
filename.toFile().toString()));
|
filename.toFile().toString()));
|
||||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) {
|
||||||
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before." + filename.getFileName().toString();
|
String tmpFileNameBefore = "/tmp/before." + filename.getFileName().toString();
|
||||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
||||||
pdDocument.save(tmpFileNameBefore);
|
pdDocument.save(tmpFileNameBefore);
|
||||||
}
|
}
|
||||||
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after." + filename.getFileName().toString();
|
String tmpFileNameAfter = "/tmp/after." + filename.getFileName().toString();
|
||||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
|
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
|
||||||
pdDocument.save(tmpFileNameAfter);
|
pdDocument.save(tmpFileNameAfter);
|
||||||
@ -105,9 +127,9 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
|
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2) {
|
||||||
|
|
||||||
List listStructure1 = structure1.streamAllEntries()
|
List<Table> listStructure1 = structure1.streamAllEntries()
|
||||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||||
.map(DocumentStructure.EntryData::getProperties)
|
.map(DocumentStructure.EntryData::getProperties)
|
||||||
.map(properties -> {
|
.map(properties -> {
|
||||||
@ -117,7 +139,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
})
|
})
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
List listStructure2 = structure2.streamAllEntries()
|
List<Table> listStructure2 = structure2.streamAllEntries()
|
||||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||||
.map(DocumentStructure.EntryData::getProperties)
|
.map(DocumentStructure.EntryData::getProperties)
|
||||||
.map(properties -> {
|
.map(properties -> {
|
||||||
@ -128,8 +150,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
for (int i = 0; i < listStructure1.size(); i++) {
|
for (int i = 0; i < listStructure1.size(); i++) {
|
||||||
Table tableNode1 = (Table) listStructure1.get(i);
|
Table tableNode1 = listStructure1.get(i);
|
||||||
Table tableNode2 = (Table) listStructure2.get(i);
|
Table tableNode2 = listStructure2.get(i);
|
||||||
if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
|
if (tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -24,20 +24,31 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class PdfDraw {
|
public class PdfDraw {
|
||||||
|
|
||||||
|
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName, DrawingOptions options) throws IOException {
|
||||||
|
|
||||||
|
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||||
|
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||||
|
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||||
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||||
|
pageNumber,
|
||||||
|
rectanglesPerPage.get(pageNumber - 1),
|
||||||
|
options);
|
||||||
|
}
|
||||||
|
pdDocument.save(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||||
|
|
||||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||||
@ -46,7 +57,7 @@ public class PdfDraw {
|
|||||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||||
pageNumber,
|
pageNumber,
|
||||||
rectanglesPerPage.get(pageNumber - 1),
|
rectanglesPerPage.get(pageNumber - 1),
|
||||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
DrawingOptions.builder().stroke(true).build());
|
||||||
}
|
}
|
||||||
pdDocument.save(out);
|
pdDocument.save(out);
|
||||||
}
|
}
|
||||||
@ -62,13 +73,13 @@ public class PdfDraw {
|
|||||||
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
||||||
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
||||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, DrawingOptions.builder().stroke(true).build());
|
||||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||||
pdDocument,
|
pdDocument,
|
||||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||||
pageNumber,
|
pageNumber,
|
||||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
DrawingOptions.builder().stroke(true).build());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pdDocument.save(out);
|
pdDocument.save(out);
|
||||||
@ -99,20 +110,20 @@ public class PdfDraw {
|
|||||||
|
|
||||||
public static void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
public static void drawNode(PDDocument document, DocumentTree.Entry entry) {
|
||||||
|
|
||||||
Options options = buildStandardOptionsForNodes(entry);
|
DrawingOptions options = buildStandardOptionsForNodes(entry);
|
||||||
|
|
||||||
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
drawBBoxAndLabelAndNumberOnPage(document, entry, options);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
|
public static void drawTextBlock(PDDocument document, TextBlock textBlock, DrawingOptions options) {
|
||||||
|
|
||||||
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
|
public static void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, DrawingOptions options) {
|
||||||
|
|
||||||
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
|
||||||
|
|
||||||
@ -120,7 +131,7 @@ public class PdfDraw {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options, boolean rotate) {
|
private static void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, DrawingOptions options, boolean rotate) {
|
||||||
|
|
||||||
var pdPage = document.getPage(pageNumber - 1);
|
var pdPage = document.getPage(pageNumber - 1);
|
||||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||||
@ -142,14 +153,14 @@ public class PdfDraw {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
|
public static void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, DrawingOptions options) {
|
||||||
|
|
||||||
var pdPage = document.getPage(pageNumber - 1);
|
var pdPage = document.getPage(pageNumber - 1);
|
||||||
drawRectangle2DList(document, rectCollection, options, pdPage);
|
drawRectangle2DList(document, rectCollection, options, pdPage);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
|
private static void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, DrawingOptions options, PDPage pdPage) throws IOException {
|
||||||
|
|
||||||
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||||
|
|
||||||
@ -181,12 +192,12 @@ public class PdfDraw {
|
|||||||
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||||
// pageNumber,
|
// pageNumber,
|
||||||
// list.get(pageNumber - 1),
|
// list.get(pageNumber - 1),
|
||||||
// PdfVisualisationUtility.Options.builder().stroke(true).build());
|
// PdfVisualisationUtility.DrawingOptions.builder().stroke(true).build());
|
||||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||||
pageNumber,
|
pageNumber,
|
||||||
rectanglesPerPage.get(pageNumber - 1),
|
rectanglesPerPage.get(pageNumber - 1),
|
||||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
DrawingOptions.builder().stroke(true).build());
|
||||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), PdfVisualisationUtility.Options.builder().stroke(true).build());
|
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, list.get(pageNumber - 1), DrawingOptions.builder().stroke(true).build());
|
||||||
}
|
}
|
||||||
pdDocument.save(out);
|
pdDocument.save(out);
|
||||||
}
|
}
|
||||||
@ -202,35 +213,18 @@ public class PdfDraw {
|
|||||||
PdfVisualisationUtility.drawLine2DList(pdDocument,
|
PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||||
pageNumber,
|
pageNumber,
|
||||||
linesPerPage.get(pageNumber - 1),
|
linesPerPage.get(pageNumber - 1),
|
||||||
PdfVisualisationUtility.Options.builder().strokeColor(Color.RED).stroke(true).build());
|
DrawingOptions.builder().strokeColor(Color.RED).stroke(true).build());
|
||||||
}
|
}
|
||||||
pdDocument.save(out);
|
pdDocument.save(out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Builder
|
|
||||||
@AllArgsConstructor
|
|
||||||
@Getter
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public static class Options {
|
|
||||||
|
|
||||||
boolean stroke;
|
|
||||||
@Builder.Default
|
|
||||||
Color strokeColor = Color.BLACK;
|
|
||||||
@Builder.Default
|
|
||||||
float strokeWidth = 1f;
|
|
||||||
|
|
||||||
boolean fill;
|
|
||||||
@Builder.Default
|
|
||||||
Color fillColor = Color.BLACK;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
private static DrawingOptions buildStandardOptionsForNodes(DocumentTree.Entry entry) {
|
||||||
|
|
||||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
return DrawingOptions.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||||
case HEADER, FOOTER -> Color.GREEN;
|
case HEADER, FOOTER -> Color.GREEN;
|
||||||
case PARAGRAPH -> Color.BLUE;
|
case PARAGRAPH -> Color.BLUE;
|
||||||
@ -243,7 +237,7 @@ public class PdfDraw {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
|
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, DrawingOptions options) {
|
||||||
|
|
||||||
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||||
for (Page page : rectanglesPerPage.keySet()) {
|
for (Page page : rectanglesPerPage.keySet()) {
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -17,6 +17,7 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
|
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
@ -126,8 +127,8 @@ public class ViewerDocumentService {
|
|||||||
pdDocument = openPDDocument(tmpFile.toFile());
|
pdDocument = openPDDocument(tmpFile.toFile());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
observedIncrementalSave(pdDocument, destinationFile);
|
|
||||||
|
|
||||||
|
observedIncrementalSave(pdDocument, destinationFile);
|
||||||
pdDocument.close();
|
pdDocument.close();
|
||||||
assert tmpFile.toFile().delete();
|
assert tmpFile.toFile().delete();
|
||||||
}
|
}
|
||||||
@ -282,10 +283,12 @@ public class ViewerDocumentService {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
||||||
|
/*
|
||||||
|
Sometimes the viewer document is corrupted after saving and missing the content streams on a random page, for the files we viewed it did not seem to happen with incrementalSave. It might only be a timing issue though
|
||||||
|
*/
|
||||||
Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> {
|
Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> {
|
||||||
try (var out = new FileOutputStream(outputFile)) {
|
try (var out = new FileOutputStream(outputFile)) {
|
||||||
pdDocument.save(out);
|
pdDocument.save(out, CompressParameters.NO_COMPRESSION);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user