Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 3b12242355 | | |
| | e8605f4956 | | |
| | f4a5b5fcbf | | |
| | 8496b48cde | | |
| | de266dcfe5 | | |
| | 10e525f0de | | |
| | e0e5e35b30 | | |
| | e1d8d1ea3b | | |
| | 1546c05dd8 | | |
| | 7c88c30ca7 | | |
| | 50427d08dc | | |
| | 338c6c5dd0 | | |
@ -29,5 +29,6 @@ dependencies {
|
|||||||
implementation("org.commonmark:commonmark:0.22.0")
|
implementation("org.commonmark:commonmark:0.22.0")
|
||||||
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||||
implementation("com.pdftron:PDFNet:10.11.0")
|
implementation("com.pdftron:PDFNet:10.11.0")
|
||||||
|
implementation("org.apache.commons:commons-text:1.12.0")
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -246,7 +246,7 @@ public class LayoutParsingPipeline {
|
|||||||
OutlineObject lastProcessedOutlineObject = null;
|
OutlineObject lastProcessedOutlineObject = null;
|
||||||
|
|
||||||
// parsing the structure elements could be useful as well
|
// parsing the structure elements could be useful as well
|
||||||
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
|
||||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -324,18 +324,19 @@ public class LayoutParsingPipeline {
|
|||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
classificationPage.setPageWidth(cropbox.getWidth());
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
classificationPage.setPageHeight(cropbox.getHeight());
|
||||||
|
|
||||||
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
|
||||||
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>());
|
||||||
|
|
||||||
OutlineObject notFoundOutlineObject = null;
|
OutlineObject notFoundOutlineObject = null;
|
||||||
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||||
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
lastProcessedOutlineObject.resetPoint();
|
||||||
notFoundOutlineObject = lastProcessedOutlineObject;
|
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||||
}
|
}
|
||||||
if (!outlineObjects.isEmpty()) {
|
if (!outlineObjects.isEmpty()) {
|
||||||
classificationPage.setOutlineObjects(outlineObjects);
|
classificationPage.setOutlineObjects(outlineObjects);
|
||||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||||
}
|
}
|
||||||
|
classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation);
|
||||||
}
|
}
|
||||||
|
|
||||||
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||||
@ -379,6 +380,12 @@ public class LayoutParsingPipeline {
|
|||||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||||
|
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||||
|
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||||
|
|||||||
@ -133,7 +133,7 @@ public abstract class BoundingBox {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean intersectsX(BoundingBox other, float threshold) {
|
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
|
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,12 +1,15 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||||
|
|
||||||
@ -29,9 +32,8 @@ public class Page {
|
|||||||
Integer height;
|
Integer height;
|
||||||
Integer width;
|
Integer width;
|
||||||
Integer rotation;
|
Integer rotation;
|
||||||
|
|
||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
List<SemanticNode> mainBody;
|
List<AtomicTextBlock> textBlocksOnPage;
|
||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
Header header;
|
Header header;
|
||||||
@EqualsAndHashCode.Exclude
|
@EqualsAndHashCode.Exclude
|
||||||
@ -53,20 +55,43 @@ public class Page {
|
|||||||
.width((int) classificationPage.getPageWidth())
|
.width((int) classificationPage.getPageWidth())
|
||||||
.number(classificationPage.getPageNumber())
|
.number(classificationPage.getPageNumber())
|
||||||
.rotation(classificationPage.getRotation())
|
.rotation(classificationPage.getRotation())
|
||||||
.mainBody(new LinkedList<>())
|
.textBlocksOnPage(new LinkedList<>())
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs and returns a {@link TextBlock} representing the concatenated text of all leaf semantic nodes in the main body.
|
||||||
|
*
|
||||||
|
* @return The main body text block.
|
||||||
|
*/
|
||||||
public TextBlock getMainBodyTextBlock() {
|
public TextBlock getMainBodyTextBlock() {
|
||||||
|
|
||||||
return mainBody.stream()
|
return textBlocksOnPage.stream()
|
||||||
.filter(SemanticNode::isLeaf)
|
|
||||||
.map(SemanticNode::getLeafTextBlock)
|
|
||||||
.collect(new TextBlockCollector());
|
.collect(new TextBlockCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<SemanticNode> getMainBody() {
|
||||||
|
|
||||||
|
return textBlocksOnPage.stream()
|
||||||
|
.map(AtomicTextBlock::getParent)
|
||||||
|
.map(this::getHighestParentOnPage)
|
||||||
|
.distinct()
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private SemanticNode getHighestParentOnPage(SemanticNode node) {
|
||||||
|
|
||||||
|
SemanticNode currentNode = node;
|
||||||
|
while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
|
||||||
|
currentNode = currentNode.getParent();
|
||||||
|
}
|
||||||
|
return currentNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
|
|||||||
@ -74,7 +74,8 @@ public interface SemanticNode {
|
|||||||
|
|
||||||
return getTextBlock().getPages()
|
return getTextBlock().getPages()
|
||||||
.stream()
|
.stream()
|
||||||
.min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
.min(Comparator.comparingInt(Page::getNumber))
|
||||||
|
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -504,4 +505,17 @@ public interface SemanticNode {
|
|||||||
|
|
||||||
void accept(NodeVisitor visitor);
|
void accept(NodeVisitor visitor);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks whether this SemanticNode appears on a single page only, and whether that page is the provided one.
|
||||||
|
*
|
||||||
|
* @param page the page to check
|
||||||
|
* @return true, when SemanticNode is on a single page only and the page is the provided page. Otherwise, false.
|
||||||
|
*/
|
||||||
|
default boolean onlyOnPage(Page page) {
|
||||||
|
|
||||||
|
Set<Page> pages = getPages();
|
||||||
|
return pages.size() == 1 && pages.contains(page);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -1,5 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -26,6 +27,9 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocume
|
|||||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@ -94,7 +98,8 @@ public class OutlineExtractorService {
|
|||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
int pageNumber = document.getPages().indexOf(page);
|
int pageNumber = document.getPages().indexOf(page) + 1;
|
||||||
|
AffineTransform userSpaceToPageCoords = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(PageInformation.fromPDPage(pageNumber, page));
|
||||||
|
|
||||||
Optional<Point2D> outlinePosition = Optional.empty();
|
Optional<Point2D> outlinePosition = Optional.empty();
|
||||||
|
|
||||||
@ -123,8 +128,15 @@ public class OutlineExtractorService {
|
|||||||
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
|
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
|
||||||
}
|
}
|
||||||
|
|
||||||
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)));
|
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title,
|
||||||
|
pageNumber,
|
||||||
|
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Point2D transformPointToPageCoords(Optional<Point2D> outlinePosition, AffineTransform userSpaceToPageCoords) {
|
||||||
|
|
||||||
|
return outlinePosition.map(point -> userSpaceToPageCoords.transform(point, null)).orElse(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,27 +1,34 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import lombok.Data;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
@Data
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class OutlineObject {
|
public class OutlineObject {
|
||||||
|
|
||||||
|
@Getter
|
||||||
private final String title;
|
private final String title;
|
||||||
|
@Getter
|
||||||
private final int pageNumber;
|
private final int pageNumber;
|
||||||
private Point2D point;
|
@Getter
|
||||||
private final int treeDepth;
|
private final int treeDepth;
|
||||||
|
|
||||||
|
private Point2D point; // java coordinates, (0, 0) is always top left
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
private boolean found;
|
private boolean found;
|
||||||
|
|
||||||
|
|
||||||
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||||
|
|
||||||
this(title, pageNumber, depth);
|
this.title = title;
|
||||||
|
this.pageNumber = pageNumber;
|
||||||
|
this.treeDepth = depth;
|
||||||
this.point = point2D;
|
this.point = point2D;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -32,4 +39,39 @@ public class OutlineObject {
|
|||||||
return "OutlineObject{" + "title='" + title + '\'' + '}';
|
return "OutlineObject{" + "title='" + title + '\'' + '}';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Optional<Point2D> getPoint() {
|
||||||
|
|
||||||
|
return Optional.ofNullable(point);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isAbove(BoundingBox boundingBox) {
|
||||||
|
|
||||||
|
if (point == null) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return point.getY() <= boundingBox.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double distance(BoundingBox boundingBox) {
|
||||||
|
|
||||||
|
if (point == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (boundingBox.getBBox().contains(point)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
double deltaX = Math.min(Math.abs(boundingBox.getMinX() - point.getX()), Math.abs(boundingBox.getMaxX() - point.getX()));
|
||||||
|
double deltaY = Math.min(Math.abs(boundingBox.getMinY() - point.getY()), Math.abs(boundingBox.getMaxY() - point.getY()));
|
||||||
|
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void resetPoint() {
|
||||||
|
|
||||||
|
this.point = null;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -39,4 +39,28 @@ public class OutlineObjectTree {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append("OutlineObjectTree(\n");
|
||||||
|
for (OutlineObjectTreeNode node : rootNodes) {
|
||||||
|
buildString(node, sb, 1);
|
||||||
|
}
|
||||||
|
sb.append(")");
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void buildString(OutlineObjectTreeNode node, StringBuilder sb, int depth) {
|
||||||
|
|
||||||
|
for (int i = 0; i < depth; i++) {
|
||||||
|
sb.append(" ");
|
||||||
|
}
|
||||||
|
sb.append(node.getOutlineObject().getTitle()).append("\n");
|
||||||
|
|
||||||
|
for (OutlineObjectTreeNode child : node.getChildren()) {
|
||||||
|
buildString(child, sb, depth + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -87,7 +87,7 @@ public class Cell extends BoundingBox {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
|
return TextNormalizationUtilities.cleanString(sb.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
|||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
@ -38,11 +39,7 @@ public class SearchableText {
|
|||||||
sb.append(word);
|
sb.append(word);
|
||||||
sb.append(' ');
|
sb.append(' ');
|
||||||
}
|
}
|
||||||
String text = sb.toString();
|
return TextNormalizationUtilities.cleanString(sb.toString());
|
||||||
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
|
|
||||||
text = TextNormalizationUtilities.removeLineBreaks(text);
|
|
||||||
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
|
|
||||||
return text;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -213,7 +213,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
previous = word;
|
previous = word;
|
||||||
}
|
}
|
||||||
|
|
||||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
return TextNormalizationUtilities.removeHyphenLinebreaks(sb.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import java.util.ListIterator;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.commons.text.similarity.LevenshteinDistance;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
@ -23,7 +24,7 @@ import lombok.Data;
|
|||||||
@Service
|
@Service
|
||||||
public class BlockificationPostprocessingService {
|
public class BlockificationPostprocessingService {
|
||||||
|
|
||||||
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
|
private static final float STRING_SIMILARITY_THRESHOLD = 0.1f;
|
||||||
|
|
||||||
|
|
||||||
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
||||||
@ -34,38 +35,36 @@ public class BlockificationPostprocessingService {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
float pageHeight = classificationPage.getPageHeight();
|
|
||||||
|
|
||||||
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
|
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
|
||||||
|
|
||||||
if (notFoundOutlineObject != null) {
|
if (notFoundOutlineObject != null) {
|
||||||
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
|
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
|
||||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
|
processTextBlocks(getTextPageBlocks(classificationPage), notFoundOutlineObjectProcessionContext);
|
||||||
|
|
||||||
OutlineObject firstOutlineObject = null;
|
OutlineObject firstOutlineObject = null;
|
||||||
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
|
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
|
||||||
if (outlineObjectListIterator.hasNext()) {
|
if (outlineObjectListIterator.hasNext()) {
|
||||||
firstOutlineObject = outlineObjectListIterator.next();
|
firstOutlineObject = outlineObjectListIterator.next();
|
||||||
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
processTextBlocks(getTextPageBlocks(classificationPage), firstOutlineObjectProcessionContext);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
|
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
|
||||||
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext, pageHeight));
|
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
|
||||||
}
|
}
|
||||||
if (firstOutlineObject != null) {
|
if (firstOutlineObject != null) {
|
||||||
// re-create the context for the updated blocks
|
// re-create the context for the updated blocks
|
||||||
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
processTextBlocks(getTextPageBlocks(classificationPage), firstOutlineObjectProcessionContext);
|
||||||
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext, pageHeight));
|
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
outlineObjectListIterator.forEachRemaining(outlineObject -> {
|
outlineObjectListIterator.forEachRemaining(outlineObject -> {
|
||||||
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
|
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
|
||||||
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
|
processTextBlocks(getTextPageBlocks(classificationPage), outlineObjectProcessionContext);
|
||||||
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext, pageHeight));
|
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!outlineObjects.isEmpty()) {
|
if (!outlineObjects.isEmpty()) {
|
||||||
@ -104,8 +103,7 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
double maxYFirst = blocksOfFirstOutline.stream()
|
double maxYFirst = blocksOfFirstOutline.stream()
|
||||||
.mapToDouble(TextPageBlock::getPdfMaxY)
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
.max()
|
.max().orElse(Double.NEGATIVE_INFINITY);
|
||||||
.orElse(Double.NEGATIVE_INFINITY);
|
|
||||||
|
|
||||||
return blocksOfNotFoundOutline.stream()
|
return blocksOfNotFoundOutline.stream()
|
||||||
.mapToDouble(TextPageBlock::getPdfMaxY)
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
@ -127,13 +125,13 @@ public class BlockificationPostprocessingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
|
private void processTextBlocks(List<TextPageBlock> textBlocks, OutlineProcessionContext context) {
|
||||||
|
|
||||||
OutlineObject outlineObject = context.getOutlineObject();
|
OutlineObject outlineObject = context.getOutlineObject();
|
||||||
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
||||||
while (iterator.hasNext()) {
|
while (iterator.hasNext()) {
|
||||||
TextPageBlock pageBlock = iterator.next();
|
TextPageBlock pageBlock = iterator.next();
|
||||||
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
|
if (outlineObject.isAbove(pageBlock)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -148,7 +146,7 @@ public class BlockificationPostprocessingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context, float pageHeight) {
|
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||||
|
|
||||||
OutlineObject outlineObject = context.outlineObject;
|
OutlineObject outlineObject = context.outlineObject;
|
||||||
TextPageBlock directMatch = context.directMatch;
|
TextPageBlock directMatch = context.directMatch;
|
||||||
@ -156,8 +154,8 @@ public class BlockificationPostprocessingService {
|
|||||||
TextPageBlock splitCandidate = context.splitCandidate;
|
TextPageBlock splitCandidate = context.splitCandidate;
|
||||||
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
|
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
|
||||||
|
|
||||||
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch, pageHeight) : Double.MAX_VALUE;
|
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
|
||||||
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate, pageHeight) : Double.MAX_VALUE;
|
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
|
||||||
|
|
||||||
double distanceToBestMergeCandidates = Double.MAX_VALUE;
|
double distanceToBestMergeCandidates = Double.MAX_VALUE;
|
||||||
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
|
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
|
||||||
@ -177,9 +175,8 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
for (List<TextPageBlock> combination : combinations) {
|
for (List<TextPageBlock> combination : combinations) {
|
||||||
double averageDistance = combination.stream()
|
double averageDistance = combination.stream()
|
||||||
.map(block -> calculateDistance(outlineObject, block, pageHeight))
|
.map(block -> calculateDistance(outlineObject, block))
|
||||||
.mapToDouble(Double::doubleValue).average()
|
.mapToDouble(Double::doubleValue).average().orElse(Double.MAX_VALUE);
|
||||||
.orElse(Double.MAX_VALUE);
|
|
||||||
if (distanceToBestMergeCandidates > averageDistance) {
|
if (distanceToBestMergeCandidates > averageDistance) {
|
||||||
distanceToBestMergeCandidates = averageDistance;
|
distanceToBestMergeCandidates = averageDistance;
|
||||||
bestMergeCandidateCombination = combination;
|
bestMergeCandidateCombination = combination;
|
||||||
@ -406,11 +403,9 @@ public class BlockificationPostprocessingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock, float pageHeight) {
|
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
|
||||||
|
|
||||||
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
|
return outlineObject.distance(pageBlock);
|
||||||
double deltaY = pageHeight - outlineObject.getPoint().getY() - pageBlock.getMinY();
|
|
||||||
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -427,6 +422,13 @@ public class BlockificationPostprocessingService {
|
|||||||
String blockText = sanitizeString(pageBlock.getText());
|
String blockText = sanitizeString(pageBlock.getText());
|
||||||
String outlineTitle = sanitizeString(outlineObject.getTitle());
|
String outlineTitle = sanitizeString(outlineObject.getTitle());
|
||||||
|
|
||||||
|
int threshold = (int) (Math.min(blockText.length(), outlineTitle.length()) * STRING_SIMILARITY_THRESHOLD) + 1;
|
||||||
|
int distance = new LevenshteinDistance(threshold).apply(blockText, outlineTitle);
|
||||||
|
if (distance >= 0 && distance < threshold) {
|
||||||
|
context.directMatch = pageBlock;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
||||||
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
||||||
|
|
||||||
|
|||||||
@ -2,19 +2,23 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
|
|||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
@SuppressWarnings("all")
|
||||||
@Service
|
@Service
|
||||||
public class DocuMineBlockificationService {
|
public class DocuMineBlockificationService {
|
||||||
|
|
||||||
@ -57,8 +61,11 @@ public class DocuMineBlockificationService {
|
|||||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
|
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
|
||||||
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
|
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold")
|
||||||
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
//
|
||||||
|
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")
|
||||||
|
|| Math.abs(prev.getFontSize() - word.getFontSize()) >= 1
|
||||||
|
|| Math.abs(word.getTextHeight() - prev.getTextHeight()) > 0.8);
|
||||||
|
|
||||||
Matcher matcher = pattern.matcher(chunkWords.stream()
|
Matcher matcher = pattern.matcher(chunkWords.stream()
|
||||||
.collect(Collectors.joining(" ")).toString());
|
.collect(Collectors.joining(" ")).toString());
|
||||||
@ -120,5 +127,77 @@ public class DocuMineBlockificationService {
|
|||||||
return new ClassificationPage(textPageBlocks);
|
return new ClassificationPage(textPageBlocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||||
|
|
||||||
|
var blocks = page.getTextBlocks();
|
||||||
|
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
AbstractPageBlock block = itty.next();
|
||||||
|
if (block == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (block instanceof TablePageBlock) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock current = (TextPageBlock) block;
|
||||||
|
|
||||||
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
|
|
||||||
|
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||||
|
if (abstractPageBlock == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (abstractPageBlock == current) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (abstractPageBlock instanceof TablePageBlock) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isHeadlineFromOutline(current) || isHeadlineFromOutline(abstractPageBlock)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||||
|
|
||||||
|
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
|
||||||
|
.equals(inner.getClassification()))) {
|
||||||
|
|
||||||
|
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||||
|
current.getSequences().addAll(inner.getSequences());
|
||||||
|
current = buildTextBlock(current.getSequences(), 0);
|
||||||
|
current.setClassification(inner.getClassification());
|
||||||
|
current.setToDuplicate(toDuplicate);
|
||||||
|
blocks.set(i, null);
|
||||||
|
itty.set(current);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var blocksIterator = blocks.iterator();
|
||||||
|
while (blocksIterator.hasNext()) {
|
||||||
|
if (blocksIterator.next() == null) {
|
||||||
|
blocksIterator.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isHeadlineFromOutline(AbstractPageBlock abstractPageBlock) {
|
||||||
|
|
||||||
|
return abstractPageBlock.getEngines().contains(LayoutEngine.OUTLINE) && abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
|
return new TextPageBlock(wordBlockList);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -74,7 +74,7 @@ public class DocuMineClassificationService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|
||||||
@ -108,7 +108,10 @@ public class DocuMineClassificationService {
|
|||||||
&& Character.isDigit(textBlock.toString().charAt(0))
|
&& Character.isDigit(textBlock.toString().charAt(0))
|
||||||
&& atLeast3Matcher.reset().find()
|
&& atLeast3Matcher.reset().find()
|
||||||
&& !textBlock.toString().contains(":") //
|
&& !textBlock.toString().contains(":") //
|
||||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") //
|
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT))
|
||||||
|
&& atLeast3Matcher.reset().find()
|
||||||
|
&& !textBlock.toString().contains(":")
|
||||||
|
&& !textBlock.toString().startsWith("(")//
|
||||||
|| textBlock.toString().startsWith("APPENDIX") //
|
|| textBlock.toString().startsWith("APPENDIX") //
|
||||||
|| textBlock.toString().startsWith("FIGURE") //
|
|| textBlock.toString().startsWith("FIGURE") //
|
||||||
|| textBlock.toString().startsWith("Continued TABLE") //
|
|| textBlock.toString().startsWith("Continued TABLE") //
|
||||||
@ -143,9 +146,9 @@ public class DocuMineClassificationService {
|
|||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
} else {
|
} else {
|
||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -6,6 +6,7 @@ import static java.util.stream.Collectors.toList;
|
|||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -15,6 +16,7 @@ import java.util.Optional;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
@ -32,7 +34,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -68,10 +72,25 @@ public class DocumentGraphFactory {
|
|||||||
documentGraph.setPages(context.pages.keySet());
|
documentGraph.setPages(context.pages.keySet());
|
||||||
documentGraph.setDocumentTree(context.documentTree);
|
documentGraph.setDocumentTree(context.documentTree);
|
||||||
documentGraph.setTextBlock(documentGraph.getTextBlock());
|
documentGraph.setTextBlock(documentGraph.getTextBlock());
|
||||||
|
addTextBlocksToPages(documentGraph);
|
||||||
|
|
||||||
return documentGraph;
|
return documentGraph;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addTextBlocksToPages(Document documentGraph) {
|
||||||
|
|
||||||
|
documentGraph.streamAllSubNodes()
|
||||||
|
.filter(SemanticNode::isLeaf)
|
||||||
|
.filter(node -> !node.getType().equals(NodeType.HEADER))
|
||||||
|
.filter(node -> !node.getType().equals(NodeType.FOOTER))
|
||||||
|
.map(SemanticNode::getTextBlock)
|
||||||
|
.map(TextBlock::getAtomicTextBlocks)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||||
|
|
||||||
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||||
@ -105,8 +124,6 @@ public class DocumentGraphFactory {
|
|||||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
page.getMainBody().add(node);
|
|
||||||
|
|
||||||
List<TextPageBlock> textBlocks = new ArrayList<>();
|
List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||||
textBlocks.add(originalTextBlock);
|
textBlocks.add(originalTextBlock);
|
||||||
textBlocks.addAll(textBlocksToMerge);
|
textBlocks.addAll(textBlocksToMerge);
|
||||||
@ -141,7 +158,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
Rectangle2D position = image.getPosition();
|
Rectangle2D position = image.getPosition();
|
||||||
Page page = context.getPage(image.getPage());
|
Page page = context.getPage(image.getPage());
|
||||||
Image imageNode = Image.builder()
|
return Image.builder()
|
||||||
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
.id(IdBuilder.buildId(Set.of(page), List.of(position)))
|
||||||
.imageType(image.getImageType())
|
.imageType(image.getImageType())
|
||||||
.position(position)
|
.position(position)
|
||||||
@ -150,8 +167,6 @@ public class DocumentGraphFactory {
|
|||||||
.representationHash(image.getRepresentation())
|
.representationHash(image.getRepresentation())
|
||||||
.documentTree(context.getDocumentTree())
|
.documentTree(context.getDocumentTree())
|
||||||
.build();
|
.build();
|
||||||
page.getMainBody().add(imageNode);
|
|
||||||
return imageNode;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -51,9 +51,6 @@ public class SectionNodeFactory {
|
|||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
|
||||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
|
||||||
|
|
||||||
AbstractSemanticNode section;
|
AbstractSemanticNode section;
|
||||||
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
|
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
|
||||||
if (isLeaf && !containsTablesAndTextBlocks) {
|
if (isLeaf && !containsTablesAndTextBlocks) {
|
||||||
@ -63,8 +60,6 @@ public class SectionNodeFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
context.getSections().add(section);
|
context.getSections().add(section);
|
||||||
blocksPerPage.keySet()
|
|
||||||
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
|
||||||
|
|
||||||
section.setTreeId(getTreeId(parentNode, context, section));
|
section.setTreeId(getTreeId(parentNode, context, section));
|
||||||
|
|
||||||
@ -242,10 +237,5 @@ public class SectionNodeFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, AbstractSemanticNode section, Integer pageNumber) {
|
|
||||||
|
|
||||||
Page page = context.getPage(pageNumber);
|
|
||||||
page.getMainBody().add(section);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -51,8 +51,6 @@ public class TableNodeFactory {
|
|||||||
.numberOfRows(mergedRows.size())
|
.numberOfRows(mergedRows.size())
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
|
||||||
|
|
||||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||||
table.setTreeId(treeId);
|
table.setTreeId(treeId);
|
||||||
addTableCells(layoutParsingType, mergedRows, table, context, document);
|
addTableCells(layoutParsingType, mergedRows, table, context, document);
|
||||||
@ -82,17 +80,6 @@ public class TableNodeFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
|
||||||
private void addTableToPage(Page page, SemanticNode parentNode, Table table) {
|
|
||||||
|
|
||||||
if (!page.getMainBody().contains(parentNode)) {
|
|
||||||
parentNode.getPages().add(page);
|
|
||||||
}
|
|
||||||
|
|
||||||
page.getMainBody().add(table);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||||
|
|
||||||
if (table.streamHeaders()
|
if (table.streamHeaders()
|
||||||
@ -107,14 +94,7 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||||
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
|
||||||
addTableCell(layoutParsingType,
|
addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context, document);
|
||||||
rows.get(rowIndex)
|
|
||||||
.get(colIndex),
|
|
||||||
rowIndex,
|
|
||||||
colIndex,
|
|
||||||
table,
|
|
||||||
context,
|
|
||||||
document);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -131,14 +111,7 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
Page page = context.getPage(cell.getPageNumber());
|
Page page = context.getPage(cell.getPageNumber());
|
||||||
|
|
||||||
TableCell tableCell = TableCell.builder()
|
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBBoxPdf()).build();
|
||||||
.documentTree(context.getDocumentTree())
|
|
||||||
.row(rowIndex)
|
|
||||||
.col(colIndex)
|
|
||||||
.header(cell.isHeaderCell())
|
|
||||||
.bBox(cell.getBBoxPdf())
|
|
||||||
.build();
|
|
||||||
page.getMainBody().add(tableCell);
|
|
||||||
|
|
||||||
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
|
||||||
tableCell.setTreeId(treeId);
|
tableCell.setTreeId(treeId);
|
||||||
@ -147,9 +120,7 @@ public class TableNodeFactory {
|
|||||||
if (cell.getTextBlocks().isEmpty()) {
|
if (cell.getTextBlocks().isEmpty()) {
|
||||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||||
} else if (cell.getTextBlocks().size() == 1) {
|
} else if (cell.getTextBlocks().size() == 1) {
|
||||||
textBlock = context.getTextBlockFactory()
|
textBlock = context.getTextBlockFactory().buildAtomicTextBlock2(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
|
||||||
.buildAtomicTextBlock2(cell.getTextBlocks()
|
|
||||||
.get(0).getSequences(), tableCell, context, page);
|
|
||||||
tableCell.setLeafTextBlock(textBlock);
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
} else if (firstTextBlockIsHeadline(cell)) {
|
} else if (firstTextBlockIsHeadline(cell)) {
|
||||||
SectionNodeFactory.addSection(layoutParsingType,
|
SectionNodeFactory.addSection(layoutParsingType,
|
||||||
@ -181,8 +152,7 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
private boolean firstTextBlockIsHeadline(Cell cell) {
|
private boolean firstTextBlockIsHeadline(Cell cell) {
|
||||||
|
|
||||||
return cell.getTextBlocks()
|
return cell.getTextBlocks().get(0).isHeadline();
|
||||||
.get(0).isHeadline();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -40,7 +40,7 @@ public class TextBlockFactory {
|
|||||||
orientation = sequences.get(0).getDir().toString();
|
orientation = sequences.get(0).getDir().toString();
|
||||||
textRotation = sequences.get(0).getDir().getRotation();
|
textRotation = sequences.get(0).getDir().getRotation();
|
||||||
}
|
}
|
||||||
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
var atb = AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
||||||
searchTextWithTextPositionDto.getLineBreaks(),
|
searchTextWithTextPositionDto.getLineBreaks(),
|
||||||
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
||||||
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
||||||
@ -53,14 +53,13 @@ public class TextBlockFactory {
|
|||||||
offset,
|
offset,
|
||||||
orientation,
|
orientation,
|
||||||
textRotation);
|
textRotation);
|
||||||
|
return atb;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||||
|
|
||||||
long idx = textBlockIdx;
|
return emptyTextBlock(parent, context.getAndIncrementTextBlockNumberOnPage(page), page);
|
||||||
textBlockIdx++;
|
|
||||||
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -68,7 +67,8 @@ public class TextBlockFactory {
|
|||||||
|
|
||||||
long idx = textBlockIdx;
|
long idx = textBlockIdx;
|
||||||
textBlockIdx++;
|
textBlockIdx++;
|
||||||
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
var atb = AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
||||||
|
return atb;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -41,7 +41,9 @@ public class DocumentGraphMapper {
|
|||||||
DocumentTree documentTree = new DocumentTree(document);
|
DocumentTree documentTree = new DocumentTree(document);
|
||||||
Context context = new Context(documentData, documentTree);
|
Context context = new Context(documentData, documentTree);
|
||||||
|
|
||||||
context.pages.addAll(Arrays.stream(documentData.getDocumentPages()).map(DocumentGraphMapper::buildPage).toList());
|
context.pages.addAll(Arrays.stream(documentData.getDocumentPages())
|
||||||
|
.map(DocumentGraphMapper::buildPage)
|
||||||
|
.toList());
|
||||||
|
|
||||||
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
|
context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentStructure().getRoot().getChildren(), context));
|
||||||
|
|
||||||
@ -59,7 +61,9 @@ public class DocumentGraphMapper {
|
|||||||
List<DocumentTree.Entry> newEntries = new LinkedList<>();
|
List<DocumentTree.Entry> newEntries = new LinkedList<>();
|
||||||
for (DocumentStructure.EntryData entryData : entries) {
|
for (DocumentStructure.EntryData entryData : entries) {
|
||||||
|
|
||||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
List<Page> pages = Arrays.stream(entryData.getPageNumbers())
|
||||||
|
.map(pageNumber -> getPage(pageNumber, context))
|
||||||
|
.toList();
|
||||||
|
|
||||||
SemanticNode node = switch (entryData.getType()) {
|
SemanticNode node = switch (entryData.getType()) {
|
||||||
case SECTION -> buildSection(context);
|
case SECTION -> buildSection(context);
|
||||||
@ -77,15 +81,16 @@ public class DocumentGraphMapper {
|
|||||||
if (entryData.getAtomicBlockIds().length > 0) {
|
if (entryData.getAtomicBlockIds().length > 0) {
|
||||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), context, node);
|
||||||
node.setLeafTextBlock(textBlock);
|
node.setLeafTextBlock(textBlock);
|
||||||
}
|
|
||||||
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed().toList();
|
|
||||||
node.setTreeId(treeId);
|
|
||||||
|
|
||||||
switch (entryData.getType()) {
|
switch (entryData.getType()) {
|
||||||
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
case HEADER -> pages.forEach(page -> page.setHeader((Header) node));
|
||||||
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node));
|
||||||
default -> pages.forEach(page -> page.getMainBody().add(node));
|
default -> textBlock.getAtomicTextBlocks()
|
||||||
|
.forEach(atb -> atb.getPage().getTextBlocksOnPage().add(atb));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
List<Integer> treeId = Arrays.stream(entryData.getTreeId()).boxed()
|
||||||
|
.toList();
|
||||||
|
node.setTreeId(treeId);
|
||||||
|
|
||||||
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build());
|
||||||
}
|
}
|
||||||
@ -142,6 +147,7 @@ public class DocumentGraphMapper {
|
|||||||
return Section.builder().documentTree(context.documentTree).build();
|
return Section.builder().documentTree(context.documentTree).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private SuperSection buildSuperSection(Context context) {
|
private SuperSection buildSuperSection(Context context) {
|
||||||
|
|
||||||
return SuperSection.builder().documentTree(context.documentTree).build();
|
return SuperSection.builder().documentTree(context.documentTree).build();
|
||||||
@ -166,7 +172,9 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||||
|
|
||||||
return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector());
|
return Arrays.stream(atomicTextBlockIds)
|
||||||
|
.map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId))
|
||||||
|
.collect(new TextBlockCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -181,7 +189,7 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
private Page buildPage(DocumentPage p) {
|
private Page buildPage(DocumentPage p) {
|
||||||
|
|
||||||
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).textBlocksOnPage(new LinkedList<>()).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -206,8 +214,10 @@ public class DocumentGraphMapper {
|
|||||||
|
|
||||||
this.documentTree = documentTree;
|
this.documentTree = documentTree;
|
||||||
this.pages = new LinkedList<>();
|
this.pages = new LinkedList<>();
|
||||||
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData()).toList();
|
this.documentTextDataBlockData = Arrays.stream(documentData.getDocumentTextData())
|
||||||
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions()).toList();
|
.toList();
|
||||||
|
this.atomicPositionBlockData = Arrays.stream(documentData.getDocumentPositions())
|
||||||
|
.toList();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -32,7 +32,6 @@ public class CoordinateTransforms {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {
|
public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {
|
||||||
|
|
||||||
@ -40,6 +39,19 @@ public class CoordinateTransforms {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public AffineTransform calculatePageCoordsToInitialUserSpaceCoords(PageInformation pageInformation) {
|
||||||
|
|
||||||
|
return calculateImageCoordsToInitialUserSpaceCoords(pageInformation, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public AffineTransform calculateInitialUserSpaceCoordsToPageCoords(PageInformation pageInformation) {
|
||||||
|
|
||||||
|
return calculatePageCoordsToInitialUserSpaceCoords(pageInformation).createInverse();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {
|
public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {
|
||||||
|
|
||||||
// PDFBox always returns page height and width based on rotation
|
// PDFBox always returns page height and width based on rotation
|
||||||
|
|||||||
@ -1,9 +1,10 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
@ -22,29 +23,83 @@ public class TableMergingUtility {
|
|||||||
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
|
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
|
||||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.filter(tablePageBlock -> !tablePageBlock.equals(originalTablePageBlock))
|
.filter(tablePageBlock -> !tablePageBlock.equals(originalTablePageBlock))
|
||||||
|
.sorted(Comparator.comparingInt(TablePageBlock::getPage).thenComparing(TablePageBlock::getY).thenComparing(TablePageBlock::getX))
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
assert consecutiveTables.size() == pageBlocks.size() - 1;
|
assert consecutiveTables.size() == pageBlocks.size() - 1;
|
||||||
|
var currentTable = originalTablePageBlock;
|
||||||
|
int currentTableIndex = 0;
|
||||||
|
|
||||||
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
|
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
|
||||||
for (TablePageBlock consecutiveTable : consecutiveTables) {
|
consecutiveTablesWithSameColCountAndHeaders.add(originalTablePageBlock);
|
||||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
for (int i = 0; i < consecutiveTables.size(); i++) {
|
||||||
consecutiveTable)) {
|
TablePageBlock consecutiveTable = consecutiveTables.get(i);
|
||||||
|
|
||||||
|
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() //
|
||||||
|
&& headersMatch(originalTablePageBlock, consecutiveTable) //
|
||||||
|
&& outerBoundaryAlignsX(originalTablePageBlock, consecutiveTable) //
|
||||||
|
&& consecutiveOrSamePage(currentTable, consecutiveTable) //
|
||||||
|
&& !tableBetween(currentTable, consecutiveTable, findTablesBetween(consecutiveTables, currentTableIndex, i))) {
|
||||||
|
|
||||||
|
currentTable = consecutiveTable;
|
||||||
|
currentTableIndex = i;
|
||||||
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
|
return consecutiveTablesWithSameColCountAndHeaders;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<TablePageBlock> findTablesBetween(List<TablePageBlock> consecutiveTables, int currentTableIndex, int i) {
|
||||||
|
|
||||||
|
if (currentTableIndex + 1 == consecutiveTables.size() || currentTableIndex + 1 >= i) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
return consecutiveTables.subList(currentTableIndex + 1, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean consecutiveOrSamePage(TablePageBlock currentTable, TablePageBlock consecutiveTable) {
|
||||||
|
|
||||||
|
return currentTable.getPage() == consecutiveTable.getPage() || currentTable.getPage() + 1 == consecutiveTable.getPage();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean tableBetween(TablePageBlock currentTable, TablePageBlock consecutiveTable, List<TablePageBlock> tablesBetween) {
|
||||||
|
|
||||||
|
if (tablesBetween.isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// assumes the tables are on the same page or on consecutive pages, all tables on pages in between are ignored.
|
||||||
|
return tablesBetween.stream()
|
||||||
|
.filter(tableBetween -> tableBetween.getPage() == currentTable.getPage())
|
||||||
|
.anyMatch(tableBetween -> tableBetween.isBelow(currentTable)) //
|
||||||
|
|| tablesBetween.stream()
|
||||||
|
.filter(tableBetween -> tableBetween.getPage() == consecutiveTable.getPage())
|
||||||
|
.anyMatch(tableBetween -> tableBetween.isAbove(consecutiveTable));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean headersMatch(TablePageBlock originalTable, TablePageBlock consecutiveTable) {
|
||||||
|
|
||||||
|
return getHeaders(consecutiveTable).isEmpty() || getHeaders(originalTable).equals(getHeaders(consecutiveTable));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
|
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
|
||||||
|
|
||||||
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
|
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD
|
||||||
|
&& Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean hasTableHeader(TablePageBlock table) {
|
private List<Cell> getHeaders(TablePageBlock table) {
|
||||||
|
|
||||||
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);
|
return table.getRows()
|
||||||
|
.stream()
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.filter(Cell::isHeaderCell)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,31 +1,40 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public final class TextNormalizationUtilities {
|
public final class TextNormalizationUtilities {
|
||||||
|
|
||||||
/**
|
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
|
||||||
* Revert hyphenation due to line breaks.
|
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
|
||||||
*
|
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
|
||||||
* @param text Text to be processed.
|
|
||||||
* @return Text without line-break hyphenation.
|
|
||||||
*/
|
|
||||||
public static String removeHyphenLineBreaks(String text) {
|
|
||||||
|
|
||||||
return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
|
|
||||||
|
public String cleanString(String value) {
|
||||||
|
|
||||||
|
String noHyphenLinebreaks = removeHyphenLinebreaks(value);
|
||||||
|
String noLinebreaks = removeLinebreaks(noHyphenLinebreaks);
|
||||||
|
return removeMultipleWhitespaces(noLinebreaks);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static String removeLineBreaks(String text) {
|
public String removeHyphenLinebreaks(String value) {
|
||||||
|
|
||||||
return text.replaceAll("\n", " ");
|
return hyphenLineBreaks.matcher(value).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static String removeRepeatingWhitespaces(String text) {
|
private String removeMultipleWhitespaces(String value) {
|
||||||
|
|
||||||
return text.replaceAll(" {2}", " ");
|
return doubleWhitespaces.matcher(value).replaceAll(" ");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private String removeLinebreaks(String value) {
|
||||||
|
|
||||||
|
return linebreaks.matcher(value).replaceAll(" ");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,11 +1,14 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.visualization;
|
package com.knecon.fforesight.service.layoutparser.processor.visualization;
|
||||||
|
|
||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Line2D;
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
@ -15,15 +18,19 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
|
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||||
|
|
||||||
@ -43,6 +50,8 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
boolean active;
|
boolean active;
|
||||||
|
|
||||||
|
Map<Integer, AtomicInteger> outlineObjectsWithoutPointsPerPage = new HashMap<>();
|
||||||
|
|
||||||
|
|
||||||
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
|
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
|
||||||
|
|
||||||
@ -151,7 +160,6 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void addLineVisualizationsFromNestedTextPosition(Collection<Set<TextPositionSequence>> lines, int pageNumber) {
|
public void addLineVisualizationsFromNestedTextPosition(Collection<Set<TextPositionSequence>> lines, int pageNumber) {
|
||||||
|
|
||||||
if (!active) {
|
if (!active) {
|
||||||
@ -168,6 +176,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
|
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
|
||||||
|
|
||||||
if (!active) {
|
if (!active) {
|
||||||
@ -254,4 +263,40 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addOutlineObjects(List<OutlineObject> outlineObjects, PageInformation pageInformation) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (OutlineObject outlineObject : outlineObjects) {
|
||||||
|
addOutlineObject(outlineObject, pageInformation);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {
|
||||||
|
|
||||||
|
int rectSize = 5;
|
||||||
|
|
||||||
|
Point2D point2D;
|
||||||
|
if (outlineObject.getPoint().isPresent()) {
|
||||||
|
point2D = outlineObject.getPoint().get();
|
||||||
|
} else {
|
||||||
|
int numberOfOutlineObjectsWithoutPoints = outlineObjectsWithoutPointsPerPage.computeIfAbsent(outlineObject.getPageNumber(), a -> new AtomicInteger(0))
|
||||||
|
.getAndIncrement();
|
||||||
|
point2D = new Point2D.Double(10, 10 + numberOfOutlineObjectsWithoutPoints * (10 + rectSize * 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
Point2D textPoint = new Point2D.Double(point2D.getX() + 2 * rectSize, point2D.getY() + rectSize);
|
||||||
|
AffineTransform pageToUserSpaceTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation);
|
||||||
|
pageToUserSpaceTransform.transform(point2D, point2D);
|
||||||
|
pageToUserSpaceTransform.transform(textPoint, textPoint);
|
||||||
|
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(outlineObject.getPageNumber(), outlineObjects);
|
||||||
|
visualizationsOnPage.getFilledRectangles()
|
||||||
|
.add(new FilledRectangle(new Rectangle2D.Double(point2D.getX() - rectSize, point2D.getY() - rectSize, rectSize * 2, rectSize * 2), OUTLINE_OBJECT_COLOR, 1));
|
||||||
|
visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? Color.BLACK : Color.RED, FONT));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,6 +15,7 @@ import java.util.Optional;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||||
@ -72,6 +73,9 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
|||||||
public void addHeadline(Headline headline) {
|
public void addHeadline(Headline headline) {
|
||||||
|
|
||||||
addAsRectangle(headline, headlines, HEADLINE_COLOR);
|
addAsRectangle(headline, headlines, HEADLINE_COLOR);
|
||||||
|
if (headline.getEngines().contains(LayoutEngine.OUTLINE)) {
|
||||||
|
addAsRectangle(headline, outlineHeadlines, HEADLINE_COLOR);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -27,7 +27,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class LayoutparserEnd2EndTest extends AbstractTest {
|
public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||||
|
|
||||||
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE;
|
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD;
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private LayoutParsingPipeline layoutParsingPipeline;
|
private LayoutParsingPipeline layoutParsingPipeline;
|
||||||
@ -37,7 +37,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@Disabled
|
@Disabled
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
String filePath = "/home/kschuettler/Downloads/55974b3de7ed2915718a10458206bbd8.ORIGIN.pdf";
|
String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-9964/17a25133-e098-4610-b553-d1bf11a56d96/560e6ab1ab4754b9a62fd2e6d4d71327/560e6ab1ab4754b9a62fd2e6d4d71327.ORIGIN.pdf";
|
||||||
|
|
||||||
runForFile(filePath);
|
runForFile(filePath);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -84,17 +84,17 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
|
|
||||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||||
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
|
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(1).size(), 1);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(2).size(), 1);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(3).size(), 1);
|
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 2);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 1);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 3);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 2);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 2);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 3);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(10).size(), 1);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(9).size(), 2);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 4);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 1);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 1);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 4);
|
||||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 2);
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 1);
|
||||||
|
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(14).size(), 2);
|
||||||
assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
|
assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
|
|||||||
@ -40,6 +40,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
|
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
|
||||||
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
|
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
|
||||||
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
|
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
|
||||||
|
public static final LayerIdentifier OUTLINE_HEADLINES = new LayerIdentifier("Outline Headlines", "OUTLINE_HEADLINES");
|
||||||
|
|
||||||
//layout grid debug
|
//layout grid debug
|
||||||
public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
|
public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
|
||||||
@ -53,6 +54,7 @@ public record LayerIdentifier(String name, String markedContentName) {
|
|||||||
public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT");
|
public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT");
|
||||||
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
||||||
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
||||||
|
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
|
||||||
|
|
||||||
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
||||||
|
|
||||||
|
|||||||
@ -30,6 +30,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
||||||
|
|
||||||
protected static final Color CELLS_COLOR = new Color(31, 214, 27);
|
protected static final Color CELLS_COLOR = new Color(31, 214, 27);
|
||||||
|
protected static final Color OUTLINE_OBJECT_COLOR = new Color(214, 27, 183);
|
||||||
|
|
||||||
protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
|
protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
|
||||||
protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
|
protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
|
||||||
@ -53,6 +54,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build();
|
protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build();
|
||||||
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
|
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
|
||||||
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
|
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
|
||||||
|
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
|
||||||
|
|
||||||
|
|
||||||
public List<Visualizations> getVisualizations() {
|
public List<Visualizations> getVisualizations() {
|
||||||
@ -66,7 +68,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
|||||||
clean_rulings, //
|
clean_rulings, //
|
||||||
cells, //
|
cells, //
|
||||||
mainBody, //
|
mainBody, //
|
||||||
markedContent //
|
markedContent, //
|
||||||
|
outlineObjects //
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -44,12 +44,12 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup {
|
|||||||
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
|
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
|
||||||
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
|
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
|
||||||
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
|
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
|
||||||
|
protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<Visualizations> getVisualizations() {
|
public List<Visualizations> getVisualizations() {
|
||||||
|
|
||||||
return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds);
|
return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds, outlineHeadlines);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -44,8 +44,8 @@ class PageContentCleanerTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testContentCleaning() {
|
public void testContentCleaning() {
|
||||||
|
|
||||||
Path file = Path.of("/tmp/OCR_TEST/402Study.pdf/viewerDocument.pdf");
|
Path file = Path.of("/home/kschuettler/Downloads/ITEM 23_Absorção cutanea.pdf");
|
||||||
File tmpFile = new File("/tmp/cleaned.pdf");
|
File tmpFile = new File("/tmp/ITEM 23_Absorção cutanea.pdf");
|
||||||
try (var in = new FileInputStream(file.toFile());//
|
try (var in = new FileInputStream(file.toFile());//
|
||||||
var doc = new PDFDoc(in);//
|
var doc = new PDFDoc(in);//
|
||||||
var out = new FileOutputStream(tmpFile);//
|
var out = new FileOutputStream(tmpFile);//
|
||||||
@ -58,7 +58,7 @@ class PageContentCleanerTest {
|
|||||||
.writer(pageWriter)
|
.writer(pageWriter)
|
||||||
.reader(reader)
|
.reader(reader)
|
||||||
.elementBuilder(builder)
|
.elementBuilder(builder)
|
||||||
.markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName()))
|
.markedContentToRemove(Set.of(LayerIdentifier.KNECON_LAYOUT.markedContentName()))
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
try (PageIterator iterator = doc.getPageIterator()) {
|
try (PageIterator iterator = doc.getPageIterator()) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user