Merge branch 'RED-9975' into 'main'

RED-9975: fix outline detection

See merge request fforesight/layout-parser!206
This commit is contained in:
Dominique Eifländer 2024-09-02 09:02:36 +02:00
commit 4395074b21
19 changed files with 235 additions and 92 deletions

View File

@ -29,5 +29,6 @@ dependencies {
implementation("org.commonmark:commonmark:0.22.0") implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0") implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0") implementation("com.pdftron:PDFNet:10.11.0")
implementation("org.apache.commons:commons-text:1.12.0")
} }

View File

@ -119,18 +119,14 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile) .map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile) .map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile) .map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
.orElse(new TableServiceResponse());
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
@ -147,20 +143,13 @@ public class LayoutParsingPipeline {
log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
documentGraph,
viewerDocumentFile,
false,
layoutParsingRequest.visualLayoutParsingFileId()
.isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
if (layoutParsingRequest.documentMarkdownFileStorageId() if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
.isPresent()) { layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentGraph));
} }
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
@ -336,17 +325,18 @@ public class LayoutParsingPipeline {
classificationPage.setPageHeight(cropbox.getHeight()); classificationPage.setPageHeight(cropbox.getHeight());
if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) { if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) {
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>()); List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>());
OutlineObject notFoundOutlineObject = null; OutlineObject notFoundOutlineObject = null;
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight())); lastProcessedOutlineObject.resetPoint();
notFoundOutlineObject = lastProcessedOutlineObject; notFoundOutlineObject = lastProcessedOutlineObject;
} }
if (!outlineObjects.isEmpty()) { if (!outlineObjects.isEmpty()) {
classificationPage.setOutlineObjects(outlineObjects); classificationPage.setOutlineObjects(outlineObjects);
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
} }
classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation);
} }
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);

View File

@ -85,7 +85,7 @@ public class Page {
private SemanticNode getHighestParentOnPage(SemanticNode node) { private SemanticNode getHighestParentOnPage(SemanticNode node) {
SemanticNode currentNode = node; SemanticNode currentNode = node;
while (currentNode.getParent().onlyOnPage(this)) { while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) {
currentNode = currentNode.getParent(); currentNode = currentNode.getParent();
} }
return currentNode; return currentNode;

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline; package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -26,6 +27,9 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocume
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -89,12 +93,13 @@ public class OutlineExtractorService {
if (page == null) { if (page == null) {
return Optional.empty(); return Optional.empty();
} }
}catch (IOException e){ } catch (IOException e) {
log.info(String.format("Error occurred during position resolution for outline item with title %s: " + e, title)); log.info(String.format("Error occurred during position resolution for outline item with title %s: " + e, title));
return Optional.empty(); return Optional.empty();
} }
int pageNumber = document.getPages().indexOf(page); int pageNumber = document.getPages().indexOf(page) + 1;
AffineTransform userSpaceToPageCoords = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(PageInformation.fromPDPage(pageNumber, page));
Optional<Point2D> outlinePosition = Optional.empty(); Optional<Point2D> outlinePosition = Optional.empty();
@ -123,8 +128,15 @@ public class OutlineExtractorService {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title)); log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
} }
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth))); return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title,
pageNumber,
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
}
private static Point2D transformPointToPageCoords(Optional<Point2D> outlinePosition, AffineTransform userSpaceToPageCoords) {
return outlinePosition.map(point -> userSpaceToPageCoords.transform(point, null)).orElse(null);
} }

View File

@ -1,27 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline; package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.util.Optional;
import lombok.AllArgsConstructor; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import lombok.Data;
import lombok.RequiredArgsConstructor; import lombok.Getter;
import lombok.Setter;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
public class OutlineObject { public class OutlineObject {
@Getter
private final String title; private final String title;
@Getter
private final int pageNumber; private final int pageNumber;
private Point2D point; @Getter
private final int treeDepth; private final int treeDepth;
private Point2D point; // java coordinates, (0, 0) is always top left
@Getter
@Setter
private boolean found; private boolean found;
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) { public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
this(title, pageNumber, depth); this.title = title;
this.pageNumber = pageNumber;
this.treeDepth = depth;
this.point = point2D; this.point = point2D;
} }
@ -32,4 +39,39 @@ public class OutlineObject {
return "OutlineObject{" + "title='" + title + '\'' + '}'; return "OutlineObject{" + "title='" + title + '\'' + '}';
} }
/**
 * Returns the target point of this outline entry, if one is known.
 * <p>
 * The returned {@link Point2D} is the instance held by this object, not a copy, so callers must not modify it.
 *
 * @return the point wrapped in an {@link Optional}, or an empty Optional when no point is set
 */
public Optional<Point2D> getPoint() {

    if (point == null) {
        return Optional.empty();
    }
    return Optional.of(point);
}
/**
 * Checks whether this outline's point lies at or before the lower edge of the given bounding box.
 * <p>
 * An outline without a point is treated as being above every bounding box.
 *
 * @param boundingBox the box to compare against
 * @return {@code true} if no point is set or the point's y value is not greater than the box's max y
 */
public boolean isAbove(BoundingBox boundingBox) {

    return point == null || point.getY() <= boundingBox.getMaxY();
}
/**
 * Calculates the Euclidean distance between this outline's point and the given bounding box.
 * <p>
 * The distance is measured to the closest location on the rectangle (edge or corner), so a point that sits
 * directly above, below, left or right of the box only contributes the gap along that single axis.
 * Previously the distance to the nearest corner was returned, which overestimated the distance for such
 * points and made the value jump as soon as the point left the box.
 *
 * @param boundingBox the box to measure against; assumed to satisfy minX &lt;= maxX and minY &lt;= maxY
 * @return {@code 0} if no point is set or the point lies inside the box, otherwise the distance to the box
 */
public double distance(BoundingBox boundingBox) {

    if (point == null) {
        return 0;
    }
    if (boundingBox.getBBox().contains(point)) {
        return 0;
    }

    // Per axis the gap is zero while the point is within the box's extent on that axis,
    // otherwise it is the distance to the closer edge.
    double deltaX = Math.max(0.0, Math.max(boundingBox.getMinX() - point.getX(), point.getX() - boundingBox.getMaxX()));
    double deltaY = Math.max(0.0, Math.max(boundingBox.getMinY() - point.getY(), point.getY() - boundingBox.getMaxY()));

    return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
}
/**
 * Discards the stored point, leaving this outline entry without a known position.
 */
public void resetPoint() {

    point = null;
}
} }

View File

@ -39,4 +39,28 @@ public class OutlineObjectTree {
} }
} }
/**
 * Renders the tree as text: a header line, one line per node title (indented by depth), and a closing parenthesis.
 *
 * @return the textual representation of all root nodes and their descendants
 */
@Override
public String toString() {

    StringBuilder treeText = new StringBuilder("OutlineObjectTree(\n");
    for (OutlineObjectTreeNode rootNode : rootNodes) {
        // root nodes start at depth 1
        buildString(rootNode, treeText, 1);
    }
    return treeText.append(")").toString();
}
/**
 * Appends the title of the given node, prefixed by one indent unit per depth level, followed by all of its
 * children rendered recursively one level deeper.
 *
 * @param node  the node to render
 * @param sb    the builder receiving the output
 * @param depth the number of indent units written before the title
 */
private void buildString(OutlineObjectTreeNode node, StringBuilder sb, int depth) {

    int remainingIndent = depth;
    while (remainingIndent-- > 0) {
        sb.append(" ");
    }
    sb.append(node.getOutlineObject().getTitle());
    sb.append("\n");

    int childDepth = depth + 1;
    for (OutlineObjectTreeNode childNode : node.getChildren()) {
        buildString(childNode, sb, childDepth);
    }
}
} }

View File

@ -87,7 +87,7 @@ public class Cell extends BoundingBox {
} }
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " "); return TextNormalizationUtilities.cleanString(sb.toString());
} }

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Getter; import lombok.Getter;
@ -38,11 +39,7 @@ public class SearchableText {
sb.append(word); sb.append(word);
sb.append(' '); sb.append(' ');
} }
String text = sb.toString(); return TextNormalizationUtilities.cleanString(sb.toString());
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
text = TextNormalizationUtilities.removeLineBreaks(text);
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
return text;
} }
} }

View File

@ -213,7 +213,7 @@ public class TextPageBlock extends AbstractPageBlock {
previous = word; previous = word;
} }
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); return TextNormalizationUtilities.removeHyphenLinebreaks(sb.toString());
} }

View File

@ -8,6 +8,7 @@ import java.util.ListIterator;
import java.util.Locale; import java.util.Locale;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
@ -23,7 +24,7 @@ import lombok.Data;
@Service @Service
public class BlockificationPostprocessingService { public class BlockificationPostprocessingService {
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f; private static final float STRING_SIMILARITY_THRESHOLD = 0.1f;
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
@ -34,38 +35,36 @@ public class BlockificationPostprocessingService {
return null; return null;
} }
float pageHeight = classificationPage.getPageHeight();
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator(); ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
if (notFoundOutlineObject != null) { if (notFoundOutlineObject != null) {
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject); OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext); processTextBlocks(getTextPageBlocks(classificationPage), notFoundOutlineObjectProcessionContext);
OutlineObject firstOutlineObject = null; OutlineObject firstOutlineObject = null;
OutlineProcessionContext firstOutlineObjectProcessionContext = null; OutlineProcessionContext firstOutlineObjectProcessionContext = null;
if (outlineObjectListIterator.hasNext()) { if (outlineObjectListIterator.hasNext()) {
firstOutlineObject = outlineObjectListIterator.next(); firstOutlineObject = outlineObjectListIterator.next();
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); processTextBlocks(getTextPageBlocks(classificationPage), firstOutlineObjectProcessionContext);
} }
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) { if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext, pageHeight)); notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
} }
if (firstOutlineObject != null) { if (firstOutlineObject != null) {
// re-create the context for the updated blocks // re-create the context for the updated blocks
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); processTextBlocks(getTextPageBlocks(classificationPage), firstOutlineObjectProcessionContext);
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext, pageHeight)); firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
} }
} }
outlineObjectListIterator.forEachRemaining(outlineObject -> { outlineObjectListIterator.forEachRemaining(outlineObject -> {
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject); OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext); processTextBlocks(getTextPageBlocks(classificationPage), outlineObjectProcessionContext);
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext, pageHeight)); outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
}); });
if (!outlineObjects.isEmpty()) { if (!outlineObjects.isEmpty()) {
@ -104,8 +103,7 @@ public class BlockificationPostprocessingService {
double maxYFirst = blocksOfFirstOutline.stream() double maxYFirst = blocksOfFirstOutline.stream()
.mapToDouble(TextPageBlock::getPdfMaxY) .mapToDouble(TextPageBlock::getPdfMaxY)
.max() .max().orElse(Double.NEGATIVE_INFINITY);
.orElse(Double.NEGATIVE_INFINITY);
return blocksOfNotFoundOutline.stream() return blocksOfNotFoundOutline.stream()
.mapToDouble(TextPageBlock::getPdfMaxY) .mapToDouble(TextPageBlock::getPdfMaxY)
@ -127,13 +125,13 @@ public class BlockificationPostprocessingService {
} }
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) { private void processTextBlocks(List<TextPageBlock> textBlocks, OutlineProcessionContext context) {
OutlineObject outlineObject = context.getOutlineObject(); OutlineObject outlineObject = context.getOutlineObject();
ListIterator<TextPageBlock> iterator = textBlocks.listIterator(); ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
while (iterator.hasNext()) { while (iterator.hasNext()) {
TextPageBlock pageBlock = iterator.next(); TextPageBlock pageBlock = iterator.next();
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) { if (outlineObject.isAbove(pageBlock)) {
break; break;
} }
} }
@ -148,7 +146,7 @@ public class BlockificationPostprocessingService {
} }
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context, float pageHeight) { private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
OutlineObject outlineObject = context.outlineObject; OutlineObject outlineObject = context.outlineObject;
TextPageBlock directMatch = context.directMatch; TextPageBlock directMatch = context.directMatch;
@ -156,8 +154,8 @@ public class BlockificationPostprocessingService {
TextPageBlock splitCandidate = context.splitCandidate; TextPageBlock splitCandidate = context.splitCandidate;
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth()); PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch, pageHeight) : Double.MAX_VALUE; double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate, pageHeight) : Double.MAX_VALUE; double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
double distanceToBestMergeCandidates = Double.MAX_VALUE; double distanceToBestMergeCandidates = Double.MAX_VALUE;
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>(); List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
@ -177,9 +175,8 @@ public class BlockificationPostprocessingService {
for (List<TextPageBlock> combination : combinations) { for (List<TextPageBlock> combination : combinations) {
double averageDistance = combination.stream() double averageDistance = combination.stream()
.map(block -> calculateDistance(outlineObject, block, pageHeight)) .map(block -> calculateDistance(outlineObject, block))
.mapToDouble(Double::doubleValue).average() .mapToDouble(Double::doubleValue).average().orElse(Double.MAX_VALUE);
.orElse(Double.MAX_VALUE);
if (distanceToBestMergeCandidates > averageDistance) { if (distanceToBestMergeCandidates > averageDistance) {
distanceToBestMergeCandidates = averageDistance; distanceToBestMergeCandidates = averageDistance;
bestMergeCandidateCombination = combination; bestMergeCandidateCombination = combination;
@ -406,11 +403,9 @@ public class BlockificationPostprocessingService {
} }
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock, float pageHeight) { private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX(); return outlineObject.distance(pageBlock);
double deltaY = pageHeight - outlineObject.getPoint().getY() - pageBlock.getMinY();
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
} }
@ -427,6 +422,13 @@ public class BlockificationPostprocessingService {
String blockText = sanitizeString(pageBlock.getText()); String blockText = sanitizeString(pageBlock.getText());
String outlineTitle = sanitizeString(outlineObject.getTitle()); String outlineTitle = sanitizeString(outlineObject.getTitle());
int threshold = (int) (Math.min(blockText.length(), outlineTitle.length()) * STRING_SIMILARITY_THRESHOLD) + 1;
int distance = new LevenshteinDistance(threshold).apply(blockText, outlineTitle);
if (distance >= 0 && distance < threshold) {
context.directMatch = pageBlock;
return true;
}
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle); boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText); boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);

View File

@ -32,7 +32,6 @@ public class CoordinateTransforms {
} }
@SneakyThrows @SneakyThrows
public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) { public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {
@ -40,6 +39,19 @@ public class CoordinateTransforms {
} }
/**
 * Builds the transform from page coordinates to initial user space coordinates.
 * <p>
 * This is the image-to-user-space transform with a scaling factor of 1.
 *
 * @param pageInformation the page the transform is calculated for
 * @return the transform returned by the image-coordinate variant for scaling factor 1
 */
public AffineTransform calculatePageCoordsToInitialUserSpaceCoords(PageInformation pageInformation) {

    final int unscaled = 1;
    return calculateImageCoordsToInitialUserSpaceCoords(pageInformation, unscaled);
}
/**
 * Builds the transform from initial user space coordinates to page coordinates by inverting the
 * page-to-user-space transform.
 * <p>
 * The checked exception declared by {@link AffineTransform#createInverse()} is rethrown unchecked via
 * {@code @SneakyThrows}.
 *
 * @param pageInformation the page the transform is calculated for
 * @return the inverse of the page-to-user-space transform
 */
@SneakyThrows
public AffineTransform calculateInitialUserSpaceCoordsToPageCoords(PageInformation pageInformation) {

    AffineTransform pageToUserSpace = calculatePageCoordsToInitialUserSpaceCoords(pageInformation);
    return pageToUserSpace.createInverse();
}
public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) { public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {
// PDFBox always returns page height and width based on rotation // PDFBox always returns page height and width based on rotation

View File

@ -1,31 +1,40 @@
package com.knecon.fforesight.service.layoutparser.processor.utils; package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public final class TextNormalizationUtilities { public final class TextNormalizationUtilities {
/** public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
* Revert hyphenation due to line breaks. public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
* public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
* @param text Text to be processed.
* @return Text without line-break hyphenation.
*/
public static String removeHyphenLineBreaks(String text) {
return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
public String cleanString(String value) {
String noHyphenLinebreaks = removeHyphenLinebreaks(value);
String noLinebreaks = removeLinebreaks(noHyphenLinebreaks);
return removeMultipleWhitespaces(noLinebreaks);
} }
public static String removeLineBreaks(String text) { public String removeHyphenLinebreaks(String value) {
return text.replaceAll("\n", " "); return hyphenLineBreaks.matcher(value).replaceAll("");
} }
public static String removeRepeatingWhitespaces(String text) { private String removeMultipleWhitespaces(String value) {
return text.replaceAll(" {2}", " "); return doubleWhitespaces.matcher(value).replaceAll(" ");
} }
/**
 * Replaces every match of the {@code linebreaks} pattern in the given text with the replacement string.
 *
 * @param value the text to process
 * @return the text with all line-break matches replaced
 */
private String removeLinebreaks(String value) {

    Matcher linebreakMatcher = linebreaks.matcher(value);
    return linebreakMatcher.replaceAll(" ");
}
} }

View File

@ -1,11 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.visualization; package com.knecon.fforesight.service.layoutparser.processor.visualization;
import java.awt.Color; import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D; import java.awt.geom.Line2D;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
@ -15,15 +18,19 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig; import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText; import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
@ -43,6 +50,8 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
boolean active; boolean active;
Map<Integer, AtomicInteger> outlineObjectsWithoutPointsPerPage = new HashMap<>();
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) { public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
@ -151,7 +160,6 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
} }
public void addLineVisualizationsFromNestedTextPosition(Collection<Set<TextPositionSequence>> lines, int pageNumber) { public void addLineVisualizationsFromNestedTextPosition(Collection<Set<TextPositionSequence>> lines, int pageNumber) {
if (!active) { if (!active) {
@ -168,6 +176,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
} }
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) { public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
if (!active) { if (!active) {
@ -254,4 +263,40 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
} }
/**
 * Adds a debug visualization for each of the given outline objects.
 * <p>
 * Does nothing while this layer is not active.
 *
 * @param outlineObjects  the outline objects to visualize
 * @param pageInformation the page information used to place the visualizations
 */
public void addOutlineObjects(List<OutlineObject> outlineObjects, PageInformation pageInformation) {

    if (active) {
        outlineObjects.forEach(outline -> addOutlineObject(outline, pageInformation));
    }
}
/**
 * Draws a single outline object as a small filled square with its title next to it.
 * <p>
 * The title is drawn in black when the outline object was found and in red otherwise. Outline objects
 * without a point get a substitute position; a per-page counter offsets each further one so they do not overlap.
 * <p>
 * The outline object's own point is never modified: {@link OutlineObject#getPoint()} exposes the stored
 * instance, so the transform writes into new points and leaves the model untouched.
 *
 * @param outlineObject   the outline object to visualize
 * @param pageInformation the page information used to convert page coordinates to initial user space
 */
private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) {

    int rectSize = 5;

    Point2D pagePoint = outlineObject.getPoint()
            .orElseGet(() -> {
                // no position known: stack the markers, one slot per already placed point-less outline on this page
                int numberOfOutlineObjectsWithoutPoints = outlineObjectsWithoutPointsPerPage.computeIfAbsent(outlineObject.getPageNumber(), a -> new AtomicInteger(0))
                        .getAndIncrement();
                return new Point2D.Double(10, 10 + numberOfOutlineObjectsWithoutPoints * (10 + rectSize * 2));
            });
    Point2D pageTextPoint = new Point2D.Double(pagePoint.getX() + 2 * rectSize, pagePoint.getY() + rectSize);

    AffineTransform pageToUserSpaceTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation);
    // passing null as destination makes transform() allocate a new point and leaves the source untouched
    Point2D point2D = pageToUserSpaceTransform.transform(pagePoint, null);
    Point2D textPoint = pageToUserSpaceTransform.transform(pageTextPoint, null);

    VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(outlineObject.getPageNumber(), outlineObjects);
    visualizationsOnPage.getFilledRectangles()
            .add(new FilledRectangle(new Rectangle2D.Double(point2D.getX() - rectSize, point2D.getY() - rectSize, rectSize * 2, rectSize * 2), OUTLINE_OBJECT_COLOR, 1));
    visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? Color.BLACK : Color.RED, FONT));
}
} }

View File

@ -15,6 +15,7 @@ import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
@ -72,6 +73,9 @@ public class LayoutGrid extends LayoutGridLayerConfig {
public void addHeadline(Headline headline) { public void addHeadline(Headline headline) {
addAsRectangle(headline, headlines, HEADLINE_COLOR); addAsRectangle(headline, headlines, HEADLINE_COLOR);
if (headline.getEngines().contains(LayoutEngine.OUTLINE)) {
addAsRectangle(headline, outlineHeadlines, HEADLINE_COLOR);
}
} }

View File

@ -84,17 +84,17 @@ public class OutlineDetectionTest extends AbstractTest {
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
assertEquals(outlineObjectTree.getRootNodes().size(), 8); assertEquals(outlineObjectTree.getRootNodes().size(), 8);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(1).size(), 1); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(2).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(3).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 2); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 3); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 2); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 3);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(10).size(), 1); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(9).size(), 2);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 4); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 1); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 4);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 2); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 1);
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(14).size(), 2);
assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values() assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
.stream() .stream()
.flatMap(Collection::stream) .flatMap(Collection::stream)

View File

@ -40,6 +40,7 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES"); public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES"); public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs"); public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
public static final LayerIdentifier OUTLINE_HEADLINES = new LayerIdentifier("Outline Headlines", "OUTLINE_HEADLINES");
//layout grid debug //layout grid debug
public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT"); public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
@ -53,6 +54,7 @@ public record LayerIdentifier(String name, String markedContentName) {
public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT"); public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT");
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS"); public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS"); public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS");
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING"); public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");

View File

@ -30,6 +30,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6); protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
protected static final Color CELLS_COLOR = new Color(31, 214, 27); protected static final Color CELLS_COLOR = new Color(31, 214, 27);
protected static final Color OUTLINE_OBJECT_COLOR = new Color(214, 27, 183);
protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6); protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6); protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
@ -53,6 +54,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build(); protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build();
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build(); protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build(); protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build();
public List<Visualizations> getVisualizations() { public List<Visualizations> getVisualizations() {
@ -66,7 +68,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup {
clean_rulings, // clean_rulings, //
cells, // cells, //
mainBody, // mainBody, //
markedContent // markedContent, //
outlineObjects //
); );
} }

View File

@ -44,12 +44,12 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup {
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build(); protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build(); protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build(); protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build();
@Override @Override
public List<Visualizations> getVisualizations() { public List<Visualizations> getVisualizations() {
return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds); return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds, outlineHeadlines);
} }
} }

View File

@ -44,8 +44,8 @@ class PageContentCleanerTest {
@SneakyThrows @SneakyThrows
public void testContentCleaning() { public void testContentCleaning() {
Path file = Path.of("/tmp/OCR_TEST/402Study.pdf/viewerDocument.pdf"); Path file = Path.of("/home/kschuettler/Downloads/ITEM 23_Absorção cutanea.pdf");
File tmpFile = new File("/tmp/cleaned.pdf"); File tmpFile = new File("/tmp/ITEM 23_Absorção cutanea.pdf");
try (var in = new FileInputStream(file.toFile());// try (var in = new FileInputStream(file.toFile());//
var doc = new PDFDoc(in);// var doc = new PDFDoc(in);//
var out = new FileOutputStream(tmpFile);// var out = new FileOutputStream(tmpFile);//
@ -58,7 +58,7 @@ class PageContentCleanerTest {
.writer(pageWriter) .writer(pageWriter)
.reader(reader) .reader(reader)
.elementBuilder(builder) .elementBuilder(builder)
.markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName())) .markedContentToRemove(Set.of(LayerIdentifier.KNECON_LAYOUT.markedContentName()))
.build(); .build();
try (PageIterator iterator = doc.getPageIterator()) { try (PageIterator iterator = doc.getPageIterator()) {