diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index 49de8b3..b981030 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -29,5 +29,6 @@ dependencies { implementation("org.commonmark:commonmark:0.22.0") implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0") implementation("com.pdftron:PDFNet:10.11.0") + implementation("org.apache.commons:commons-text:1.12.0") } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index bb3e95d..ebceb94 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -119,18 +119,14 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) - .orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() - .map(layoutParsingStorageService::getVisualLayoutParsingFile) - .orElse(new VisualLayoutParsingResponse()); + .map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse()); ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() - .map(layoutParsingStorageService::getImagesFile) - .orElse(new ImageServiceResponse()); + .map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse()); TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() - .map(layoutParsingStorageService::getTablesFile) - .orElse(new TableServiceResponse()); + .map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse()); ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), @@ -147,20 +143,13 @@ public class LayoutParsingPipeline { log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - layoutGridService.addLayoutGrid(viewerDocumentFile, - documentGraph, - viewerDocumentFile, - false, - layoutParsingRequest.visualLayoutParsingFileId() - .isPresent()); + layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); - if (layoutParsingRequest.documentMarkdownFileStorageId() - .isPresent()) { - layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId() - .get(), new MarkdownMapper().toMarkdownContent(documentGraph)); + if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) { + layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph)); } layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); @@ -336,17 +325,18 @@ public class LayoutParsingPipeline { classificationPage.setPageHeight(cropbox.getHeight()); if (layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD) { - List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>()); + List outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber, new ArrayList<>()); OutlineObject notFoundOutlineObject = null; if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) { - lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight())); + lastProcessedOutlineObject.resetPoint(); notFoundOutlineObject = lastProcessedOutlineObject; } if (!outlineObjects.isEmpty()) { classificationPage.setOutlineObjects(outlineObjects); lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject); } + classificationDocument.getLayoutDebugLayer().addOutlineObjects(outlineObjects, pageInformation); } classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java index 0a11ffe..ad3d248 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Page.java @@ -85,7 +85,7 @@ public class Page { private SemanticNode getHighestParentOnPage(SemanticNode node) { SemanticNode currentNode = node; - while (currentNode.getParent().onlyOnPage(this)) { + while (currentNode.hasParent() && currentNode.getParent().onlyOnPage(this)) { currentNode = currentNode.getParent(); } return currentNode; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java index 99a34c4..80f2370 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; +import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.io.IOException; import java.util.ArrayList; @@ -26,6 +27,9 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocume import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; + import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -89,12 +93,13 @@ public class OutlineExtractorService { if (page == null) { return Optional.empty(); } - }catch (IOException e){ + } catch (IOException e) { log.info(String.format("Error occurred during position resolution for outline item with title %s: " + e, title)); return Optional.empty(); } - int pageNumber = document.getPages().indexOf(page); + int pageNumber = document.getPages().indexOf(page) + 1; + AffineTransform userSpaceToPageCoords = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(PageInformation.fromPDPage(pageNumber, page)); Optional outlinePosition = Optional.empty(); @@ -123,8 +128,15 @@ public class OutlineExtractorService { log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title)); } - return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth))); + return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, + pageNumber, + transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth))); + } + + private static Point2D transformPointToPageCoords(Optional outlinePosition, AffineTransform userSpaceToPageCoords) { + + return outlinePosition.map(point -> userSpaceToPageCoords.transform(point, null)).orElse(null); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java index 6f8af6b..72fdbef 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObject.java @@ -1,27 +1,34 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.awt.geom.Point2D; +import java.util.Optional; -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.RequiredArgsConstructor; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; + +import lombok.Getter; +import lombok.Setter; -@Data -@RequiredArgsConstructor -@AllArgsConstructor public class OutlineObject { + @Getter private final String title; + @Getter private final int pageNumber; - private Point2D point; + @Getter private final int treeDepth; + private Point2D point; // java coordinates, (0, 0) is always top left + + @Getter + @Setter private boolean found; public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) { - this(title, pageNumber, depth); + this.title = title; + this.pageNumber = pageNumber; + this.treeDepth = depth; this.point = point2D; } @@ -32,4 +39,39 @@ public class OutlineObject { return "OutlineObject{" + "title='" + title + '\'' + '}'; } + + public Optional getPoint() { + + return Optional.ofNullable(point); + } + + + public boolean isAbove(BoundingBox boundingBox) { + + if (point == null) { + return true; + } + return point.getY() <= boundingBox.getMaxY(); + } + + + public double distance(BoundingBox boundingBox) { + + if (point == null) { + return 0; + } + if (boundingBox.getBBox().contains(point)) { + return 0; + } + double deltaX = Math.min(Math.abs(boundingBox.getMinX() - point.getX()), Math.abs(boundingBox.getMaxX() - point.getX())); + double deltaY = Math.min(Math.abs(boundingBox.getMinY() - point.getY()), Math.abs(boundingBox.getMaxY() - point.getY())); + return Math.sqrt(deltaX * deltaX + deltaY * deltaY); + } + + + public void resetPoint() { + + this.point = null; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java index 61b0dd8..767b846 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineObjectTree.java @@ -39,4 +39,28 @@ public class OutlineObjectTree { } } + @Override + public String toString() { + + StringBuilder sb = new StringBuilder(); + sb.append("OutlineObjectTree(\n"); + for (OutlineObjectTreeNode node : rootNodes) { + buildString(node, sb, 1); + } + sb.append(")"); + return sb.toString(); + } + + private void buildString(OutlineObjectTreeNode node, StringBuilder sb, int depth) { + + for (int i = 0; i < depth; i++) { + sb.append(" "); + } + sb.append(node.getOutlineObject().getTitle()).append("\n"); + + for (OutlineObjectTreeNode child : node.getChildren()) { + buildString(child, sb, depth + 1); + } + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 80243e4..fd1e4de 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -87,7 +87,7 @@ public class Cell extends BoundingBox { } - return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " "); + return TextNormalizationUtilities.cleanString(sb.toString()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java index c0ef4e3..e8f6a39 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.ArrayList; import java.util.List; + import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Getter; @@ -38,11 +39,7 @@ public class SearchableText { sb.append(word); sb.append(' '); } - String text = sb.toString(); - text = TextNormalizationUtilities.removeHyphenLineBreaks(text); - text = TextNormalizationUtilities.removeLineBreaks(text); - text = TextNormalizationUtilities.removeRepeatingWhitespaces(text); - return text; + return TextNormalizationUtilities.cleanString(sb.toString()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index a6367fe..cc0c7cd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -213,7 +213,7 @@ public class TextPageBlock extends AbstractPageBlock { previous = word; } - return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); + return TextNormalizationUtilities.removeHyphenLinebreaks(sb.toString()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 0878584..08b5d83 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -8,6 +8,7 @@ import java.util.ListIterator; import java.util.Locale; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.similarity.LevenshteinDistance; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; @@ -23,7 +24,7 @@ import lombok.Data; @Service public class BlockificationPostprocessingService { - private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f; + private static final float STRING_SIMILARITY_THRESHOLD = 0.1f; public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) { @@ -34,38 +35,36 @@ public class BlockificationPostprocessingService { return null; } - float pageHeight = classificationPage.getPageHeight(); - ListIterator outlineObjectListIterator = outlineObjects.listIterator(); if (notFoundOutlineObject != null) { OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject); - processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext); + processTextBlocks(getTextPageBlocks(classificationPage), notFoundOutlineObjectProcessionContext); OutlineObject firstOutlineObject = null; OutlineProcessionContext firstOutlineObjectProcessionContext = null; if (outlineObjectListIterator.hasNext()) { firstOutlineObject = outlineObjectListIterator.next(); firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); - processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); + processTextBlocks(getTextPageBlocks(classificationPage), firstOutlineObjectProcessionContext); } if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) { - notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext, pageHeight)); + notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext)); } if (firstOutlineObject != null) { // re-create the context for the updated blocks firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject); - processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext); - firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext, pageHeight)); + processTextBlocks(getTextPageBlocks(classificationPage), firstOutlineObjectProcessionContext); + firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext)); } } outlineObjectListIterator.forEachRemaining(outlineObject -> { OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject); - processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext); - outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext, pageHeight)); + processTextBlocks(getTextPageBlocks(classificationPage), outlineObjectProcessionContext); + outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext)); }); if (!outlineObjects.isEmpty()) { @@ -104,8 +103,7 @@ public class BlockificationPostprocessingService { double maxYFirst = blocksOfFirstOutline.stream() .mapToDouble(TextPageBlock::getPdfMaxY) - .max() - .orElse(Double.NEGATIVE_INFINITY); + .max().orElse(Double.NEGATIVE_INFINITY); return blocksOfNotFoundOutline.stream() .mapToDouble(TextPageBlock::getPdfMaxY) @@ -127,13 +125,13 @@ public class BlockificationPostprocessingService { } - private void processTextBlocks(List textBlocks, float pageHeight, OutlineProcessionContext context) { + private void processTextBlocks(List textBlocks, OutlineProcessionContext context) { OutlineObject outlineObject = context.getOutlineObject(); ListIterator iterator = textBlocks.listIterator(); while (iterator.hasNext()) { TextPageBlock pageBlock = iterator.next(); - if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) { + if (outlineObject.isAbove(pageBlock)) { break; } } @@ -148,7 +146,7 @@ public class BlockificationPostprocessingService { } - private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context, float pageHeight) { + private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) { OutlineObject outlineObject = context.outlineObject; TextPageBlock directMatch = context.directMatch; @@ -156,8 +154,8 @@ public class BlockificationPostprocessingService { TextPageBlock splitCandidate = context.splitCandidate; PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth()); - double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch, pageHeight) : Double.MAX_VALUE; - double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate, pageHeight) : Double.MAX_VALUE; + double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE; + double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE; double distanceToBestMergeCandidates = Double.MAX_VALUE; List bestMergeCandidateCombination = new ArrayList<>(); @@ -177,9 +175,8 @@ public class BlockificationPostprocessingService { for (List combination : combinations) { double averageDistance = combination.stream() - .map(block -> calculateDistance(outlineObject, block, pageHeight)) - .mapToDouble(Double::doubleValue).average() - .orElse(Double.MAX_VALUE); + .map(block -> calculateDistance(outlineObject, block)) + .mapToDouble(Double::doubleValue).average().orElse(Double.MAX_VALUE); if (distanceToBestMergeCandidates > averageDistance) { distanceToBestMergeCandidates = averageDistance; bestMergeCandidateCombination = combination; @@ -406,11 +403,9 @@ public class BlockificationPostprocessingService { } - private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock, float pageHeight) { + private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) { - double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX(); - double deltaY = pageHeight - outlineObject.getPoint().getY() - pageBlock.getMinY(); - return Math.sqrt(deltaX * deltaX + deltaY * deltaY); + return outlineObject.distance(pageBlock); } @@ -427,6 +422,13 @@ public class BlockificationPostprocessingService { String blockText = sanitizeString(pageBlock.getText()); String outlineTitle = sanitizeString(outlineObject.getTitle()); + int threshold = (int) (Math.min(blockText.length(), outlineTitle.length()) * STRING_SIMILARITY_THRESHOLD) + 1; + int distance = new LevenshteinDistance(threshold).apply(blockText, outlineTitle); + if (distance >= 0 && distance < threshold) { + context.directMatch = pageBlock; + return true; + } + boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle); boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java index fbd540d..ce3f99f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java @@ -32,7 +32,6 @@ public class CoordinateTransforms { } - @SneakyThrows public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) { @@ -40,6 +39,19 @@ public class CoordinateTransforms { } + public AffineTransform calculatePageCoordsToInitialUserSpaceCoords(PageInformation pageInformation) { + + return calculateImageCoordsToInitialUserSpaceCoords(pageInformation, 1); + } + + + @SneakyThrows + public AffineTransform calculateInitialUserSpaceCoordsToPageCoords(PageInformation pageInformation) { + + return calculatePageCoordsToInitialUserSpaceCoords(pageInformation).createInverse(); + } + + public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) { // PDFBox always returns page height and width based on rotation diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java index 9f90bee..8c26e93 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java @@ -1,31 +1,40 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import lombok.experimental.UtilityClass; @UtilityClass public final class TextNormalizationUtilities { - /** - * Revert hyphenation due to line breaks. - * - * @param text Text to be processed. - * @return Text without line-break hyphenation. - */ - public static String removeHyphenLineBreaks(String text) { + public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+"); + public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+"); + public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}"); - return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1"); + + public String cleanString(String value) { + + String noHyphenLinebreaks = removeHyphenLinebreaks(value); + String noLinebreaks = removeLinebreaks(noHyphenLinebreaks); + return removeMultipleWhitespaces(noLinebreaks); } - public static String removeLineBreaks(String text) { + public String removeHyphenLinebreaks(String value) { - return text.replaceAll("\n", " "); + return hyphenLineBreaks.matcher(value).replaceAll(""); } - public static String removeRepeatingWhitespaces(String text) { + private String removeMultipleWhitespaces(String value) { - return text.replaceAll(" {2}", " "); + return doubleWhitespaces.matcher(value).replaceAll(" "); } + + private String removeLinebreaks(String value) { + + return linebreaks.matcher(value).replaceAll(" "); + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 619de30..0bac192 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -1,11 +1,14 @@ package com.knecon.fforesight.service.layoutparser.processor.visualization; import java.awt.Color; +import java.awt.geom.AffineTransform; import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.Collection; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; @@ -15,15 +18,19 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle; +import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle; import com.knecon.fforesight.service.viewerdoc.model.PlacedText; import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; @@ -43,6 +50,8 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { boolean active; + Map outlineObjectsWithoutPointsPerPage = new HashMap<>(); + public void addTextVisualizations(List textPositionSequences, int pageNumber) { @@ -151,7 +160,6 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } - public void addLineVisualizationsFromNestedTextPosition(Collection> lines, int pageNumber) { if (!active) { @@ -168,6 +176,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } + public void addTextBlockVisualizations(List textPageBlocks, int page) { if (!active) { @@ -254,4 +263,40 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } + public void addOutlineObjects(List outlineObjects, PageInformation pageInformation) { + + if (!active) { + return; + } + + for (OutlineObject outlineObject : outlineObjects) { + addOutlineObject(outlineObject, pageInformation); + } + } + + + private void addOutlineObject(OutlineObject outlineObject, PageInformation pageInformation) { + + int rectSize = 5; + + Point2D point2D; + if (outlineObject.getPoint().isPresent()) { + point2D = outlineObject.getPoint().get(); + } else { + int numberOfOutlineObjectsWithoutPoints = outlineObjectsWithoutPointsPerPage.computeIfAbsent(outlineObject.getPageNumber(), a -> new AtomicInteger(0)) + .getAndIncrement(); + point2D = new Point2D.Double(10, 10 + numberOfOutlineObjectsWithoutPoints * (10 + rectSize * 2)); + } + + Point2D textPoint = new Point2D.Double(point2D.getX() + 2 * rectSize, point2D.getY() + rectSize); + AffineTransform pageToUserSpaceTransform = CoordinateTransforms.calculatePageCoordsToInitialUserSpaceCoords(pageInformation); + pageToUserSpaceTransform.transform(point2D, point2D); + pageToUserSpaceTransform.transform(textPoint, textPoint); + + VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(outlineObject.getPageNumber(), outlineObjects); + visualizationsOnPage.getFilledRectangles() + .add(new FilledRectangle(new Rectangle2D.Double(point2D.getX() - rectSize, point2D.getY() - rectSize, rectSize * 2, rectSize * 2), OUTLINE_OBJECT_COLOR, 1)); + visualizationsOnPage.getPlacedTexts().add(PlacedText.textFacingUp(outlineObject.getTitle(), textPoint, 10, outlineObject.isFound() ? Color.BLACK : Color.RED, FONT)); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java index 439f33d..a8f39f6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java @@ -15,6 +15,7 @@ import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; @@ -72,6 +73,9 @@ public class LayoutGrid extends LayoutGridLayerConfig { public void addHeadline(Headline headline) { addAsRectangle(headline, headlines, HEADLINE_COLOR); + if (headline.getEngines().contains(LayoutEngine.OUTLINE)) { + addAsRectangle(headline, outlineHeadlines, HEADLINE_COLOR); + } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java index 818a99a..ec25267 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -84,17 +84,17 @@ public class OutlineDetectionTest extends AbstractTest { OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); assertEquals(outlineObjectTree.getRootNodes().size(), 8); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(1).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(3).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(2).size(), 1); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1); assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 2); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 3); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 2); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(10).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 4); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 1); - assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 3); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(9).size(), 2); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 4); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 1); + assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(14).size(), 2); assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values() .stream() .flatMap(Collection::stream) diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java index 0926878..2cca26c 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java @@ -40,6 +40,7 @@ public record LayerIdentifier(String name, String markedContentName) { public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES"); public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES"); public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs"); + public static final LayerIdentifier OUTLINE_HEADLINES = new LayerIdentifier("Outline Headlines", "OUTLINE_HEADLINES"); //layout grid debug public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT"); @@ -53,6 +54,7 @@ public record LayerIdentifier(String name, String markedContentName) { public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT"); public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS"); public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS"); + public static final LayerIdentifier OUTLINE_OBJECTS = new LayerIdentifier("Outline Positions", "OUTLINE_OBJECTS"); public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING"); diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java index ffbaf61..bb01cf0 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java @@ -30,6 +30,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6); protected static final Color CELLS_COLOR = new Color(31, 214, 27); + protected static final Color OUTLINE_OBJECT_COLOR = new Color(214, 27, 183); protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6); protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6); @@ -53,6 +54,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build(); protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build(); protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build(); + protected final Visualizations outlineObjects = Visualizations.builder().layer(LayerIdentifier.OUTLINE_OBJECTS).build(); public List getVisualizations() { @@ -66,7 +68,8 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { clean_rulings, // cells, // mainBody, // - markedContent // + markedContent, // + outlineObjects // ); } diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java index 7b95a3d..a959fe1 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java @@ -44,12 +44,12 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup { protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build(); protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build(); protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build(); - + protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build(); @Override public List getVisualizations() { - return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds); + return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds, outlineHeadlines); } } diff --git a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java index 618a8d6..83d6e1d 100644 --- a/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java +++ b/layoutparser-service/viewer-doc-processor/src/test/java/com/knecon/fforesight/service/viewerdoc/service/PageContentCleanerTest.java @@ -44,8 +44,8 @@ class PageContentCleanerTest { @SneakyThrows public void testContentCleaning() { - Path file = Path.of("/tmp/OCR_TEST/402Study.pdf/viewerDocument.pdf"); - File tmpFile = new File("/tmp/cleaned.pdf"); + Path file = Path.of("/home/kschuettler/Downloads/ITEM 23_Absorção cutanea.pdf"); + File tmpFile = new File("/tmp/ITEM 23_Absorção cutanea.pdf"); try (var in = new FileInputStream(file.toFile());// var doc = new PDFDoc(in);// var out = new FileOutputStream(tmpFile);// @@ -58,7 +58,7 @@ class PageContentCleanerTest { .writer(pageWriter) .reader(reader) .elementBuilder(builder) - .markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName())) + .markedContentToRemove(Set.of(LayerIdentifier.KNECON_LAYOUT.markedContentName())) .build(); try (PageIterator iterator = doc.getPageIterator()) {