From 2bbc3775c53b117b4995ad77d681431859f9ac22 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 7 Feb 2024 11:31:40 +0100 Subject: [PATCH] RED-8156: add ocr debug layers to viewer document --- .../ocr-service-processor/build.gradle.kts | 2 +- .../ocr/processor/model/OcrResultToWrite.java | 17 +++-- .../processor/service/OcrResultWriter.java | 63 ++++++++++++------- .../service/fonts/FontMetricsFactory.java | 3 +- .../fonts/Type0FontMetricsFactory.java | 61 ++++++++++++++---- .../scriptdetection/FontStyleDetector.java | 6 +- .../v1/server/queue/OcrMessageReceiver.java | 1 + publish-custom-image.sh | 2 +- 8 files changed, 110 insertions(+), 45 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/build.gradle.kts b/ocr-service-v1/ocr-service-processor/build.gradle.kts index 9b61c1d..d1f4f8f 100644 --- a/ocr-service-v1/ocr-service-processor/build.gradle.kts +++ b/ocr-service-v1/ocr-service-processor/build.gradle.kts @@ -25,6 +25,6 @@ dependencies { api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.google.guava:guava:31.1-jre") api("com.iqser.red.commons:pdftron-logic-commons:2.23.0") - api("com.knecon.fforesight:viewer-doc-processor:0.3.0") + api("com.knecon.fforesight:viewer-doc-processor:0.89.0") testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java index ccbd45a..3a4c10b 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java @@ -25,11 +25,18 @@ public record OcrResultToWrite(List textPositionInImage, Qu .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue() .stream() - .map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords() - .stream() - .filter(word -> !word.isBlank()) - .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR)) - .toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace())) + .map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace())) .toList())); } + + + private static List toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) { + + return ocrResult.getAllWords() + .stream() + .filter(word -> !word.isBlank()) + .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR)) + .toList(); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index 1292859..4bae7b4 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -9,6 +9,8 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; @@ -18,10 +20,10 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; -import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.viewerdoc.ContentStreams; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; @@ -43,23 +45,36 @@ public class OcrResultWriter { @SneakyThrows public void drawOcrResultsToPdf(File document, File viewerDocument, Map> imagesWithResultsPerPage) { - List ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage); - List ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage); - List ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage); - viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); - viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); - viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false); - viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false); + Map ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage); + Map ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage); + Map ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage); + + Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); + + List debugVisualizations = List.of(visualizations, + new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false), + new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false)); + + viewerDocumentService.addVisualizationsOnPage(document, document, visualizations); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations); } - private List createVisualizations(Map> imagesWithResultsPerPage) { + private Map createVisualizations(Map> imagesWithResultsPerPage) { - return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + return imagesWithResultsPerPage.keySet() + .stream() + .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber)))); } - private VisualizationsOnPage createVisualizations(Integer pageNumber, List ocrResultsToWrite) { + private static Function pageNumber1IdxTo0IdxMapper() { + // PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index + return p -> p - 1; + } + + + private VisualizationsOnPage createVisualizations(List ocrResultsToWrite) { List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); List placedTexts = words.stream() @@ -67,21 +82,23 @@ public class OcrResultWriter { null, Color.BLACK, (float) word.getFontSize(), - word.getFont(), + word.getFontMetricsFactory(), Optional.of(word.getTextMatrix()), Optional.of(RenderingMode.NEITHER))) .toList(); - return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build(); + return VisualizationsOnPage.builder().placedTexts(placedTexts).build(); } - private List createDebugTextVisualizations(Map> imagesWithResultsPerPage) { + private Map createDebugTextVisualizations(Map> imagesWithResultsPerPage) { - return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + return imagesWithResultsPerPage.keySet() + .stream() + .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber)))); } - private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List ocrResultsToWrite) { + private VisualizationsOnPage createDebugTextVisualizations(List ocrResultsToWrite) { List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); List placedTexts = words.stream() @@ -89,28 +106,30 @@ public class OcrResultWriter { null, word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED, (float) word.getFontSize(), - word.getFont(), + word.getFontMetricsFactory(), Optional.of(word.getTextMatrix()), Optional.of(RenderingMode.FILL))) .toList(); - return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build(); + return VisualizationsOnPage.builder().placedTexts(placedTexts).build(); } - private List createDebugBBoxVisualizations(Map> imagesWithResultsPerPage) { + private Map createDebugBBoxVisualizations(Map> imagesWithResultsPerPage) { - return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + return imagesWithResultsPerPage.keySet() + .stream() + .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)))); } - private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List ocrResultsToWrite) { + private VisualizationsOnPage createDebugBBoxVisualizations(List ocrResultsToWrite) { List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); List coloredLines = Stream.concat(// words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),// ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)// ).flatMap(Collection::stream).toList(); - return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build(); + return VisualizationsOnPage.builder().coloredLines(coloredLines).build(); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java index 039b217..a944e6d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java @@ -4,11 +4,12 @@ import org.apache.pdfbox.pdmodel.font.PDFont; import com.knecon.fforesight.service.ocr.processor.model.FontMetrics; import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent; +import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -public interface FontMetricsFactory { +public interface FontMetricsFactory extends EmbeddableFont { default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java index 059daed..b71f646 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java @@ -1,8 +1,6 @@ package com.knecon.fforesight.service.ocr.processor.service.fonts; import java.io.ByteArrayInputStream; -import java.util.Collections; -import java.util.List; import java.util.Set; import org.apache.fontbox.ttf.GlyphData; @@ -15,47 +13,62 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font; import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent; +import lombok.AllArgsConstructor; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import software.amazon.awssdk.services.s3.endpoints.internal.Value; @Slf4j @RequiredArgsConstructor +@AllArgsConstructor public class Type0FontMetricsFactory implements FontMetricsFactory { - private final PDType0Font type0Font; - private final TrueTypeFont trueTypeFont; + private final String resourcePath; + private PDType0Font type0Font; + private TrueTypeFont trueTypeFont; + private PDDocument documentThisIsEmbeddedIn; // for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent. private static final Set slashGlyphIds = Set.of(18, 63); + @SneakyThrows public static Type0FontMetricsFactory regular(PDDocument document) { - return createFromResource("fonts/cmu-regular.ttf", document); + String resourcePath = "fonts/cmu-regular.ttf"; + return createFromResourcePath(resourcePath, document); } + @SneakyThrows public static Type0FontMetricsFactory bold(PDDocument document) { - return createFromResource("fonts/cmu-bold.ttf", document); + String resourcePath = "fonts/cmu-bold.ttf"; + return createFromResourcePath(resourcePath, document); } @SneakyThrows @SuppressWarnings("PMD.CloseResource") - // Todo i think this is not ok to never close the font... - private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) { + private static TrueTypeFont readFromResourcePath(String resourcePath) { + // The ttf is closed with the document, see PDType0Font line 134 try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) { - TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information - PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support - return new Type0FontMetricsFactory(type0Font, trueTypeFont); + return new TTFParser().parse(buffer); } } + @SneakyThrows + private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) { + + TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath); + // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information + return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support) + + } + + @SneakyThrows public HeightAndDescent calculateHeightAndDescent(String text) { @@ -99,4 +112,28 @@ public class Type0FontMetricsFactory implements FontMetricsFactory { return type0Font; } + + @Override + @SneakyThrows + public PDFont embed(PDDocument document) { + + if (documentThisIsEmbeddedIn.equals(document)) { + return getFont(); + } + + // no need to close, the font will be closed with the document it is embedded in + + this.trueTypeFont = readFromResourcePath(resourcePath); + this.type0Font = PDType0Font.load(document, trueTypeFont, true); + this.documentThisIsEmbeddedIn = document; + return getFont(); + } + + + @SneakyThrows + public void close() { + + trueTypeFont.close(); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java index d12af43..4f85ac9 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java @@ -49,11 +49,11 @@ public class FontStyleDetector { * (Opening (Morphology)). * We then threshold the ratio of remaining pixels to determine whether a word is bold or not. *

- * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size. - * But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height. + * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation. + * But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height. * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math. * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case. - * It seems it scales with the square root of the text height. Or at least this seemed to give the best results. + * It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me. */ public Map> detectBold(List ocrResults, PDDocument document) { diff --git a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java index da3d0e2..0d3f46d 100644 --- a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java @@ -58,6 +58,7 @@ public class OcrMessageReceiver { setStatusOcrProcessing(dossierId, fileId); + tmpDir.toFile().mkdirs(); File documentFile = tmpDir.resolve("document.pdf").toFile(); File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile(); diff --git a/publish-custom-image.sh b/publish-custom-image.sh index 80ecc4e..9afa975 100755 --- a/publish-custom-image.sh +++ b/publish-custom-image.sh @@ -11,5 +11,5 @@ commit_hash=$(git rev-parse --short=5 HEAD) # Combine branch and commit hash buildName="${USER}-${branch}-${commit_hash}" -gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache +gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"