RED-8156: add ocr debug layers to viewer document

2024-02-07 11:31:40 +01:00 · 2024-02-07 11:31:40 +01:00 · 2bbc3775c5
commit 2bbc3775c5
parent 2aaa53f441
8 changed files with 110 additions and 45 deletions
--- a/ocr-service-v1/ocr-service-processor/build.gradle.kts
+++ b/ocr-service-v1/ocr-service-processor/build.gradle.kts
@ -25,6 +25,6 @@ dependencies {
    api("com.amazonaws:aws-java-sdk-kms:1.12.440")
    api("com.google.guava:guava:31.1-jre")
    api("com.iqser.red.commons:pdftron-logic-commons:2.23.0")
-    api("com.knecon.fforesight:viewer-doc-processor:0.3.0")
+    api("com.knecon.fforesight:viewer-doc-processor:0.89.0")
    testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
 }
--- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java
+++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java
@ -25,11 +25,18 @@ public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, Qu
                .collect(Collectors.toMap(Map.Entry::getKey,
                        entry -> entry.getValue()
                                .stream()
-                                .map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords()
-                                        .stream()
-                                        .filter(word -> !word.isBlank())
-                                        .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
-                                        .toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
+                                .map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
                                .toList()));
    }
+
+
+    private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
+
+        return ocrResult.getAllWords()
+                .stream()
+                .filter(word -> !word.isBlank())
+                .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
+                .toList();
+    }
+
 }
--- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java
+++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java
@ -9,6 +9,8 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
+import java.util.function.Function;
+import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
@ -18,10 +20,10 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
 import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
 import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
 import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
-import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
 import com.knecon.fforesight.service.viewerdoc.ContentStreams;
 import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
 import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
+import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
 import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
 import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;

@ -43,23 +45,36 @@ public class OcrResultWriter {
    @SneakyThrows
    public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {

-        List<VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
-        List<VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
-        List<VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
-        viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
-        viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
-        viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false);
-        viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false);
+        Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
+        Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
+        Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
+
+        Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
+
+        List<Visualizations> debugVisualizations = List.of(visualizations,
+                new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
+                new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
+
+        viewerDocumentService.addVisualizationsOnPage(document, document, visualizations);
+        viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
    }


-    private List<VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
+    private Map<Integer, VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {

-        return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
+        return imagesWithResultsPerPage.keySet()
+                .stream()
+                .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber))));
    }


-    private VisualizationsOnPage createVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
+    private static Function<Integer, Integer> pageNumber1IdxTo0IdxMapper() {
+        // PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index
+        return p -> p - 1;
+    }
+
+
+    private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {

        List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
        List<PlacedText> placedTexts = words.stream()
@ -67,21 +82,23 @@ public class OcrResultWriter {
                        null,
                        Color.BLACK,
                        (float) word.getFontSize(),
-                        word.getFont(),
+                        word.getFontMetricsFactory(),
                        Optional.of(word.getTextMatrix()),
                        Optional.of(RenderingMode.NEITHER)))
                .toList();
-        return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build();
+        return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
    }


-    private List<VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
+    private Map<Integer, VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {

-        return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
+        return imagesWithResultsPerPage.keySet()
+                .stream()
+                .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber))));
    }


-    private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
+    private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {

        List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
        List<PlacedText> placedTexts = words.stream()
@ -89,28 +106,30 @@ public class OcrResultWriter {
                        null,
                        word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED,
                        (float) word.getFontSize(),
-                        word.getFont(),
+                        word.getFontMetricsFactory(),
                        Optional.of(word.getTextMatrix()),
                        Optional.of(RenderingMode.FILL)))
                .toList();
-        return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build();
+        return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
    }


-    private List<VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
+    private Map<Integer, VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {

-        return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
+        return imagesWithResultsPerPage.keySet()
+                .stream()
+                .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber))));
    }


-    private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
+    private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {

        List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
        List<ColoredLine> coloredLines = Stream.concat(//
                words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),//
                ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)//
        ).flatMap(Collection::stream).toList();
-        return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build();
+        return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
    }


--- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java
+++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java
@ -4,11 +4,12 @@ import org.apache.pdfbox.pdmodel.font.PDFont;

 import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
 import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
+import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;

 import lombok.SneakyThrows;
 import lombok.extern.slf4j.Slf4j;

-public interface FontMetricsFactory {
+public interface FontMetricsFactory extends EmbeddableFont {

    default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {

--- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java
+++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java
@ -1,8 +1,6 @@
 package com.knecon.fforesight.service.ocr.processor.service.fonts;

 import java.io.ByteArrayInputStream;
-import java.util.Collections;
-import java.util.List;
 import java.util.Set;

 import org.apache.fontbox.ttf.GlyphData;
@ -15,47 +13,62 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;

 import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;

+import lombok.AllArgsConstructor;
 import lombok.RequiredArgsConstructor;
 import lombok.SneakyThrows;
 import lombok.extern.slf4j.Slf4j;
-import software.amazon.awssdk.services.s3.endpoints.internal.Value;

@Slf4j
@RequiredArgsConstructor
+@AllArgsConstructor
 public class Type0FontMetricsFactory implements FontMetricsFactory {

-    private final PDType0Font type0Font;
-    private final TrueTypeFont trueTypeFont;
+    private final String resourcePath;
+    private PDType0Font type0Font;
+    private TrueTypeFont trueTypeFont;
+    private PDDocument documentThisIsEmbeddedIn;

    // for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
    private static final Set<Integer> slashGlyphIds = Set.of(18, 63);


+    @SneakyThrows
    public static Type0FontMetricsFactory regular(PDDocument document) {

-        return createFromResource("fonts/cmu-regular.ttf", document);
+        String resourcePath = "fonts/cmu-regular.ttf";
+        return createFromResourcePath(resourcePath, document);
    }


+    @SneakyThrows
    public static Type0FontMetricsFactory bold(PDDocument document) {

-        return createFromResource("fonts/cmu-bold.ttf", document);
+        String resourcePath = "fonts/cmu-bold.ttf";
+        return createFromResourcePath(resourcePath, document);
    }


    @SneakyThrows
    @SuppressWarnings("PMD.CloseResource")
-    // Todo i think this is not ok to never close the font...
-    private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
+    private static TrueTypeFont readFromResourcePath(String resourcePath) {

+        // The ttf is closed with the document, see PDType0Font line 134
        try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
-            TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
-            PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
-            return new Type0FontMetricsFactory(type0Font, trueTypeFont);
+            return new TTFParser().parse(buffer);
        }
    }


+    @SneakyThrows
+    private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {
+
+        TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
+        // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
+        return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support
+
+    }
+
+
    @SneakyThrows
    public HeightAndDescent calculateHeightAndDescent(String text) {

@ -99,4 +112,28 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
        return type0Font;
    }

+
+    @Override
+    @SneakyThrows
+    public PDFont embed(PDDocument document) {
+
+        // documentThisIsEmbeddedIn is still null when only the resourcePath constructor was used, so guard before comparing
+        if (documentThisIsEmbeddedIn != null && documentThisIsEmbeddedIn.equals(document)) {
+            return getFont();
+        }
+        // no need to close, the font will be closed with the document it is embedded in
+        this.trueTypeFont = readFromResourcePath(resourcePath);
+        this.type0Font = PDType0Font.load(document, trueTypeFont, true);
+        this.documentThisIsEmbeddedIn = document;
+        return getFont();
+    }
+
+
+    @SneakyThrows
+    public void close() {
+        if (trueTypeFont != null) { // null until a font has been loaded (resourcePath-only constructor, embed not yet called)
+            trueTypeFont.close();
+        }
+    }
+
 }
--- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java
+++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java
@ -49,11 +49,11 @@ public class FontStyleDetector {
     * (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
     * We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
     * <p>
-     * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
-     * But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
+     * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation.
+     * But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
     * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
     * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
-     * It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
+     * It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me.
     */
    public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {

--- a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java
+++ b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java
@ -58,6 +58,7 @@ public class OcrMessageReceiver {

            setStatusOcrProcessing(dossierId, fileId);

+            tmpDir.toFile().mkdirs();
            File documentFile = tmpDir.resolve("document.pdf").toFile();
            File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();

--- a/publish-custom-image.sh
+++ b/publish-custom-image.sh
@ -11,5 +11,5 @@ commit_hash=$(git rev-parse --short=5 HEAD)
 # Combine branch and commit hash
 buildName="${USER}-${branch}-${commit_hash}"

-gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
+gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName
 echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"