RED-8156: add ocr debug layers to viewer document

This commit is contained in:
Kilian Schuettler 2024-02-07 11:31:40 +01:00
parent 2aaa53f441
commit 2bbc3775c5
8 changed files with 110 additions and 45 deletions

View File

@ -25,6 +25,6 @@ dependencies {
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.23.0")
api("com.knecon.fforesight:viewer-doc-processor:0.3.0")
api("com.knecon.fforesight:viewer-doc-processor:0.89.0")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
}

View File

@ -25,11 +25,18 @@ public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, Qu
.collect(Collectors.toMap(Map.Entry::getKey,
entry -> entry.getValue()
.stream()
.map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords()
.stream()
.filter(word -> !word.isBlank())
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
.toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
.map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
.toList()));
}
private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
return ocrResult.getAllWords()
.stream()
.filter(word -> !word.isBlank())
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
.toList();
}
}

View File

@ -9,6 +9,8 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
@ -18,10 +20,10 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
@ -43,23 +45,36 @@ public class OcrResultWriter {
@SneakyThrows
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
List<VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
List<VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
List<VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false);
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false);
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
List<Visualizations> debugVisualizations = List.of(visualizations,
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
viewerDocumentService.addVisualizationsOnPage(document, document, visualizations);
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
}
private List<VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
private Map<Integer, VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
return imagesWithResultsPerPage.keySet()
.stream()
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber))));
}
private VisualizationsOnPage createVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
private static Function<Integer, Integer> pageNumber1IdxTo0IdxMapper() {
// PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index
return p -> p - 1;
}
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
List<PlacedText> placedTexts = words.stream()
@ -67,21 +82,23 @@ public class OcrResultWriter {
null,
Color.BLACK,
(float) word.getFontSize(),
word.getFont(),
word.getFontMetricsFactory(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.NEITHER)))
.toList();
return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build();
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
}
private List<VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
private Map<Integer, VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
return imagesWithResultsPerPage.keySet()
.stream()
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber))));
}
private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
List<PlacedText> placedTexts = words.stream()
@ -89,28 +106,30 @@ public class OcrResultWriter {
null,
word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED,
(float) word.getFontSize(),
word.getFont(),
word.getFontMetricsFactory(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.FILL)))
.toList();
return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build();
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
}
private List<VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
private Map<Integer, VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
return imagesWithResultsPerPage.keySet()
.stream()
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber))));
}
private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
List<ColoredLine> coloredLines = Stream.concat(//
words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),//
ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)//
).flatMap(Collection::stream).toList();
return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build();
return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
}

View File

@ -4,11 +4,12 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
public interface FontMetricsFactory {
public interface FontMetricsFactory extends EmbeddableFont {
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {

View File

@ -1,8 +1,6 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
import java.io.ByteArrayInputStream;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.apache.fontbox.ttf.GlyphData;
@ -15,47 +13,62 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import software.amazon.awssdk.services.s3.endpoints.internal.Value;
@Slf4j
@RequiredArgsConstructor
@AllArgsConstructor
public class Type0FontMetricsFactory implements FontMetricsFactory {
private final PDType0Font type0Font;
private final TrueTypeFont trueTypeFont;
private final String resourcePath;
private PDType0Font type0Font;
private TrueTypeFont trueTypeFont;
private PDDocument documentThisIsEmbeddedIn;
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
@SneakyThrows
public static Type0FontMetricsFactory regular(PDDocument document) {
return createFromResource("fonts/cmu-regular.ttf", document);
String resourcePath = "fonts/cmu-regular.ttf";
return createFromResourcePath(resourcePath, document);
}
@SneakyThrows
public static Type0FontMetricsFactory bold(PDDocument document) {
return createFromResource("fonts/cmu-bold.ttf", document);
String resourcePath = "fonts/cmu-bold.ttf";
return createFromResourcePath(resourcePath, document);
}
@SneakyThrows
@SuppressWarnings("PMD.CloseResource")
// Todo i think this is not ok to never close the font...
private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
private static TrueTypeFont readFromResourcePath(String resourcePath) {
// The ttf is closed with the document, see PDType0Font line 134
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
return new Type0FontMetricsFactory(type0Font, trueTypeFont);
return new TTFParser().parse(buffer);
}
}
@SneakyThrows
private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {
TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
// since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support)
}
@SneakyThrows
public HeightAndDescent calculateHeightAndDescent(String text) {
@ -99,4 +112,28 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
return type0Font;
}
@Override
@SneakyThrows
public PDFont embed(PDDocument document) {
if (documentThisIsEmbeddedIn.equals(document)) {
return getFont();
}
// no need to close, the font will be closed with the document it is embedded in
this.trueTypeFont = readFromResourcePath(resourcePath);
this.type0Font = PDType0Font.load(document, trueTypeFont, true);
this.documentThisIsEmbeddedIn = document;
return getFont();
}
@SneakyThrows
public void close() {
trueTypeFont.close();
}
}

View File

@ -49,11 +49,11 @@ public class FontStyleDetector {
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
* <p>
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
* But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation.
* But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me.
*/
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {

View File

@ -58,6 +58,7 @@ public class OcrMessageReceiver {
setStatusOcrProcessing(dossierId, fileId);
tmpDir.toFile().mkdirs();
File documentFile = tmpDir.resolve("document.pdf").toFile();
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();

View File

@ -11,5 +11,5 @@ commit_hash=$(git rev-parse --short=5 HEAD)
# Combine branch and commit hash
buildName="${USER}-${branch}-${commit_hash}"
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName
echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"