RED-8156: add ocr debug layers to viewer document
This commit is contained in:
parent
2aaa53f441
commit
2bbc3775c5
@ -25,6 +25,6 @@ dependencies {
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.23.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.3.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.89.0")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
}
|
||||
|
||||
@ -25,11 +25,18 @@ public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, Qu
|
||||
.collect(Collectors.toMap(Map.Entry::getKey,
|
||||
entry -> entry.getValue()
|
||||
.stream()
|
||||
.map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords()
|
||||
.stream()
|
||||
.filter(word -> !word.isBlank())
|
||||
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
|
||||
.toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
|
||||
.map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
|
||||
.toList()));
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
return ocrResult.getAllWords()
|
||||
.stream()
|
||||
.filter(word -> !word.isBlank())
|
||||
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -9,6 +9,8 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
@ -18,10 +20,10 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
@ -43,23 +45,36 @@ public class OcrResultWriter {
|
||||
@SneakyThrows
|
||||
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
List<VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
|
||||
List<VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
|
||||
List<VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
|
||||
viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false);
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false);
|
||||
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
|
||||
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
|
||||
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
|
||||
|
||||
Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
||||
|
||||
List<Visualizations> debugVisualizations = List.of(visualizations,
|
||||
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
|
||||
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
|
||||
|
||||
viewerDocumentService.addVisualizationsOnPage(document, document, visualizations);
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
|
||||
}
|
||||
|
||||
|
||||
private List<VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
private Map<Integer, VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
|
||||
return imagesWithResultsPerPage.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber))));
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
private static Function<Integer, Integer> pageNumber1IdxTo0IdxMapper() {
|
||||
// PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index
|
||||
return p -> p - 1;
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<PlacedText> placedTexts = words.stream()
|
||||
@ -67,21 +82,23 @@ public class OcrResultWriter {
|
||||
null,
|
||||
Color.BLACK,
|
||||
(float) word.getFontSize(),
|
||||
word.getFont(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.NEITHER)))
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build();
|
||||
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
|
||||
}
|
||||
|
||||
|
||||
private List<VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
private Map<Integer, VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
|
||||
return imagesWithResultsPerPage.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber))));
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<PlacedText> placedTexts = words.stream()
|
||||
@ -89,28 +106,30 @@ public class OcrResultWriter {
|
||||
null,
|
||||
word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED,
|
||||
(float) word.getFontSize(),
|
||||
word.getFont(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)))
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build();
|
||||
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
|
||||
}
|
||||
|
||||
|
||||
private List<VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
private Map<Integer, VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
|
||||
return imagesWithResultsPerPage.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber))));
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<ColoredLine> coloredLines = Stream.concat(//
|
||||
words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),//
|
||||
ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)//
|
||||
).flatMap(Collection::stream).toList();
|
||||
return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build();
|
||||
return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -4,11 +4,12 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
public interface FontMetricsFactory {
|
||||
public interface FontMetricsFactory extends EmbeddableFont {
|
||||
|
||||
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {
|
||||
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.fontbox.ttf.GlyphData;
|
||||
@ -15,47 +13,62 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.s3.endpoints.internal.Value;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||
|
||||
private final PDType0Font type0Font;
|
||||
private final TrueTypeFont trueTypeFont;
|
||||
private final String resourcePath;
|
||||
private PDType0Font type0Font;
|
||||
private TrueTypeFont trueTypeFont;
|
||||
private PDDocument documentThisIsEmbeddedIn;
|
||||
|
||||
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
|
||||
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Type0FontMetricsFactory regular(PDDocument document) {
|
||||
|
||||
return createFromResource("fonts/cmu-regular.ttf", document);
|
||||
String resourcePath = "fonts/cmu-regular.ttf";
|
||||
return createFromResourcePath(resourcePath, document);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Type0FontMetricsFactory bold(PDDocument document) {
|
||||
|
||||
return createFromResource("fonts/cmu-bold.ttf", document);
|
||||
String resourcePath = "fonts/cmu-bold.ttf";
|
||||
return createFromResourcePath(resourcePath, document);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@SuppressWarnings("PMD.CloseResource")
|
||||
// Todo i think this is not ok to never close the font...
|
||||
private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
|
||||
private static TrueTypeFont readFromResourcePath(String resourcePath) {
|
||||
|
||||
// The ttf is closed with the document, see PDType0Font line 134
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
|
||||
TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||
PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
|
||||
return new Type0FontMetricsFactory(type0Font, trueTypeFont);
|
||||
return new TTFParser().parse(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {
|
||||
|
||||
TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
|
||||
// since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||
return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public HeightAndDescent calculateHeightAndDescent(String text) {
|
||||
|
||||
@ -99,4 +112,28 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||
return type0Font;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public PDFont embed(PDDocument document) {
|
||||
|
||||
if (documentThisIsEmbeddedIn.equals(document)) {
|
||||
return getFont();
|
||||
}
|
||||
|
||||
// no need to close, the font will be closed with the document it is embedded in
|
||||
|
||||
this.trueTypeFont = readFromResourcePath(resourcePath);
|
||||
this.type0Font = PDType0Font.load(document, trueTypeFont, true);
|
||||
this.documentThisIsEmbeddedIn = document;
|
||||
return getFont();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void close() {
|
||||
|
||||
trueTypeFont.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -49,11 +49,11 @@ public class FontStyleDetector {
|
||||
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
|
||||
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
|
||||
* <p>
|
||||
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
|
||||
* But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
|
||||
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation.
|
||||
* But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
|
||||
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
|
||||
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
|
||||
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
|
||||
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me.
|
||||
*/
|
||||
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {
|
||||
|
||||
|
||||
@ -58,6 +58,7 @@ public class OcrMessageReceiver {
|
||||
|
||||
setStatusOcrProcessing(dossierId, fileId);
|
||||
|
||||
tmpDir.toFile().mkdirs();
|
||||
File documentFile = tmpDir.resolve("document.pdf").toFile();
|
||||
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();
|
||||
|
||||
|
||||
@ -11,5 +11,5 @@ commit_hash=$(git rev-parse --short=5 HEAD)
|
||||
# Combine branch and commit hash
|
||||
buildName="${USER}-${branch}-${commit_hash}"
|
||||
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName
|
||||
echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user