From 2aaa53f441da7e9cc609dcaa7073db792141249d Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Mon, 5 Feb 2024 18:28:19 +0100 Subject: [PATCH 1/3] RED-8156: add debug layers to viewer document * wip, fonts need to be created in the original document --- .../ocr-service-processor/build.gradle.kts | 1 + .../OcrServiceProcessorConfiguration.java | 12 + .../processor/service/FileStorageService.java | 57 ++-- .../ocr/processor/service/OCRService.java | 87 +++--- .../processor/service/OcrResultWriter.java | 282 ++++++------------ .../v1/server/queue/OcrMessageReceiver.java | 49 +-- .../v1/server/OcrServiceIntegrationTest.java | 43 ++- 7 files changed, 228 insertions(+), 303 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/build.gradle.kts b/ocr-service-v1/ocr-service-processor/build.gradle.kts index 8269636..9b61c1d 100644 --- a/ocr-service-v1/ocr-service-processor/build.gradle.kts +++ b/ocr-service-v1/ocr-service-processor/build.gradle.kts @@ -25,5 +25,6 @@ dependencies { api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.google.guava:guava:31.1-jre") api("com.iqser.red.commons:pdftron-logic-commons:2.23.0") + api("com.knecon.fforesight:viewer-doc-processor:0.3.0") testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java index 2a66bfd..345e86a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java @@ -1,14 +1,26 @@ package com.knecon.fforesight.service.ocr.processor; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; + +import io.micrometer.observation.ObservationRegistry; @Configuration @ComponentScan @EnableConfigurationProperties(OcrServiceSettings.class) public class OcrServiceProcessorConfiguration { + @Bean + @Autowired + public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) { + + return new ViewerDocumentService(registry); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java index c8161d6..c21e6b4 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java @@ -1,13 +1,11 @@ package com.knecon.fforesight.service.ocr.processor.service; -import java.io.ByteArrayInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.InputStream; import java.nio.file.Files; -import java.nio.file.Paths; -import java.nio.file.StandardOpenOption; +import java.nio.file.StandardCopyOption; -import org.apache.commons.io.IOUtils; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; @@ -31,47 +29,38 @@ public class FileStorageService { return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension(); } - - @SneakyThrows - public byte[] getOriginalFile(String dossierId, String fileId) { - - try (InputStream inputStream = getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN))) { - return IOUtils.toByteArray(inputStream); - } - } - - - @SneakyThrows - public InputStream getOriginalFileAsStream(String dossierId, String fileId) { - - return getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN)); - } - - - public void storeOriginalFile(String dossierId, String fileId, InputStream stream) { - - storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), stream); - } - - public boolean untouchedFileExists(String dossierId, String fileId) { return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED)); } + @SneakyThrows + public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) { - public void storeUntouchedFile(String dossierId, String fileId, byte[] data) { - - storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data)); + try (var in = new FileInputStream(documentFile)) { + storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), in); + } + try (var in = new FileInputStream(viewerDocumentFile)) { + storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), in); + } } @SneakyThrows - private InputStream getInputStream(String storageId) { + public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) { - File tempFile = File.createTempFile("temp", ".data"); - storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile); - return Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE); + storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile); + if (storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT))) { + storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile); + } else { + Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + + if (!untouchedFileExists(dossierId, fileId)) { + try (var in = new FileInputStream(documentFile)) { + storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), in); + } + } } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index b859051..9949d3d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -1,12 +1,12 @@ package com.knecon.fforesight.service.ocr.processor.service; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -27,6 +27,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult; import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.pdftron.pdf.PDFDoc; import io.micrometer.observation.ObservationRegistry; import io.micrometer.observation.annotation.Observed; @@ -58,55 +59,66 @@ public class OCRService { * looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead), * perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements. * - * @param dossierId Id of dossier - * @param fileId Id of file - * @param out OutputStream where to write to + * @param dossierId Id of dossier + * @param fileId Id of file + * @param tmpDir working directory for all files + * @param documentFile the file to perform ocr on, results are written invisibly + * @param viewerDocumentFile debugging file, results are written visibly in an optional content group */ @Observed(name = "OCRService", contextualName = "run-ocr-on-document") @SneakyThrows - public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) { - - try (InputStream fileStream = removeWatermarkIfEnabled(dossierId, fileId); ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { - - invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false, false); - - try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { - log.info("Starting OCR for file {}", fileId); - long ocrStart = System.currentTimeMillis(); - Statistics stats = runOcr(transferInputStream, out, fileId, dossierId); - long ocrEnd = System.currentTimeMillis(); - log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0)); - log.info("Runtime breakdown: {}", stats); - } - } - } - - - private InputStream removeWatermarkIfEnabled(String dossierId, String fileId) throws IOException { + public void runOcrOnDocument(String dossierId, String fileId, Path tmpDir, File documentFile, File viewerDocumentFile) { if (settings.isRemoveWatermark()) { - try (var in = fileStorageService.getOriginalFileAsStream(dossierId, fileId); var transferOutputStream = new ByteArrayOutputStream()) { - watermarkRemovalService.removeWatermarks(in, transferOutputStream); - return new ByteArrayInputStream(transferOutputStream.toByteArray()); - } + removeWatermarkIfEnabled(documentFile); } - return fileStorageService.getOriginalFileAsStream(dossierId, fileId); + removeInvisibleElements(documentFile); + + log.info("Starting OCR for file {}", fileId); + long ocrStart = System.currentTimeMillis(); + Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId); + long ocrEnd = System.currentTimeMillis(); + log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0)); + log.info("Runtime breakdown: {}", stats); + } @SneakyThrows - public Statistics runOcr(InputStream in, OutputStream out, String fileId, String dossierId) { + private void removeInvisibleElements(File originFile) { + + Path tmpFile = Files.createTempFile("invisibleElements", ".pdf"); + try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) { + invisibleElementRemovalService.removeInvisibleElements(in, out, false, false); + } + Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + assert tmpFile.toFile().delete(); + } + + + @SneakyThrows + private void removeWatermarkIfEnabled(File originFile) { + + Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf"); + try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) { + watermarkRemovalService.removeWatermarks(in, out); + } + Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + assert tmpFile.toFile().delete(); + } + + + @SneakyThrows + public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) { long timestamp; - Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId); + Path tmpImageDir = tmpDir.resolve("images"); Path tesseractOutputDir = tmpDir.resolve("tesseract_output"); tesseractOutputDir.toFile().mkdirs(); tmpImageDir.toFile().mkdirs(); - File documentFile = OsUtils.writeFileToTmpFolder(in, tmpDir); - Statistics stats; try (PDDocument document = Loader.loadPDF(documentFile)) { OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId); @@ -150,12 +162,11 @@ public class OCRService { stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp); timestamp = System.currentTimeMillis(); - var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage); + ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage); + log.info("Saving document"); - document.saveIncremental(out, dictionariesToUpdate); stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp); - FileSystemUtils.deleteRecursively(tmpDir); logger.sendFinished(); return stats; } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index 30e2ee7..1292859 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -1,29 +1,29 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.awt.Color; +import java.awt.geom.Line2D; import java.awt.geom.Point2D; +import java.io.File; import java.util.Collection; -import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Set; +import java.util.Optional; +import java.util.stream.Stream; -import org.apache.pdfbox.cos.COSDictionary; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentCatalog; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; @@ -37,180 +37,97 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class OcrResultWriter { - static String ocrLayerName = "knecon OCR"; - OcrServiceSettings settings; + ViewerDocumentService viewerDocumentService; @SneakyThrows - public Set drawOcrResultsToPdf(PDDocument document, Map> imagesWithResultsPerPage) { + public void drawOcrResultsToPdf(File document, File viewerDocument, Map> imagesWithResultsPerPage) { - Set dictionariesToUpdate = new HashSet<>(); - imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate)); - dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject()); - return dictionariesToUpdate; + List ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage); + List ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage); + List ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage); + viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false); + } + + + private List createVisualizations(Map> imagesWithResultsPerPage) { + + return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + } + + + private VisualizationsOnPage createVisualizations(Integer pageNumber, List ocrResultsToWrite) { + + List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); + List placedTexts = words.stream() + .map(word -> new PlacedText(word.getText(), + null, + Color.BLACK, + (float) word.getFontSize(), + word.getFont(), + Optional.of(word.getTextMatrix()), + Optional.of(RenderingMode.NEITHER))) + .toList(); + return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build(); + } + + + private List createDebugTextVisualizations(Map> imagesWithResultsPerPage) { + + return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + } + + + private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List ocrResultsToWrite) { + + List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); + List placedTexts = words.stream() + .map(word -> new PlacedText(word.getText(), + null, + word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED, + (float) word.getFontSize(), + word.getFont(), + Optional.of(word.getTextMatrix()), + Optional.of(RenderingMode.FILL))) + .toList(); + return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build(); + } + + + private List createDebugBBoxVisualizations(Map> imagesWithResultsPerPage) { + + return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + } + + + private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List ocrResultsToWrite) { + + List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); + List coloredLines = Stream.concat(// + words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),// + ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)// + ).flatMap(Collection::stream).toList(); + return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build(); + } + + + private List quadPointAsLines(QuadPoint rect) { + + return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1), + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1), + new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1)); } @SneakyThrows - private void drawResultsPerPage(PDDocument document, Integer pageNumber, List ocrResultToWrite, Set dictionariesToUpdate) { + private List createGrid(QuadPoint rect) { - var pdPage = document.getPage(pageNumber - 1); + List lines = new LinkedList<>(quadPointAsLines(rect)); - PDOptionalContentGroup textDebugLayer = new PDOptionalContentGroup(ocrLayerName); - PDOptionalContentGroup bBoxDebugLayer = new PDOptionalContentGroup(ocrLayerName + "BBox"); - if (settings.isDebug()) { - textDebugLayer = addOptionalGroup(ocrLayerName, document, pdPage, dictionariesToUpdate); - bBoxDebugLayer = addOptionalGroup(ocrLayerName + " BBox", document, pdPage, dictionariesToUpdate); - } - - escapeContentStreams(document, pdPage); - - List words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); - try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) { - - // write invisible ocr text inside tagged content - contentStream.beginMarkedContent(settings.getOcrMarkedContentTag()); - contentStream.saveGraphicsState(); - contentStream.setNonStrokingColor(Color.BLUE); - contentStream.setStrokingColor(Color.BLUE); - contentStream.setLineWidth(1); - words.forEach(word -> drawInvisibleWord(word, contentStream)); - contentStream.restoreGraphicsState(); - contentStream.endMarkedContent(); - - if (settings.isDebug()) { // must not be written, as it will interfere with layout parsing - // write visible ocr text inside optional group - contentStream.beginMarkedContent(COSName.OC, textDebugLayer); - contentStream.saveGraphicsState(); - words.forEach(word -> drawVisibleWord(word, contentStream)); - contentStream.restoreGraphicsState(); - contentStream.endMarkedContent(); - - // write word bounding boxes (tesseract output) inside optional group - contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer); - contentStream.saveGraphicsState(); - ocrResultToWrite.stream() - .map(OcrResultToWrite::imageBoundingBox) - .forEach(imagePosition -> drawGrid(contentStream, imagePosition)); - words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word)); - contentStream.restoreGraphicsState(); - contentStream.endMarkedContent(); - } - } - dictionariesToUpdate.add(pdPage.getCOSObject()); - dictionariesToUpdate.add(pdPage.getResources().getCOSObject()); - } - - - @SneakyThrows - private static void escapeContentStreams(PDDocument document, PDPage pdPage) { - // We need to append to the contentstream, otherwise the content could be overlapped by images - // But we also need to save the graphics state before, such that our appended content cannot be affected by previous contentstreams with side-effects, such as not escaped matrix transformations - try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) { - contentStream.saveGraphicsState(); - } - try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, false)) { - contentStream.restoreGraphicsState(); - } - } - - - private PDOptionalContentGroup addOptionalGroup(String ocrLayerName, PDDocument document, PDPage pdPage, Set dictionariesToUpdate) { - - PDDocumentCatalog catalog = document.getDocumentCatalog(); - PDOptionalContentProperties ocprops = catalog.getOCProperties(); - if (ocprops == null) { - ocprops = new PDOptionalContentProperties(); - catalog.setOCProperties(ocprops); - } - PDOptionalContentGroup layer = null; - if (ocprops.hasGroup(ocrLayerName)) { - layer = ocprops.getGroup(ocrLayerName); - } else { - layer = new PDOptionalContentGroup(ocrLayerName); - ocprops.addGroup(layer); - } - - // enable debug layers by default only when DEBUG flag is set. - ocprops.setGroupEnabled(layer, settings.isDebug()); - PDResources resources = pdPage.getResources(); - if (resources == null) { - resources = new PDResources(); - pdPage.setResources(resources); - } - dictionariesToUpdate.add(catalog.getCOSObject()); - return layer; - } - - - @SneakyThrows - private void drawRectangle(PDPageContentStream contentStream, QuadPoint rect) { - - contentStream.saveGraphicsState(); - contentStream.setLineWidth(1); - contentStream.moveTo((float) rect.a().getX(), (float) rect.a().getY()); - contentStream.lineTo((float) rect.b().getX(), (float) rect.b().getY()); - contentStream.setStrokingColor(Color.ORANGE); - contentStream.stroke(); - contentStream.moveTo((float) rect.b().getX(), (float) rect.b().getY()); - contentStream.lineTo((float) rect.c().getX(), (float) rect.c().getY()); - contentStream.setStrokingColor(Color.BLUE); - contentStream.stroke(); - contentStream.moveTo((float) rect.c().getX(), (float) rect.c().getY()); - contentStream.lineTo((float) rect.d().getX(), (float) rect.d().getY()); - contentStream.setStrokingColor(Color.GREEN); - contentStream.stroke(); - contentStream.moveTo((float) rect.d().getX(), (float) rect.d().getY()); - contentStream.lineTo((float) rect.a().getX(), (float) rect.a().getY()); - contentStream.setStrokingColor(Color.MAGENTA); - contentStream.stroke(); - contentStream.restoreGraphicsState(); - } - - - private void drawInvisibleWord(TextPositionInImage word, PDPageContentStream contentStream) { - - drawWord(word, contentStream, RenderingMode.NEITHER); - } - - - private void drawVisibleWord(TextPositionInImage word, PDPageContentStream contentStream) { - - drawWord(word, contentStream, RenderingMode.FILL); - } - - - // @SneakyThrows - private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) { - - try { - contentStream.setNonStrokingColor(switch (position.getFontStyle()) { - case BOLD -> Color.RED; - case ITALIC -> Color.GREEN; - default -> Color.BLUE; - }); - contentStream.beginText(); - contentStream.setRenderingMode(renderingMode); - contentStream.setFont(position.getFont(), (float) position.getFontSize()); - contentStream.setTextMatrix(position.getTextMatrix()); - contentStream.showText(position.getText()); - contentStream.endText(); - - } catch (Exception e) { - log.error("Failed to write text {}", position.getText()); - log.error(e.getMessage()); - } - } - - - @SneakyThrows - private void drawGrid(PDPageContentStream contentStream, QuadPoint rect) { - - drawRectangle(contentStream, rect); - - contentStream.saveGraphicsState(); - contentStream.setStrokingColor(Color.BLACK); - contentStream.setLineWidth(0.2F); int nRows = 8; int nCols = 8; @@ -218,7 +135,7 @@ public class OcrResultWriter { Point2D start = add(rect.a(), abStep); Point2D end = add(rect.d(), abStep); for (int row = 0; row < nRows; ++row) { - drawLine(start, end, contentStream); + lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f)); start = add(start, abStep); end = add(end, abStep); } @@ -226,21 +143,12 @@ public class OcrResultWriter { start = add(rect.a(), adStep); end = add(rect.b(), adStep); for (int col = 0; col < nCols; ++col) { - drawLine(start, end, contentStream); + lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f)); start = add(start, adStep); end = add(end, adStep); } - contentStream.restoreGraphicsState(); - } - - - @SneakyThrows - private void drawLine(Point2D a, Point2D b, PDPageContentStream contentStream) { - - contentStream.moveTo((float) a.getX(), (float) a.getY()); - contentStream.lineTo((float) b.getX(), (float) b.getY()); - contentStream.stroke(); + return lines; } diff --git a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java index 56f74ef..da3d0e2 100644 --- a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java @@ -2,18 +2,23 @@ package com.knecon.fforesight.service.ocr.v1.server.queue; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; +import java.nio.file.Path; import java.time.OffsetDateTime; import java.time.temporal.ChronoUnit; +import org.apache.commons.io.FileUtils; import org.springframework.amqp.AmqpRejectAndDontRequeueException; import org.springframework.amqp.core.Message; import org.springframework.amqp.rabbit.annotation.RabbitHandler; import org.springframework.amqp.rabbit.annotation.RabbitListener; import org.springframework.http.HttpStatus; import org.springframework.stereotype.Service; +import org.springframework.util.FileSystemUtils; import com.fasterxml.jackson.databind.ObjectMapper; +import com.knecon.fforesight.service.ocr.processor.service.OsUtils; import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.knecon.fforesight.service.ocr.processor.service.FileStorageService; import com.knecon.fforesight.service.ocr.processor.service.OCRService; @@ -21,7 +26,6 @@ import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo; import feign.FeignException; -import io.micrometer.observation.annotation.Observed; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; import lombok.experimental.FieldDefaults; @@ -33,10 +37,10 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class OcrMessageReceiver { - FileStorageService fileStorageService; - ObjectMapper objectMapper; - FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; - OCRService ocrService; + FileStorageService fileStorageService; + ObjectMapper objectMapper; + FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; + OCRService ocrService; @RabbitHandler @@ -44,33 +48,33 @@ public class OcrMessageReceiver { public void receiveOcr(Message in) throws IOException { DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class); - log.info("--------------------------------------------------------------------------"); - log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + String dossierId = ocrRequestMessage.getDossierId(); + String fileId = ocrRequestMessage.getFileId(); + Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId); try { - setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + log.info("--------------------------------------------------------------------------"); + log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId); - if (!fileStorageService.untouchedFileExists(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId())) { - byte[] originalFile = fileStorageService.getOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); - fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile); - } + setStatusOcrProcessing(dossierId, fileId); - try (var transferStream = new ByteArrayOutputStream()) { - ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream); - try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) { - fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream); - } - } catch (IOException e) { - log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); - throw new RuntimeException(e); - } + File documentFile = tmpDir.resolve("document.pdf").toFile(); + File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile(); - fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + fileStorageService.downloadFiles(dossierId, fileId, documentFile, viewerDocumentFile); + + ocrService.runOcrOnDocument(dossierId, fileId, tmpDir, documentFile, viewerDocumentFile); + + fileStorageService.storeFiles(dossierId, fileId, documentFile, viewerDocumentFile); + + fileStatusProcessingUpdateClient.ocrSuccessful(dossierId, fileId); } catch (Exception e) { log.warn("An exception occurred in ocr file stage: {}", e.getMessage()); in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage()); in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS)); throw new RuntimeException(e); + } finally { + FileSystemUtils.deleteRecursively(tmpDir); } } @@ -80,6 +84,7 @@ public class OcrMessageReceiver { public void receiveOcrDLQ(Message failedMessage) throws IOException { DocumentRequest ocrRequestMessage = objectMapper.readValue(failedMessage.getBody(), DocumentRequest.class); + log.info("OCR DQL received: {}", ocrRequestMessage); String errorMessage = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_HEADER); OffsetDateTime timestamp = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER); diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 2c3e40d..ec638b1 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -9,6 +9,7 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.Comparator; import java.util.List; import java.util.concurrent.TimeUnit; @@ -25,13 +26,14 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.ocr.processor.service.FileStorageService; import com.knecon.fforesight.service.ocr.processor.service.OCRService; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; +import com.knecon.fforesight.service.ocr.processor.service.OsUtils; import com.knecon.fforesight.tenantcommons.TenantContext; import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -64,7 +66,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/UNAPPROVED_VV-331155 (1).pdf"); + String text = testOCR("files/402Study.pdf"); } @@ -116,18 +118,17 @@ public class OcrServiceIntegrationTest extends AbstractTest { private String testOCR(String fileName) { ClassPathResource pdfFileResource = new ClassPathResource(fileName); - var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN); - try (var fileStream = pdfFileResource.getInputStream()) { - storageService.storeObject(TenantContext.getTenantId(), originId, fileStream); - } + Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(Path.of(fileName).getFileName()); + tmpDir.toFile().mkdirs(); + var documentFile = tmpDir.resolve(Path.of("document.pdf")); + var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf")); + Files.copy(pdfFileResource.getFile().toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING); + Files.copy(pdfFileResource.getFile().toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING); - Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(fileName).getFileName()); - try (var out = new FileOutputStream(tmpFileName.toFile())) { - ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out); - System.out.println("File:" + tmpFileName); - } + ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", tmpDir, documentFile.toFile(), viewerDocumentFile.toFile()); + System.out.println("File:" + documentFile); - try (var fileStream = new FileInputStream(tmpFileName.toFile())) { + try (var fileStream = new FileInputStream(documentFile.toFile())) { return extractAllTextFromDocument(fileStream); } } @@ -166,20 +167,18 @@ public class OcrServiceIntegrationTest extends AbstractTest { } - @SneakyThrows private void testOCRForFile(File file) { - var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN); - try (var fileStream = new FileInputStream(file)) { - storageService.storeObject(TenantContext.getTenantId(), originId, fileStream); - } + Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(file.toPath().getFileName()); + tmpDir.toFile().mkdirs(); + var documentFile = tmpDir.resolve(Path.of("document.pdf")); + var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf")); + Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING); + Files.copy(file.toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING); - Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(file.getAbsolutePath()).getFileName()); - try (var out = new FileOutputStream(tmpFileName.toFile())) { - ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out); - System.out.println("File:" + tmpFileName); - } + ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", tmpDir, documentFile.toFile(), viewerDocumentFile.toFile()); + System.out.println("File:" + documentFile); System.out.println("\n\n"); } From 2bbc3775c53b117b4995ad77d681431859f9ac22 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 7 Feb 2024 11:31:40 +0100 Subject: [PATCH 2/3] RED-8156: add ocr debug layers to viewer document --- .../ocr-service-processor/build.gradle.kts | 2 +- .../ocr/processor/model/OcrResultToWrite.java | 17 +++-- .../processor/service/OcrResultWriter.java | 63 ++++++++++++------- .../service/fonts/FontMetricsFactory.java | 3 +- .../fonts/Type0FontMetricsFactory.java | 61 ++++++++++++++---- .../scriptdetection/FontStyleDetector.java | 6 +- .../v1/server/queue/OcrMessageReceiver.java | 1 + publish-custom-image.sh | 2 +- 8 files changed, 110 insertions(+), 45 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/build.gradle.kts b/ocr-service-v1/ocr-service-processor/build.gradle.kts index 9b61c1d..d1f4f8f 100644 --- a/ocr-service-v1/ocr-service-processor/build.gradle.kts +++ b/ocr-service-v1/ocr-service-processor/build.gradle.kts @@ -25,6 +25,6 @@ dependencies { api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.google.guava:guava:31.1-jre") api("com.iqser.red.commons:pdftron-logic-commons:2.23.0") - api("com.knecon.fforesight:viewer-doc-processor:0.3.0") + api("com.knecon.fforesight:viewer-doc-processor:0.89.0") testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java index ccbd45a..3a4c10b 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java @@ -25,11 +25,18 @@ public record OcrResultToWrite(List textPositionInImage, Qu .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue() .stream() - .map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords() - .stream() - .filter(word -> !word.isBlank()) - .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR)) - .toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace())) + .map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace())) .toList())); } + + + private static List toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) { + + return ocrResult.getAllWords() + .stream() + .filter(word -> !word.isBlank()) + .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR)) + .toList(); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index 1292859..4bae7b4 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -9,6 +9,8 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; @@ -18,10 +20,10 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; -import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.viewerdoc.ContentStreams; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; @@ -43,23 +45,36 @@ public class OcrResultWriter { @SneakyThrows public void drawOcrResultsToPdf(File document, File viewerDocument, Map> imagesWithResultsPerPage) { - List ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage); - List ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage); - List ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage); - viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); - viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); - viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false); - viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false); + Map ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage); + Map ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage); + Map ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage); + + Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); + + List debugVisualizations = List.of(visualizations, + new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false), + new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false)); + + viewerDocumentService.addVisualizationsOnPage(document, document, visualizations); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations); } - private List createVisualizations(Map> imagesWithResultsPerPage) { + private Map createVisualizations(Map> imagesWithResultsPerPage) { - return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + return imagesWithResultsPerPage.keySet() + .stream() + .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber)))); } - private VisualizationsOnPage createVisualizations(Integer pageNumber, List ocrResultsToWrite) { + private static Function pageNumber1IdxTo0IdxMapper() { + // PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index + return p -> p - 1; + } + + + private VisualizationsOnPage createVisualizations(List ocrResultsToWrite) { List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); List placedTexts = words.stream() @@ -67,21 +82,23 @@ public class OcrResultWriter { null, Color.BLACK, (float) word.getFontSize(), - word.getFont(), + word.getFontMetricsFactory(), Optional.of(word.getTextMatrix()), Optional.of(RenderingMode.NEITHER))) .toList(); - return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build(); + return VisualizationsOnPage.builder().placedTexts(placedTexts).build(); } - private List createDebugTextVisualizations(Map> imagesWithResultsPerPage) { + private Map createDebugTextVisualizations(Map> imagesWithResultsPerPage) { - return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + return imagesWithResultsPerPage.keySet() + .stream() + .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber)))); } - private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List ocrResultsToWrite) { + private VisualizationsOnPage createDebugTextVisualizations(List ocrResultsToWrite) { List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); List placedTexts = words.stream() @@ -89,28 +106,30 @@ public class OcrResultWriter { null, word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED, (float) word.getFontSize(), - word.getFont(), + word.getFontMetricsFactory(), Optional.of(word.getTextMatrix()), Optional.of(RenderingMode.FILL))) .toList(); - return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build(); + return VisualizationsOnPage.builder().placedTexts(placedTexts).build(); } - private List createDebugBBoxVisualizations(Map> imagesWithResultsPerPage) { + private Map createDebugBBoxVisualizations(Map> imagesWithResultsPerPage) { - return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + return imagesWithResultsPerPage.keySet() + .stream() + .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)))); } - private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List ocrResultsToWrite) { + private VisualizationsOnPage createDebugBBoxVisualizations(List ocrResultsToWrite) { List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); List coloredLines = Stream.concat(// words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),// ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)// ).flatMap(Collection::stream).toList(); - return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build(); + return VisualizationsOnPage.builder().coloredLines(coloredLines).build(); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java index 039b217..a944e6d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java @@ -4,11 +4,12 @@ import org.apache.pdfbox.pdmodel.font.PDFont; import com.knecon.fforesight.service.ocr.processor.model.FontMetrics; import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent; +import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -public interface FontMetricsFactory { +public interface FontMetricsFactory extends EmbeddableFont { default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java index 059daed..b71f646 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java @@ -1,8 +1,6 @@ package com.knecon.fforesight.service.ocr.processor.service.fonts; import java.io.ByteArrayInputStream; -import java.util.Collections; -import java.util.List; import java.util.Set; import org.apache.fontbox.ttf.GlyphData; @@ -15,47 +13,62 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font; import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent; +import lombok.AllArgsConstructor; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import software.amazon.awssdk.services.s3.endpoints.internal.Value; @Slf4j @RequiredArgsConstructor +@AllArgsConstructor public class Type0FontMetricsFactory implements FontMetricsFactory { - private final PDType0Font type0Font; - private final TrueTypeFont trueTypeFont; + private final String resourcePath; + private PDType0Font type0Font; + private TrueTypeFont trueTypeFont; + private PDDocument documentThisIsEmbeddedIn; // for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent. private static final Set slashGlyphIds = Set.of(18, 63); + @SneakyThrows public static Type0FontMetricsFactory regular(PDDocument document) { - return createFromResource("fonts/cmu-regular.ttf", document); + String resourcePath = "fonts/cmu-regular.ttf"; + return createFromResourcePath(resourcePath, document); } + @SneakyThrows public static Type0FontMetricsFactory bold(PDDocument document) { - return createFromResource("fonts/cmu-bold.ttf", document); + String resourcePath = "fonts/cmu-bold.ttf"; + return createFromResourcePath(resourcePath, document); } @SneakyThrows @SuppressWarnings("PMD.CloseResource") - // Todo i think this is not ok to never close the font... - private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) { + private static TrueTypeFont readFromResourcePath(String resourcePath) { + // The ttf is closed with the document, see PDType0Font line 134 try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) { - TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information - PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support - return new Type0FontMetricsFactory(type0Font, trueTypeFont); + return new TTFParser().parse(buffer); } } + @SneakyThrows + private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) { + + TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath); + // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information + return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support) + + } + + @SneakyThrows public HeightAndDescent calculateHeightAndDescent(String text) { @@ -99,4 +112,28 @@ public class Type0FontMetricsFactory implements FontMetricsFactory { return type0Font; } + + @Override + @SneakyThrows + public PDFont embed(PDDocument document) { + + if (documentThisIsEmbeddedIn.equals(document)) { + return getFont(); + } + + // no need to close, the font will be closed with the document it is embedded in + + this.trueTypeFont = readFromResourcePath(resourcePath); + this.type0Font = PDType0Font.load(document, trueTypeFont, true); + this.documentThisIsEmbeddedIn = document; + return getFont(); + } + + + @SneakyThrows + public void close() { + + trueTypeFont.close(); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java index d12af43..4f85ac9 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java @@ -49,11 +49,11 @@ public class FontStyleDetector { * (Opening (Morphology)). * We then threshold the ratio of remaining pixels to determine whether a word is bold or not. *

- * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size. - * But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height. + * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation. + * But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height. * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math. * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case. - * It seems it scales with the square root of the text height. Or at least this seemed to give the best results. + * It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me. */ public Map> detectBold(List ocrResults, PDDocument document) { diff --git a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java index da3d0e2..0d3f46d 100644 --- a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java @@ -58,6 +58,7 @@ public class OcrMessageReceiver { setStatusOcrProcessing(dossierId, fileId); + tmpDir.toFile().mkdirs(); File documentFile = tmpDir.resolve("document.pdf").toFile(); File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile(); diff --git a/publish-custom-image.sh b/publish-custom-image.sh index 80ecc4e..9afa975 100755 --- a/publish-custom-image.sh +++ b/publish-custom-image.sh @@ -11,5 +11,5 @@ commit_hash=$(git rev-parse --short=5 HEAD) # Combine branch and commit hash buildName="${USER}-${branch}-${commit_hash}" -gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache +gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName" From d2f2def1c28230568ef9b30268db5e623c48b15a Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 7 Feb 2024 11:36:42 +0100 Subject: [PATCH 3/3] RED-8156: add ocr debug layers to viewer document * fix pmd * disable tests again --- .../ocr/processor/service/fonts/Type0FontMetricsFactory.java | 1 + .../service/ocr/v1/server/OcrServiceIntegrationTest.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java index b71f646..af39e02 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java @@ -60,6 +60,7 @@ public class Type0FontMetricsFactory implements FontMetricsFactory { @SneakyThrows + @SuppressWarnings("PMD.CloseResource") private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) { TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath); diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index ec638b1..6e77461 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -33,7 +33,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest {