diff --git a/ocr-service-v1/ocr-service-processor/build.gradle.kts b/ocr-service-v1/ocr-service-processor/build.gradle.kts index 8269636..9b61c1d 100644 --- a/ocr-service-v1/ocr-service-processor/build.gradle.kts +++ b/ocr-service-v1/ocr-service-processor/build.gradle.kts @@ -25,5 +25,6 @@ dependencies { api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.google.guava:guava:31.1-jre") api("com.iqser.red.commons:pdftron-logic-commons:2.23.0") + api("com.knecon.fforesight:viewer-doc-processor:0.3.0") testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java index 2a66bfd..345e86a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java @@ -1,14 +1,26 @@ package com.knecon.fforesight.service.ocr.processor; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.context.properties.EnableConfigurationProperties; +import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; + +import io.micrometer.observation.ObservationRegistry; @Configuration @ComponentScan @EnableConfigurationProperties(OcrServiceSettings.class) public class OcrServiceProcessorConfiguration { + @Bean + @Autowired + public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) { + + return new ViewerDocumentService(registry); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java index c8161d6..c21e6b4 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java @@ -1,13 +1,11 @@ package com.knecon.fforesight.service.ocr.processor.service; -import java.io.ByteArrayInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.InputStream; import java.nio.file.Files; -import java.nio.file.Paths; -import java.nio.file.StandardOpenOption; +import java.nio.file.StandardCopyOption; -import org.apache.commons.io.IOUtils; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; @@ -31,47 +29,38 @@ public class FileStorageService { return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension(); } - - @SneakyThrows - public byte[] getOriginalFile(String dossierId, String fileId) { - - try (InputStream inputStream = getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN))) { - return IOUtils.toByteArray(inputStream); - } - } - - - @SneakyThrows - public InputStream getOriginalFileAsStream(String dossierId, String fileId) { - - return getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN)); - } - - - public void storeOriginalFile(String dossierId, String fileId, InputStream stream) { - - storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), stream); - } - - public boolean untouchedFileExists(String dossierId, String fileId) { return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED)); } + @SneakyThrows + public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) { - public void storeUntouchedFile(String dossierId, String fileId, byte[] data) { - - storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data)); + try (var in = new FileInputStream(documentFile)) { + storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), in); + } + try (var in = new FileInputStream(viewerDocumentFile)) { + storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), in); + } } @SneakyThrows - private InputStream getInputStream(String storageId) { + public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) { - File tempFile = File.createTempFile("temp", ".data"); - storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile); - return Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE); + storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile); + if (storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT))) { + storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile); + } else { + Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + + if (!untouchedFileExists(dossierId, fileId)) { + try (var in = new FileInputStream(documentFile)) { + storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), in); + } + } } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index b859051..9949d3d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -1,12 +1,12 @@ package com.knecon.fforesight.service.ocr.processor.service; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -27,6 +27,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult; import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.pdftron.pdf.PDFDoc; import io.micrometer.observation.ObservationRegistry; import io.micrometer.observation.annotation.Observed; @@ -58,55 +59,66 @@ public class OCRService { * looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead), * perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements. * - * @param dossierId Id of dossier - * @param fileId Id of file - * @param out OutputStream where to write to + * @param dossierId Id of dossier + * @param fileId Id of file + * @param tmpDir working directory for all files + * @param documentFile the file to perform ocr on, results are written invisibly + * @param viewerDocumentFile debugging file, results are written visibly in an optional content group */ @Observed(name = "OCRService", contextualName = "run-ocr-on-document") @SneakyThrows - public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) { - - try (InputStream fileStream = removeWatermarkIfEnabled(dossierId, fileId); ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { - - invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false, false); - - try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { - log.info("Starting OCR for file {}", fileId); - long ocrStart = System.currentTimeMillis(); - Statistics stats = runOcr(transferInputStream, out, fileId, dossierId); - long ocrEnd = System.currentTimeMillis(); - log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0)); - log.info("Runtime breakdown: {}", stats); - } - } - } - - - private InputStream removeWatermarkIfEnabled(String dossierId, String fileId) throws IOException { + public void runOcrOnDocument(String dossierId, String fileId, Path tmpDir, File documentFile, File viewerDocumentFile) { if (settings.isRemoveWatermark()) { - try (var in = fileStorageService.getOriginalFileAsStream(dossierId, fileId); var transferOutputStream = new ByteArrayOutputStream()) { - watermarkRemovalService.removeWatermarks(in, transferOutputStream); - return new ByteArrayInputStream(transferOutputStream.toByteArray()); - } + removeWatermarkIfEnabled(documentFile); } - return fileStorageService.getOriginalFileAsStream(dossierId, fileId); + removeInvisibleElements(documentFile); + + log.info("Starting OCR for file {}", fileId); + long ocrStart = System.currentTimeMillis(); + Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId); + long ocrEnd = System.currentTimeMillis(); + log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0)); + log.info("Runtime breakdown: {}", stats); + } @SneakyThrows - public Statistics runOcr(InputStream in, OutputStream out, String fileId, String dossierId) { + private void removeInvisibleElements(File originFile) { + + Path tmpFile = Files.createTempFile("invisibleElements", ".pdf"); + try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) { + invisibleElementRemovalService.removeInvisibleElements(in, out, false, false); + } + Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + assert tmpFile.toFile().delete(); + } + + + @SneakyThrows + private void removeWatermarkIfEnabled(File originFile) { + + Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf"); + try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) { + watermarkRemovalService.removeWatermarks(in, out); + } + Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + assert tmpFile.toFile().delete(); + } + + + @SneakyThrows + public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) { long timestamp; - Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId); + Path tmpImageDir = tmpDir.resolve("images"); Path tesseractOutputDir = tmpDir.resolve("tesseract_output"); tesseractOutputDir.toFile().mkdirs(); tmpImageDir.toFile().mkdirs(); - File documentFile = OsUtils.writeFileToTmpFolder(in, tmpDir); - Statistics stats; try (PDDocument document = Loader.loadPDF(documentFile)) { OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId); @@ -150,12 +162,11 @@ public class OCRService { stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp); timestamp = System.currentTimeMillis(); - var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage); + ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage); + log.info("Saving document"); - document.saveIncremental(out, dictionariesToUpdate); stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp); - FileSystemUtils.deleteRecursively(tmpDir); logger.sendFinished(); return stats; } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index 30e2ee7..1292859 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -1,29 +1,29 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.awt.Color; +import java.awt.geom.Line2D; import java.awt.geom.Point2D; +import java.io.File; import java.util.Collection; -import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Set; +import java.util.Optional; +import java.util.stream.Stream; -import org.apache.pdfbox.cos.COSDictionary; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentCatalog; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDPageContentStream; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup; -import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.viewerdoc.ContentStreams; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.PlacedText; +import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; +import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; @@ -37,180 +37,97 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class OcrResultWriter { - static String ocrLayerName = "knecon OCR"; - OcrServiceSettings settings; + ViewerDocumentService viewerDocumentService; @SneakyThrows - public Set drawOcrResultsToPdf(PDDocument document, Map> imagesWithResultsPerPage) { + public void drawOcrResultsToPdf(File document, File viewerDocument, Map> imagesWithResultsPerPage) { - Set dictionariesToUpdate = new HashSet<>(); - imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate)); - dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject()); - return dictionariesToUpdate; + List ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage); + List ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage); + List ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage); + viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false); + viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false); + } + + + private List createVisualizations(Map> imagesWithResultsPerPage) { + + return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + } + + + private VisualizationsOnPage createVisualizations(Integer pageNumber, List ocrResultsToWrite) { + + List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); + List placedTexts = words.stream() + .map(word -> new PlacedText(word.getText(), + null, + Color.BLACK, + (float) word.getFontSize(), + word.getFont(), + Optional.of(word.getTextMatrix()), + Optional.of(RenderingMode.NEITHER))) + .toList(); + return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build(); + } + + + private List createDebugTextVisualizations(Map> imagesWithResultsPerPage) { + + return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + } + + + private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List ocrResultsToWrite) { + + List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); + List placedTexts = words.stream() + .map(word -> new PlacedText(word.getText(), + null, + word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED, + (float) word.getFontSize(), + word.getFont(), + Optional.of(word.getTextMatrix()), + Optional.of(RenderingMode.FILL))) + .toList(); + return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build(); + } + + + private List createDebugBBoxVisualizations(Map> imagesWithResultsPerPage) { + + return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList(); + } + + + private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List ocrResultsToWrite) { + + List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); + List coloredLines = Stream.concat(// + words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),// + ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)// + ).flatMap(Collection::stream).toList(); + return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build(); + } + + + private List quadPointAsLines(QuadPoint rect) { + + return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1), + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1), + new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1)); } @SneakyThrows - private void drawResultsPerPage(PDDocument document, Integer pageNumber, List ocrResultToWrite, Set dictionariesToUpdate) { + private List createGrid(QuadPoint rect) { - var pdPage = document.getPage(pageNumber - 1); + List lines = new LinkedList<>(quadPointAsLines(rect)); - PDOptionalContentGroup textDebugLayer = new PDOptionalContentGroup(ocrLayerName); - PDOptionalContentGroup bBoxDebugLayer = new PDOptionalContentGroup(ocrLayerName + "BBox"); - if (settings.isDebug()) { - textDebugLayer = addOptionalGroup(ocrLayerName, document, pdPage, dictionariesToUpdate); - bBoxDebugLayer = addOptionalGroup(ocrLayerName + " BBox", document, pdPage, dictionariesToUpdate); - } - - escapeContentStreams(document, pdPage); - - List words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); - try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) { - - // write invisible ocr text inside tagged content - contentStream.beginMarkedContent(settings.getOcrMarkedContentTag()); - contentStream.saveGraphicsState(); - contentStream.setNonStrokingColor(Color.BLUE); - contentStream.setStrokingColor(Color.BLUE); - contentStream.setLineWidth(1); - words.forEach(word -> drawInvisibleWord(word, contentStream)); - contentStream.restoreGraphicsState(); - contentStream.endMarkedContent(); - - if (settings.isDebug()) { // must not be written, as it will interfere with layout parsing - // write visible ocr text inside optional group - contentStream.beginMarkedContent(COSName.OC, textDebugLayer); - contentStream.saveGraphicsState(); - words.forEach(word -> drawVisibleWord(word, contentStream)); - contentStream.restoreGraphicsState(); - contentStream.endMarkedContent(); - - // write word bounding boxes (tesseract output) inside optional group - contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer); - contentStream.saveGraphicsState(); - ocrResultToWrite.stream() - .map(OcrResultToWrite::imageBoundingBox) - .forEach(imagePosition -> drawGrid(contentStream, imagePosition)); - words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word)); - contentStream.restoreGraphicsState(); - contentStream.endMarkedContent(); - } - } - dictionariesToUpdate.add(pdPage.getCOSObject()); - dictionariesToUpdate.add(pdPage.getResources().getCOSObject()); - } - - - @SneakyThrows - private static void escapeContentStreams(PDDocument document, PDPage pdPage) { - // We need to append to the contentstream, otherwise the content could be overlapped by images - // But we also need to save the graphics state before, such that our appended content cannot be affected by previous contentstreams with side-effects, such as not escaped matrix transformations - try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) { - contentStream.saveGraphicsState(); - } - try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, false)) { - contentStream.restoreGraphicsState(); - } - } - - - private PDOptionalContentGroup addOptionalGroup(String ocrLayerName, PDDocument document, PDPage pdPage, Set dictionariesToUpdate) { - - PDDocumentCatalog catalog = document.getDocumentCatalog(); - PDOptionalContentProperties ocprops = catalog.getOCProperties(); - if (ocprops == null) { - ocprops = new PDOptionalContentProperties(); - catalog.setOCProperties(ocprops); - } - PDOptionalContentGroup layer = null; - if (ocprops.hasGroup(ocrLayerName)) { - layer = ocprops.getGroup(ocrLayerName); - } else { - layer = new PDOptionalContentGroup(ocrLayerName); - ocprops.addGroup(layer); - } - - // enable debug layers by default only when DEBUG flag is set. - ocprops.setGroupEnabled(layer, settings.isDebug()); - PDResources resources = pdPage.getResources(); - if (resources == null) { - resources = new PDResources(); - pdPage.setResources(resources); - } - dictionariesToUpdate.add(catalog.getCOSObject()); - return layer; - } - - - @SneakyThrows - private void drawRectangle(PDPageContentStream contentStream, QuadPoint rect) { - - contentStream.saveGraphicsState(); - contentStream.setLineWidth(1); - contentStream.moveTo((float) rect.a().getX(), (float) rect.a().getY()); - contentStream.lineTo((float) rect.b().getX(), (float) rect.b().getY()); - contentStream.setStrokingColor(Color.ORANGE); - contentStream.stroke(); - contentStream.moveTo((float) rect.b().getX(), (float) rect.b().getY()); - contentStream.lineTo((float) rect.c().getX(), (float) rect.c().getY()); - contentStream.setStrokingColor(Color.BLUE); - contentStream.stroke(); - contentStream.moveTo((float) rect.c().getX(), (float) rect.c().getY()); - contentStream.lineTo((float) rect.d().getX(), (float) rect.d().getY()); - contentStream.setStrokingColor(Color.GREEN); - contentStream.stroke(); - contentStream.moveTo((float) rect.d().getX(), (float) rect.d().getY()); - contentStream.lineTo((float) rect.a().getX(), (float) rect.a().getY()); - contentStream.setStrokingColor(Color.MAGENTA); - contentStream.stroke(); - contentStream.restoreGraphicsState(); - } - - - private void drawInvisibleWord(TextPositionInImage word, PDPageContentStream contentStream) { - - drawWord(word, contentStream, RenderingMode.NEITHER); - } - - - private void drawVisibleWord(TextPositionInImage word, PDPageContentStream contentStream) { - - drawWord(word, contentStream, RenderingMode.FILL); - } - - - // @SneakyThrows - private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) { - - try { - contentStream.setNonStrokingColor(switch (position.getFontStyle()) { - case BOLD -> Color.RED; - case ITALIC -> Color.GREEN; - default -> Color.BLUE; - }); - contentStream.beginText(); - contentStream.setRenderingMode(renderingMode); - contentStream.setFont(position.getFont(), (float) position.getFontSize()); - contentStream.setTextMatrix(position.getTextMatrix()); - contentStream.showText(position.getText()); - contentStream.endText(); - - } catch (Exception e) { - log.error("Failed to write text {}", position.getText()); - log.error(e.getMessage()); - } - } - - - @SneakyThrows - private void drawGrid(PDPageContentStream contentStream, QuadPoint rect) { - - drawRectangle(contentStream, rect); - - contentStream.saveGraphicsState(); - contentStream.setStrokingColor(Color.BLACK); - contentStream.setLineWidth(0.2F); int nRows = 8; int nCols = 8; @@ -218,7 +135,7 @@ public class OcrResultWriter { Point2D start = add(rect.a(), abStep); Point2D end = add(rect.d(), abStep); for (int row = 0; row < nRows; ++row) { - drawLine(start, end, contentStream); + lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f)); start = add(start, abStep); end = add(end, abStep); } @@ -226,21 +143,12 @@ public class OcrResultWriter { start = add(rect.a(), adStep); end = add(rect.b(), adStep); for (int col = 0; col < nCols; ++col) { - drawLine(start, end, contentStream); + lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f)); start = add(start, adStep); end = add(end, adStep); } - contentStream.restoreGraphicsState(); - } - - - @SneakyThrows - private void drawLine(Point2D a, Point2D b, PDPageContentStream contentStream) { - - contentStream.moveTo((float) a.getX(), (float) a.getY()); - contentStream.lineTo((float) b.getX(), (float) b.getY()); - contentStream.stroke(); + return lines; } diff --git a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java index 56f74ef..da3d0e2 100644 --- a/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java @@ -2,18 +2,23 @@ package com.knecon.fforesight.service.ocr.v1.server.queue; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; +import java.nio.file.Path; import java.time.OffsetDateTime; import java.time.temporal.ChronoUnit; +import org.apache.commons.io.FileUtils; import org.springframework.amqp.AmqpRejectAndDontRequeueException; import org.springframework.amqp.core.Message; import org.springframework.amqp.rabbit.annotation.RabbitHandler; import org.springframework.amqp.rabbit.annotation.RabbitListener; import org.springframework.http.HttpStatus; import org.springframework.stereotype.Service; +import org.springframework.util.FileSystemUtils; import com.fasterxml.jackson.databind.ObjectMapper; +import com.knecon.fforesight.service.ocr.processor.service.OsUtils; import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.knecon.fforesight.service.ocr.processor.service.FileStorageService; import com.knecon.fforesight.service.ocr.processor.service.OCRService; @@ -21,7 +26,6 @@ import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo; import feign.FeignException; -import io.micrometer.observation.annotation.Observed; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; import lombok.experimental.FieldDefaults; @@ -33,10 +37,10 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class OcrMessageReceiver { - FileStorageService fileStorageService; - ObjectMapper objectMapper; - FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; - OCRService ocrService; + FileStorageService fileStorageService; + ObjectMapper objectMapper; + FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; + OCRService ocrService; @RabbitHandler @@ -44,33 +48,33 @@ public class OcrMessageReceiver { public void receiveOcr(Message in) throws IOException { DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class); - log.info("--------------------------------------------------------------------------"); - log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + String dossierId = ocrRequestMessage.getDossierId(); + String fileId = ocrRequestMessage.getFileId(); + Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId); try { - setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + log.info("--------------------------------------------------------------------------"); + log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId); - if (!fileStorageService.untouchedFileExists(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId())) { - byte[] originalFile = fileStorageService.getOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); - fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile); - } + setStatusOcrProcessing(dossierId, fileId); - try (var transferStream = new ByteArrayOutputStream()) { - ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream); - try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) { - fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream); - } - } catch (IOException e) { - log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); - throw new RuntimeException(e); - } + File documentFile = tmpDir.resolve("document.pdf").toFile(); + File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile(); - fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + fileStorageService.downloadFiles(dossierId, fileId, documentFile, viewerDocumentFile); + + ocrService.runOcrOnDocument(dossierId, fileId, tmpDir, documentFile, viewerDocumentFile); + + fileStorageService.storeFiles(dossierId, fileId, documentFile, viewerDocumentFile); + + fileStatusProcessingUpdateClient.ocrSuccessful(dossierId, fileId); } catch (Exception e) { log.warn("An exception occurred in ocr file stage: {}", e.getMessage()); in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage()); in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS)); throw new RuntimeException(e); + } finally { + FileSystemUtils.deleteRecursively(tmpDir); } } @@ -80,6 +84,7 @@ public class OcrMessageReceiver { public void receiveOcrDLQ(Message failedMessage) throws IOException { DocumentRequest ocrRequestMessage = objectMapper.readValue(failedMessage.getBody(), DocumentRequest.class); + log.info("OCR DQL received: {}", ocrRequestMessage); String errorMessage = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_HEADER); OffsetDateTime timestamp = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER); diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 2c3e40d..ec638b1 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -9,6 +9,7 @@ import java.io.FileInputStream; import java.io.FileOutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.Comparator; import java.util.List; import java.util.concurrent.TimeUnit; @@ -25,13 +26,14 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.ocr.processor.service.FileStorageService; import com.knecon.fforesight.service.ocr.processor.service.OCRService; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; +import com.knecon.fforesight.service.ocr.processor.service.OsUtils; import com.knecon.fforesight.tenantcommons.TenantContext; import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -64,7 +66,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/UNAPPROVED_VV-331155 (1).pdf"); + String text = testOCR("files/402Study.pdf"); } @@ -116,18 +118,17 @@ public class OcrServiceIntegrationTest extends AbstractTest { private String testOCR(String fileName) { ClassPathResource pdfFileResource = new ClassPathResource(fileName); - var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN); - try (var fileStream = pdfFileResource.getInputStream()) { - storageService.storeObject(TenantContext.getTenantId(), originId, fileStream); - } + Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(Path.of(fileName).getFileName()); + tmpDir.toFile().mkdirs(); + var documentFile = tmpDir.resolve(Path.of("document.pdf")); + var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf")); + Files.copy(pdfFileResource.getFile().toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING); + Files.copy(pdfFileResource.getFile().toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING); - Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(fileName).getFileName()); - try (var out = new FileOutputStream(tmpFileName.toFile())) { - ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out); - System.out.println("File:" + tmpFileName); - } + ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", tmpDir, documentFile.toFile(), viewerDocumentFile.toFile()); + System.out.println("File:" + documentFile); - try (var fileStream = new FileInputStream(tmpFileName.toFile())) { + try (var fileStream = new FileInputStream(documentFile.toFile())) { return extractAllTextFromDocument(fileStream); } } @@ -166,20 +167,18 @@ public class OcrServiceIntegrationTest extends AbstractTest { } - @SneakyThrows private void testOCRForFile(File file) { - var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN); - try (var fileStream = new FileInputStream(file)) { - storageService.storeObject(TenantContext.getTenantId(), originId, fileStream); - } + Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(file.toPath().getFileName()); + tmpDir.toFile().mkdirs(); + var documentFile = tmpDir.resolve(Path.of("document.pdf")); + var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf")); + Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING); + Files.copy(file.toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING); - Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(file.getAbsolutePath()).getFileName()); - try (var out = new FileOutputStream(tmpFileName.toFile())) { - ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out); - System.out.println("File:" + tmpFileName); - } + ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", tmpDir, documentFile.toFile(), viewerDocumentFile.toFile()); + System.out.println("File:" + documentFile); System.out.println("\n\n"); }