From 0b8de2882386203fabe1f61a31ea483ea3ca060b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Wed, 20 Nov 2024 11:44:55 +0100 Subject: [PATCH] RED-10477: update api version --- .../build.gradle.kts | 27 ++--- .../OcrServiceProcessorConfiguration.java | 16 +++ .../ocr/processor/OcrServiceSettings.java | 10 +- .../ocr/processor/model/ImageFile.java | 8 ++ .../ocr/processor/model/PageBatch.java | 113 ++++++++++++++++-- .../ocr/processor/model/Statistics.java | 2 +- .../processor/service/AsyncOcrService.java | 37 +++++- .../processor/service/AzureOcrResource.java | 5 + .../ocr/processor/service/BatchFactory.java | 63 ++++++++-- .../ocr/processor/service/BatchStats.java | 28 ++++- .../service/ImageDetectionService.java | 16 +-- .../ocr/processor/service/OCRService.java | 12 +- .../service/OcrExecutionSupervisor.java | 31 ++++- .../OcrResultPostProcessingPipeline.java} | 48 +++++--- .../BBoxSnuggificationService.java | 4 +- .../GhostScriptOutputHandler.java | 1 + .../imageprocessing/GhostScriptService.java | 50 ++++++-- .../ImageProcessingPipeline.java | 15 +-- .../ImageProcessingService.java | 26 +++- .../service/ocr/processor/utils/OsUtils.java | 13 +- .../layers/IdpLayerFactory.java | 4 - .../visualizations/layers/LayerFactory.java | 10 +- .../service/ImageProcessingPipelineTest.java | 5 +- .../ocr/processor/service/SnugBoxesTest.java | 30 ++--- .../azure-ocr-service-server/build.gradle.kts | 5 + .../service/ocr/v1/server/Application.java | 17 +-- .../ocr/v1/server}/FileStorageService.java | 2 +- .../TenantMessagingConfigurationImpl.java | 11 -- .../TenantQueueProviderConfig.java | 32 +++++ .../v1/server/queue/OcrMessageReceiver.java | 16 +-- .../ocr/v1/server/queue/OcrMessageSender.java | 3 - .../TenantExchangeMessageReceiverImpl.java | 70 ----------- .../service/ocr/v1/server/AbstractTest.java | 18 ++- .../v1/server/OcrServiceIntegrationTest.java | 4 +- 34 files changed, 468 insertions(+), 284 deletions(-) rename azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/{visualizations/WritableOcrResultFactory.java => service/OcrResultPostProcessingPipeline.java} (92%) rename azure-ocr-service/{azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service => azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server}/FileStorageService.java (97%) delete mode 100644 azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantMessagingConfigurationImpl.java create mode 100644 azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantQueueProviderConfig.java delete mode 100644 azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/TenantExchangeMessageReceiverImpl.java diff --git a/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts b/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts index 23d4579..eacdea0 100644 --- a/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts +++ b/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts @@ -10,19 +10,18 @@ configurations { } dependencies { - api(project(":azure-ocr-service-api")) - api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0") - api("net.sourceforge.tess4j:tess4j:5.8.0") - api("com.iqser.red.commons:metric-commons:2.1.0") - api("com.iqser.red.commons:storage-commons:2.49.0") - api("com.knecon.fforesight:tenant-commons:0.30.0") - api("com.pdftron:PDFNet:10.7.0") - api("org.apache.pdfbox:pdfbox:3.0.0") - api("org.apache.commons:commons-math3:3.6.1") - api("com.amazonaws:aws-java-sdk-kms:1.12.440") - api("com.google.guava:guava:31.1-jre") - api("com.knecon.fforesight:viewer-doc-processor:0.177.0") - api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3") - api("com.iqser.red.commons:pdftron-logic-commons:2.32.0") + implementation(project(":azure-ocr-service-api")) + implementation("net.sourceforge.tess4j:tess4j:5.8.0") + implementation("com.iqser.red.commons:metric-commons:2.1.0") + implementation("com.pdftron:PDFNet:11.0.0") + implementation("org.apache.pdfbox:pdfbox:3.0.0") + implementation("org.apache.commons:commons-math3:3.6.1") + implementation("com.amazonaws:aws-java-sdk-kms:1.12.440") + implementation("com.google.guava:guava:31.1-jre") + implementation("com.knecon.fforesight:viewer-doc-processor:0.193.0") + implementation("com.azure:azure-ai-documentintelligence:1.0.0-beta.4") + + implementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0") + testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java index d77b874..32f8c83 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceProcessorConfiguration.java @@ -6,6 +6,8 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; +import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; +import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService; import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import io.micrometer.observation.ObservationRegistry; @@ -22,4 +24,18 @@ public class OcrServiceProcessorConfiguration { return new PDFTronViewerDocumentService(registry); } + + @Bean + public InvisibleElementRemovalService invisibleElementRemovalService() { + + return new InvisibleElementRemovalService(); + } + + + @Bean + public WatermarkRemovalService watermarkRemovalService() { + + return new WatermarkRemovalService(); + } + } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java index dacadd2..ca5fbd0 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java @@ -11,14 +11,16 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - // Limits the number of concurrent calls to the azure API. In my very rudimentary testing, azure starts throwing "too many requests" errors at around 80/s. Higher numbers greatly improve the speed. - int concurrency = 8; + // Limits the number of concurrent calls to azure + int concurrency = 2; // Limits the number of pages per call. - int batchSize = 128; + int batchSize = 32; boolean debug; // writes the ocr layer visibly to the viewer doc pdf boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines. - boolean snuggify = true; + boolean snuggify = true; // attempts to shrink the word boxes returned by azure to fit the actual word pixels snug + boolean useCaches = true; // skips azure api, pdf rendering and image processing, when the files are already present + boolean azureFontStyleDetection; // omits all image processing and uses azures FONT_STYLE feature (costs 0.6ct per page) String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure.... } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ImageFile.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ImageFile.java index c3da798..b39f7ae 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ImageFile.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ImageFile.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.ocr.processor.model; +import java.io.File; + import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; @@ -10,4 +12,10 @@ public record ImageFile(int pageNumber, String absoluteFilePath) { return Leptonica1.pixRead(absoluteFilePath); } + + public boolean exists() { + + return new File(absoluteFilePath).exists(); + } + } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageBatch.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageBatch.java index 475195b..a3b12d8 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageBatch.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageBatch.java @@ -2,16 +2,25 @@ package com.knecon.fforesight.service.ocr.processor.model; import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals; +import java.io.File; +import java.io.FileInputStream; import java.nio.file.Path; +import java.util.ArrayList; import java.util.List; import java.util.function.Consumer; +import com.azure.ai.documentintelligence.models.AnalyzeResult; import com.azure.core.util.BinaryData; +import com.azure.json.JsonOptions; +import com.azure.json.JsonReader; +import com.azure.json.implementation.DefaultJsonReader; +import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.GhostScriptService; import lombok.AccessLevel; import lombok.Getter; import lombok.NonNull; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; @RequiredArgsConstructor @@ -21,23 +30,101 @@ public final class PageBatch implements Comparable { @Getter int index; @NonNull - List lookup; + List batchPageToOriginPageLookup; @NonNull @Getter Path batchDoc; @NonNull @Getter - Path imagePipelineDir; + Path batchDir; + + + @SneakyThrows + public AnalyzeResult getAzureResultCache() { + + try (var in = new FileInputStream(getAzureResultCacheFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) { + return AnalyzeResult.fromJson(reader); + } + } + + + @SneakyThrows + public File getAzureResultCacheFile() { + + return batchDir.resolve("analyzeResult.json").toFile(); + } + + + public List getRenderedImageFiles() { + + List renderedImageFiles = new ArrayList<>(); + for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) { + renderedImageFiles.add(getRenderedImageFile(batchPageToOriginPageLookup.get(i), i + 1)); + } + return renderedImageFiles; + } + + + public ImageFile getRenderedImageFile(int pageNumber, int numberInBatch) { + + return new ImageFile(pageNumber, getRenderedImageNameFormat().formatted(numberInBatch)); + } + + + public ImageFile getProcessedImageFile(int pageNumber, int numberInBatch) { + + return new ImageFile(pageNumber, getProcessedImageNameFormat().formatted(numberInBatch)); + } + + + public List getProcessedImageFiles() { + + List processedImageFiles = new ArrayList<>(); + for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) { + processedImageFiles.add(getProcessedImageFile(batchPageToOriginPageLookup.get(i), i + 1)); + } + return processedImageFiles; + } + + + public String getRenderedImageNameFormat() { + + return getRenderedImageDir().resolve(getImageFormat()).toFile().toString(); + } + + + public String getProcessedImageNameFormat() { + + return getProcessedImageDir().resolve(getImageFormat()).toFile().toString(); + } + + + private String getImageFormat() { + + return "output_" + index + ".%04d" + GhostScriptService.FORMAT; + } + + + public Path getRenderedImageDir() { + + return batchDir.resolve("rendered"); + } + + + public Path getProcessedImageDir() { + + return batchDir.resolve("processed"); + } @Override public String toString() { if (size() == 1) { - return String.format("%d", lookup.get(0)); + return String.format("%d", batchPageToOriginPageLookup.get(0)); } - List intervals = formatIntervals(lookup); + List intervals = formatIntervals(batchPageToOriginPageLookup); if (intervals.size() > 4) { intervals = intervals.subList(0, 4); intervals.add("..."); @@ -49,46 +136,46 @@ public final class PageBatch implements Comparable { public void forEach(Consumer consumer) { - lookup.forEach(consumer); + batchPageToOriginPageLookup.forEach(consumer); } public List getAllPageNumbers() { - return lookup; + return batchPageToOriginPageLookup; } public int size() { - return lookup.size(); + return batchPageToOriginPageLookup.size(); } public boolean isEmpty() { - return lookup.isEmpty(); + return batchPageToOriginPageLookup.isEmpty(); } public int getPageNumber(int pageNumber) { - return lookup.get(pageNumber - 1); + return batchPageToOriginPageLookup.get(pageNumber - 1); } @Override public int compareTo(PageBatch o) { - if (lookup.isEmpty() && o.lookup.isEmpty()) { + if (batchPageToOriginPageLookup.isEmpty() && o.batchPageToOriginPageLookup.isEmpty()) { return 0; - } else if (lookup.isEmpty()) { + } else if (batchPageToOriginPageLookup.isEmpty()) { return 1; - } else if (o.lookup.isEmpty()) { + } else if (o.batchPageToOriginPageLookup.isEmpty()) { return -1; } - return Integer.compare(lookup.get(0), o.lookup.get(0)); + return Integer.compare(batchPageToOriginPageLookup.get(0), o.batchPageToOriginPageLookup.get(0)); } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/Statistics.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/Statistics.java index 0ec9aa9..b5b73e4 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/Statistics.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/Statistics.java @@ -162,7 +162,7 @@ public class Statistics { return batchStats.values() .stream() - .mapToLong(BatchStats::getWritingTextDuration) + .mapToLong(BatchStats::getMappingResultDuration) .toArray(); } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AsyncOcrService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AsyncOcrService.java index f9aa206..73a6212 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AsyncOcrService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AsyncOcrService.java @@ -1,8 +1,9 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.util.List; -import java.util.Map; import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.function.Supplier; import org.slf4j.MDC; import org.springframework.stereotype.Service; @@ -22,6 +23,7 @@ import com.pdftron.pdf.PDFDoc; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; import reactor.core.publisher.Mono; @@ -67,6 +69,10 @@ public class AsyncOcrService { private void beginAnalysis(BinaryData data, BatchContext batchContext, Set features) throws InterruptedException { + if (settings.isUseCaches() && batchContext.batch().getAzureResultCacheFile().exists()) { + handleCached(batchContext); + } + batchContext.supervisor.enterConcurrency(batchContext.batch); batchContext.supervisor.logUploadStart(batchContext.batch, data.getLength()); @@ -85,13 +91,36 @@ public class AsyncOcrService { if (LongRunningOperationStatus.SUCCESSFULLY_COMPLETED == response.getStatus()) { return response.getFinalResult(); } - return Mono.error(new IllegalStateException("Polling completed unsuccessfully with status: " + response.getStatus())); + String message = "Polling completed unsuccessfully with status: " + response.getStatus(); + log.error(message); + return Mono.error(new IllegalStateException(message)); }).subscribe(finalResult -> handleSuccessful(finalResult, batchContext),// ex -> handleError(ex, batchContext),// () -> handleCompleted(batchContext)); } + @SneakyThrows + private static void handleCached(BatchContext batchContext) { + + var mdcContext = MDC.getCopyOfContextMap(); + Thread thread = new Thread(() -> { + MDC.setContextMap(mdcContext); + log.info("Batch {}: Using cached ocr result", batchContext.batch.getIndex()); + batchContext.batchStats().finishUpload(); + batchContext.batchStats().finishApiWait(); + batchContext.supervisor.logPageSuccess(batchContext.batch()); + try { + batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), batchContext.batch().getAzureResultCache()); + } catch (InterruptedException e) { + batchContext.supervisor.logPageError(batchContext.batch, e); + } + + }); + thread.start(); + } + + private static void handleCompleted(BatchContext batchContext) { log.info("Completed batch {} with pages {}", batchContext.batch.getIndex(), batchContext.batch); @@ -109,10 +138,8 @@ public class AsyncOcrService { batchContext.supervisor.leaveConcurrency(batchContext.batch); try { + mapper.writeValue(batchContext.batch().getAzureResultCacheFile(), finalResult); batchContext.supervisor.logPageSuccess(batchContext.batch()); - if (settings.isDebug()) { - mapper.writeValue(batchContext.batch().getImagePipelineDir().resolve("azure_result_%d.json" .formatted(batchContext.batch().getIndex())).toFile(), finalResult); - } batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), finalResult); } catch (Exception e) { handleError(e, batchContext); diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AzureOcrResource.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AzureOcrResource.java index 5170194..f78810c 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AzureOcrResource.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/AzureOcrResource.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Set; @@ -56,6 +57,7 @@ public class AzureOcrResource { buildFeatures(features), null, buildContentFormat(), + Collections.emptyList(), analyzeRequest); } @@ -86,6 +88,9 @@ public class AzureOcrResource { if (features.contains(AzureOcrFeature.IDP)) { azureFeatures.add(DocumentAnalysisFeature.KEY_VALUE_PAIRS); } + if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) { + azureFeatures.add(DocumentAnalysisFeature.STYLE_FONT); + } azureFeatures.add(DocumentAnalysisFeature.BARCODES); return azureFeatures; diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchFactory.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchFactory.java index 193154c..d7609cf 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchFactory.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchFactory.java @@ -1,7 +1,12 @@ package com.knecon.fforesight.service.ocr.processor.service; +import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals; + +import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Set; @@ -18,9 +23,10 @@ import com.pdftron.sdf.SDFDoc; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; - import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +@Slf4j @Service @RequiredArgsConstructor @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @@ -29,26 +35,39 @@ public class BatchFactory { OcrServiceSettings settings; - public static String formatBatchFilename(int number) { + @SneakyThrows + public List splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set features, Path runDir) { - return "batch_%d.pdf".formatted(number); + Set pagesToProcess = findPagesToProcess(pdfDoc, features); + supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesToProcess.size()); + + List batches = buildBatches(pdfDoc, supervisor, features, runDir, pagesToProcess); + if (batches.size() > 1) { + log.info("Split {} pages to process into {} batches", pagesToProcess.size(), batches.size()); + } + return batches; } @SneakyThrows - public List splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set features, Path fileDir) { + public Set findPagesToProcess(PDFDoc pdfDoc, Set features) { - Set pagesWithImages = ImageDetectionService.findPagesToProcess(pdfDoc, features); - supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesWithImages.size()); + if (features.contains(AzureOcrFeature.ALL_PAGES)) { + Set pages = new HashSet<>(); + for (int i = 1; i <= pdfDoc.getPageCount(); i++) { + pages.add(i); + } + return Collections.unmodifiableSet(pages); + } - return buildBatches(pdfDoc, supervisor, features, fileDir, pagesWithImages); + return ImageDetectionService.findPagesWithImages(pdfDoc); } public List buildBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set features, - Path fileDir, + Path runDir, Set pagesWithImages) throws PDFNetException { List batches = new ArrayList<>(); @@ -60,30 +79,48 @@ public class BatchFactory { } numbersForCurrentBatch.add(pageNumber); if (numbersForCurrentBatch.size() == settings.getBatchSize()) { - batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, fileDir)); + batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir)); numbersForCurrentBatch = new ArrayList<>(); } } if (!numbersForCurrentBatch.isEmpty()) { - batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, fileDir)); + batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir)); } return batches; } @SneakyThrows - public static PageBatch create(int number, PDFDoc pdfDoc, List pageNumbers, Path fileDir) { + public static PageBatch create(int number, PDFDoc pdfDoc, List pageNumbers, Path runDir) { if (pageNumbers.isEmpty()) { throw new IllegalArgumentException("pageNumbers must not be empty"); } + Path batchDir = formatBatchDir(number, pageNumbers, runDir); + Files.createDirectories(batchDir); - Path batchDocPath = fileDir.resolve(formatBatchFilename(number)); + Path batchDocPath = batchDir.resolve("batch.pdf"); try (var batchDoc = extractBatchDocument(pdfDoc, pageNumbers)) { Optimizer.optimize(batchDoc); batchDoc.save(batchDocPath.toFile().toString(), SDFDoc.SaveMode.LINEARIZED, null); } - return new PageBatch(number, pageNumbers, batchDocPath, fileDir); + PageBatch batch = new PageBatch(number, pageNumbers, batchDocPath, batchDir); + Files.createDirectories(batch.getRenderedImageDir()); + Files.createDirectories(batch.getProcessedImageDir()); + return batch; + } + + + private static Path formatBatchDir(int number, List pageNumbers, Path runDir) { + + List intervals = formatIntervals(pageNumbers); + if (intervals.size() > 4) { + intervals = intervals.subList(0, 4); + intervals.add("..."); + } + + String batchName = String.join(", ", intervals); + return runDir.resolve("batch_%04d_%s".formatted(number, batchName)); } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchStats.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchStats.java index 169e0e2..6d85224 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchStats.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/BatchStats.java @@ -10,7 +10,7 @@ public class BatchStats { private long apiWaitTimestamp = -1; private long imageUploadTimestamp = -1; - private long writingTextTimestamp = -1; + private long mappingResultTimestamp = -1; private long batchRenderTimestamp = -1; @@ -38,9 +38,9 @@ public class BatchStats { } - public void finishWritingText() { + public void finishMappingResult() { - writingTextTimestamp = System.currentTimeMillis(); + mappingResultTimestamp = System.currentTimeMillis(); } @@ -50,15 +50,33 @@ public class BatchStats { } + public boolean isApiWaitFinished() { + + return apiWaitTimestamp > 0; + } + + + public boolean isMappingResultFinished() { + + return mappingResultTimestamp > 0; + } + + + public boolean isBatchRenderFinished() { + + return batchRenderTimestamp > 0; + } + + public long getApiWaitDuration() {return this.apiWaitTimestamp - imageUploadTimestamp;} public long getImageUploadDuration() {return this.imageUploadTimestamp - batchRenderTimestamp;} - public long getWritingTextDuration() {return this.writingTextTimestamp - apiWaitTimestamp;} + public long getMappingResultDuration() {return this.mappingResultTimestamp - apiWaitTimestamp;} - public long getBatchRenderDuration() {return this.batchRenderTimestamp - startTimestamp;} + public long getBatchRenderDuration() {return startTimestamp - this.batchRenderTimestamp;} } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageDetectionService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageDetectionService.java index cc7584b..599ae0d 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageDetectionService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageDetectionService.java @@ -24,21 +24,7 @@ public class ImageDetectionService { @SneakyThrows - public Set findPagesToProcess(PDFDoc pdfDoc, Set features) { - - if (features.contains(AzureOcrFeature.ALL_PAGES)) { - Set pages = new HashSet<>(); - for (int i = 1; i <= pdfDoc.getPageCount(); i++) { - pages.add(i); - } - return Collections.unmodifiableSet(pages); - } - - return findPagesWithImages(pdfDoc); - } - - - private Set findPagesWithImages(PDFDoc pdfDoc) throws PDFNetException { + public Set findPagesWithImages(PDFDoc pdfDoc) { Set pagesWithImages = new HashSet<>(); try (ElementReader reader = new ElementReader()) { diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index 4c9b653..c87a5e0 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -38,8 +38,6 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class OCRService { - public static final String IMAGE_PIPELINE_DIR = "image_pipeline"; - public static final String AZURE_OUTPUT_DIR = "azure_output"; IOcrMessageSender ocrMessageSender; WatermarkRemovalService watermarkRemovalService; InvisibleElementRemovalService invisibleElementRemovalService; @@ -120,7 +118,7 @@ public class OCRService { @SneakyThrows - public OcrExecutionSupervisor runOcr(Path tmpDir, + public OcrExecutionSupervisor runOcr(Path runDir, File documentFile, File viewerDocumentFile, String fileId, @@ -128,12 +126,6 @@ public class OCRService { File analyzeResultFile, Set features) { - Path tmpImageDir = tmpDir.resolve(IMAGE_PIPELINE_DIR); - Path azureOutputDir = tmpDir.resolve(AZURE_OUTPUT_DIR); - - Files.createDirectories(azureOutputDir); - Files.createDirectories(tmpImageDir); - try (var in = new FileInputStream(documentFile); PDFDoc pdfDoc = new PDFDoc(in)) { OCGWatermarkRemovalService.removeWatermarks(pdfDoc); @@ -141,7 +133,7 @@ public class OCRService { OcrExecutionSupervisor supervisor = new OcrExecutionSupervisor(pdfDoc.getPageCount(), ocrMessageSender, fileId, settings); supervisor.getStatistics().setStart(); - List batches = batchFactory.splitIntoBatches(pdfDoc, supervisor, features, tmpImageDir); + List batches = batchFactory.splitIntoBatches(pdfDoc, supervisor, features, runDir); OcrResult ocrResult = asyncOcrService.awaitOcr(pdfDoc, supervisor, features, batches); diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrExecutionSupervisor.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrExecutionSupervisor.java index f72e1a0..8ca0b4e 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrExecutionSupervisor.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrExecutionSupervisor.java @@ -5,6 +5,7 @@ import static com.knecon.fforesight.service.ocr.processor.model.Statistics.human import java.util.Collections; import java.util.HashSet; +import java.util.Map; import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; @@ -65,10 +66,10 @@ public class OcrExecutionSupervisor { } - public void logImageExtractionFinished(int numberOfPages, int numberOfImages) { + public void logImageExtractionFinished(int numberOfPages, int numberOfPagesToProcess) { statistics.imageExtractionFinished(); - log.info("Images found on {}/{} pages in {}", numberOfImages, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration())); + log.info("Images found on {}/{} pages in {}", numberOfPagesToProcess, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration())); } @@ -95,7 +96,7 @@ public class OcrExecutionSupervisor { public void finishMappingResult(PageBatch batch) { batch.forEach(pageIndex -> countDownPagesToProcess.countDown()); - statistics.getBatchStats(batch).finishWritingText(); + statistics.getBatchStats(batch).finishMappingResult(); ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount()); } @@ -127,7 +128,29 @@ public class OcrExecutionSupervisor { private int processedPages() { - return (int) (totalPageCount - countDownPagesToProcess.getCount()); + if (countDownPagesToProcess.getCount() == 0) { + return totalPageCount; + } + int processedPages = 0; + for (Map.Entry entry : statistics.getBatchStats().entrySet()) { + PageBatch pageBatch = entry.getKey(); + BatchStats batchStats = entry.getValue(); + float percentage = 0; + if (batchStats.isBatchRenderFinished()) { + percentage += 0.1f; + } + if (batchStats.isUploadFinished()) { + percentage += 0.3f; + } + if (batchStats.isApiWaitFinished()) { + percentage += 0.3f; + } + if (batchStats.isMappingResultFinished()) { + percentage += 0.3f; + } + processedPages += (int) (pageBatch.size() * percentage); + } + return processedPages; } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultPostProcessingPipeline.java similarity index 92% rename from azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java rename to azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultPostProcessingPipeline.java index 66cbb16..5133928 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultPostProcessingPipeline.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.ocr.processor.visualizations; +package com.knecon.fforesight.service.ocr.processor.service; import java.awt.geom.AffineTransform; import java.awt.geom.Line2D; @@ -37,6 +37,7 @@ import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.Image import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor; import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.BBoxSnuggificationService; import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.StrokeWidthCalculator; +import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult; import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider; import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle; import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.Type0FontMetricsProvider; @@ -56,7 +57,7 @@ import net.sourceforge.lept4j.util.LeptUtils; @Slf4j @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class WritableOcrResultFactory { +public class OcrResultPostProcessingPipeline { @Getter Map resultToPageTransforms; @@ -67,10 +68,10 @@ public class WritableOcrResultFactory { @SneakyThrows - public WritableOcrResultFactory(Map pageInformation, - ImageProcessingPipeline imageProcessingPipeline, - OcrServiceSettings settings, - Set features) { + public OcrResultPostProcessingPipeline(Map pageInformation, + ImageProcessingPipeline imageProcessingPipeline, + OcrServiceSettings settings, + Set features) { this.imageProcessingPipeline = imageProcessingPipeline; this.pageInformation = pageInformation; @@ -80,14 +81,9 @@ public class WritableOcrResultFactory { } - public List buildOcrResultToWrite(AnalyzeResult analyzeResult, PageBatch batch) throws InterruptedException { + public List processAnalyzeResult(AnalyzeResult analyzeResult, PageBatch batch) throws InterruptedException { - Map anglesPerPage = analyzeResult.getPages() - .stream() - .collect(Collectors.toMap(DocumentPage::getPageNumber, documentPage -> -documentPage.getAngle())); - RotationCorrectionUtility.rotatePages(batch.getBatchDoc(), batch.getBatchDoc(), anglesPerPage); - - ImageProcessingSupervisor imageSupervisor = imageProcessingPipeline.addToPipeline(batch); + ImageProcessingSupervisor imageSupervisor = renderImagesIfNecessary(analyzeResult, batch); List writableOcrResultList = new ArrayList<>(); @@ -114,6 +110,30 @@ public class WritableOcrResultFactory { } + private ImageProcessingSupervisor renderImagesIfNecessary(AnalyzeResult analyzeResult, PageBatch batch) { + + ImageProcessingSupervisor imageSupervisor = null; + if (useRenderedImages()) { + + Map anglesPerPage = analyzeResult.getPages() + .stream() + .collect(Collectors.toMap(DocumentPage::getPageNumber, documentPage -> -documentPage.getAngle())); + RotationCorrectionUtility.rotatePages(batch.getBatchDoc(), batch.getBatchDoc(), anglesPerPage); + imageSupervisor = imageProcessingPipeline.addToPipeline(batch); + } + return imageSupervisor; + } + + + private boolean useRenderedImages() { + + if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) { + return false; + } + return settings.isSnuggify() || features.contains(AzureOcrFeature.FONT_STYLE_DETECTION); + } + + private List buildTextPositionsInImage(PageBatch pageOffset, DocumentPage resultPage, AffineTransform resultToPageTransform, @@ -121,7 +141,7 @@ public class WritableOcrResultFactory { PageInformation pageInformation, ImageProcessingSupervisor imageSupervisor) throws InterruptedException { - if (!settings.isSnuggify() && !features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) { + if (!useRenderedImages()) { return buildText(resultPage, resultToPageTransform, lookups, pageInformation); } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/BBoxSnuggificationService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/BBoxSnuggificationService.java index e9827b5..d38421f 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/BBoxSnuggificationService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/BBoxSnuggificationService.java @@ -7,7 +7,7 @@ import java.util.Optional; import com.azure.ai.documentintelligence.models.DocumentPage; import com.azure.ai.documentintelligence.models.DocumentWord; -import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory; +import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline; import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility; import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint; import com.sun.jna.Pointer; @@ -58,7 +58,7 @@ public class BBoxSnuggificationService { return Optional.empty(); } - Pix wordImage = WritableOcrResultFactory.extractWordImage(originTransformed, pageImage); + Pix wordImage = OcrResultPostProcessingPipeline.extractWordImage(originTransformed, pageImage); if (wordImage == null) { log.debug("Unable to extract word image! wordImage: {}, pageImage {}", originTransformed.getBounds2D(), new Rectangle2D.Float(0, 0, pageImage.w, pageImage.h)); diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptOutputHandler.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptOutputHandler.java index bb14307..d0996de 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptOutputHandler.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptOutputHandler.java @@ -15,6 +15,7 @@ import java.util.regex.Pattern; import org.slf4j.MDC; import com.knecon.fforesight.service.ocr.processor.model.ImageFile; +import com.knecon.fforesight.service.ocr.processor.model.PageBatch; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptService.java index e79e2ef..25ee225 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/GhostScriptService.java @@ -5,10 +5,12 @@ import java.nio.file.Path; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.Semaphore; import java.util.function.Consumer; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.model.ImageFile; import com.knecon.fforesight.service.ocr.processor.model.PageBatch; @@ -25,35 +27,43 @@ import lombok.extern.slf4j.Slf4j; @SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 72/74 public class GhostScriptService { - static String FORMAT = ".tiff"; + private OcrServiceSettings ocrServiceSettings; + public static String FORMAT = ".tiff"; static String DEVICE = "tiffgray"; static int DPI = 300; + private Semaphore concurrencySemaphore = new Semaphore(3); @SneakyThrows - public void startBatchRender(PageBatch batch, ImageProcessingSupervisor supervisor, Path renderedImageDir, Consumer successHandler, Consumer errorHandler) { + public void startBatchRender(PageBatch batch, ImageProcessingSupervisor supervisor, Consumer successHandler, Consumer errorHandler) { supervisor.requireNoErrors(); + List renderedImageFiles = batch.getRenderedImageFiles(); + if (ocrServiceSettings.isUseCaches() && renderedImageFiles.stream() + .allMatch(ImageFile::exists)) { + log.info("Batch {}: Using cached GhostScript rendering with page(s) {}", batch.getIndex(), batch); + renderedImageFiles.forEach(successHandler); + return; + } + + concurrencySemaphore.acquire(); log.info("Batch {}: starting GhostScript rendering with page(s) {}", batch.getIndex(), batch); - executeProcess(batch.getIndex(), buildCmdArgs(batch, renderedImageDir, batch.getBatchDoc()), successHandler, errorHandler); + executeProcess(batch.getIndex(), buildCmdArgs(batch, batch.getBatchDoc()), successHandler, errorHandler); } @SneakyThrows - private ProcessCmdsAndRenderedImageFiles buildCmdArgs(PageBatch batch, Path outputDir, Path document) { - - String imagePathFormat = outputDir.resolve("output_" + batch.getIndex() + ".%04d" + FORMAT).toFile().toString(); + private ProcessCmdsAndRenderedImageFiles buildCmdArgs(PageBatch batch, Path document) { Map fullPageImages = new HashMap<>(); - List allPageNumbers = batch.getAllPageNumbers(); - - for (int i = 0; i < allPageNumbers.size(); i++) { - Integer pageNumber = allPageNumbers.get(i); - fullPageImages.put(i + 1, new ImageFile(pageNumber, String.format(imagePathFormat, i + 1))); + List renderedImageFiles = batch.getRenderedImageFiles(); + for (int i = 1; i <= renderedImageFiles.size(); i++) { + ImageFile renderedImageFile = renderedImageFiles.get(i - 1); + fullPageImages.put(i, renderedImageFile); } - String[] cmdArgs = buildCmdArgs(document, imagePathFormat); + String[] cmdArgs = buildCmdArgs(document, batch.getRenderedImageNameFormat()); return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages); } @@ -76,6 +86,22 @@ public class GhostScriptService { stdOutLogger.start(); stdErrorLogger.start(); + handleFinished(p); + } + + + private void handleFinished(Process p) { + + Thread finishedThread = new Thread(() -> { + try { + p.waitFor(); + } catch (InterruptedException e) { + log.error("GhostScript process was interrupted", e); + } finally { + concurrencySemaphore.release(); + } + }); + finishedThread.start(); } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingPipeline.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingPipeline.java index 06977a1..efc3464 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingPipeline.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingPipeline.java @@ -1,7 +1,5 @@ package com.knecon.fforesight.service.ocr.processor.service.imageprocessing; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.List; import java.util.function.Consumer; @@ -20,9 +18,6 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageProcessingPipeline { - public static final String PROCESSED_DIR = "processed"; - public static final String RENDERED_DIR = "rendered"; - GhostScriptService ghostScriptService; ImageProcessingService imageProcessingService; @@ -30,20 +25,14 @@ public class ImageProcessingPipeline { @SneakyThrows public ImageProcessingSupervisor addToPipeline(PageBatch batch) { - Path processedImageDir = batch.getImagePipelineDir().resolve(PROCESSED_DIR); - Path renderedImageDir = batch.getImagePipelineDir().resolve(RENDERED_DIR); - - Files.createDirectories(renderedImageDir); - Files.createDirectories(processedImageDir); - List pageNumbers = batch.getAllPageNumbers(); ImageProcessingSupervisor supervisor = new ImageProcessingSupervisor(pageNumbers); - Consumer renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, processedImageDir, supervisor); + Consumer renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, batch.getProcessedImageDir(), supervisor); Consumer renderingErrorConsumer = supervisor::markError; - ghostScriptService.startBatchRender(batch, supervisor, renderedImageDir, renderingSuccessConsumer, renderingErrorConsumer); + ghostScriptService.startBatchRender(batch, supervisor, renderingSuccessConsumer, renderingErrorConsumer); return supervisor; } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java index da32bc7..87dc937 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingService.java @@ -1,11 +1,13 @@ package com.knecon.fforesight.service.ocr.processor.service.imageprocessing; +import java.io.File; import java.nio.file.Path; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.model.ImageFile; import lombok.AccessLevel; @@ -23,9 +25,10 @@ import net.sourceforge.lept4j.util.LeptUtils; public class ImageProcessingService { BlockingQueue queue = new LinkedBlockingQueue<>(); + private final OcrServiceSettings ocrServiceSettings; - public ImageProcessingService() { + public ImageProcessingService(OcrServiceSettings ocrServiceSettings) { Thread queueConsumerThread = new Thread(() -> { while (true) { @@ -44,7 +47,7 @@ public class ImageProcessingService { } }); queueConsumerThread.start(); - + this.ocrServiceSettings = ocrServiceSettings; } @@ -58,24 +61,35 @@ public class ImageProcessingService { private void process(ImageFile unprocessedImage, Path outputDir, ImageProcessingSupervisor supervisor) { String absoluteFilePath = outputDir.resolve(Path.of(unprocessedImage.absoluteFilePath()).getFileName()).toFile().toString(); - ImageFile imageFile = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath); + ImageFile processedImage = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath); + + if (ocrServiceSettings.isUseCaches() && processedImage.exists()) { + supervisor.markPageFinished(processedImage); + return; + } + try { + if (!unprocessedImage.exists()) { + log.error("ERROR, rendered image {} does not exist", unprocessedImage.absoluteFilePath()); + throw new AssertionError(); + } synchronized (ImageProcessingSupervisor.class) { // Leptonica is not thread safe, but is being called in WritableOcrResultFactory as well Pix processedPix; Pix pix = unprocessedImage.readPix(); + assert pix != null; + processedPix = processPix(pix); - Leptonica1.pixWrite(absoluteFilePath, processedPix, ILeptonica.IFF_TIFF_PACKBITS); + Leptonica1.pixWrite(processedImage.absoluteFilePath(), processedPix, ILeptonica.IFF_TIFF_PACKBITS); LeptUtils.disposePix(pix); LeptUtils.disposePix(processedPix); - } } catch (Exception e) { supervisor.markError(e.getMessage()); } finally { - supervisor.markPageFinished(imageFile); + supervisor.markPageFinished(processedImage); } } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/OsUtils.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/OsUtils.java index efa87dd..a63cbbf 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/OsUtils.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/OsUtils.java @@ -1,25 +1,26 @@ package com.knecon.fforesight.service.ocr.processor.utils; -import org.apache.commons.lang3.StringUtils; +import java.util.Locale; import lombok.experimental.UtilityClass; @UtilityClass public final class OsUtils { - private static final String SERVICE_NAME = "azure-ocr-service"; - - private static boolean isWindows() { - return StringUtils.containsIgnoreCase(System.getProperty("os.name"), "Windows"); + String osName = System.getProperty("os.name"); + if (osName == null) { + return false; + } + return osName.toLowerCase(Locale.ENGLISH).contains("windows"); } public static String getTemporaryDirectory() { String tmpdir = System.getProperty("java.io.tmpdir"); - if (isWindows() && StringUtils.isNotBlank(tmpdir)) { + if (isWindows() && !tmpdir.isBlank()) { return tmpdir; } return "/tmp"; diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpLayerFactory.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpLayerFactory.java index 2e0f680..3bc2a42 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpLayerFactory.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/IdpLayerFactory.java @@ -65,10 +65,6 @@ public class IdpLayerFactory { analyzeResult.getTables() .forEach(documentTable -> idpLayer.addTable(documentTable, pageOffset)); } - if (analyzeResult.getLists() != null) { - analyzeResult.getLists() - .forEach(list -> idpLayer.addList(list, pageOffset)); - } if (analyzeResult.getKeyValuePairs() != null) { analyzeResult.getKeyValuePairs() .forEach(keyValue -> idpLayer.addKeyValue(keyValue, pageOffset)); diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java index 258fcb0..8ca934f 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/layers/LayerFactory.java @@ -14,7 +14,7 @@ import com.knecon.fforesight.service.ocr.processor.service.OcrExecutionSuperviso import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline; import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult; -import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory; +import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline; import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature; import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup; @@ -25,7 +25,7 @@ import lombok.experimental.FieldDefaults; public class LayerFactory { OcrExecutionSupervisor supervisor; - WritableOcrResultFactory writableOcrResultFactory; + OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline; IdpLayerFactory idpLayerFactory; OcrDebugLayerFactory ocrDebugLayerFactory; OcrTextLayerFactory ocrTextLayerFactory; @@ -40,8 +40,8 @@ public class LayerFactory { Map pageInformation, ImageProcessingPipeline imageProcessingPipeline) { - this.writableOcrResultFactory = new WritableOcrResultFactory(pageInformation, imageProcessingPipeline, settings, features); - this.idpLayerFactory = new IdpLayerFactory(writableOcrResultFactory.getResultToPageTransforms()); + this.ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(pageInformation, imageProcessingPipeline, settings, features); + this.idpLayerFactory = new IdpLayerFactory(ocrResultPostProcessingPipeline.getResultToPageTransforms()); this.ocrDebugLayerFactory = new OcrDebugLayerFactory(); this.ocrTextLayerFactory = new OcrTextLayerFactory(); this.settings = settings; @@ -53,7 +53,7 @@ public class LayerFactory { public void processAnalyzeResult(PageBatch batch, AnalyzeResult analyzeResult) throws InterruptedException { - List results = writableOcrResultFactory.buildOcrResultToWrite(analyzeResult, batch); + List results = ocrResultPostProcessingPipeline.processAnalyzeResult(analyzeResult, batch); results.forEach(result -> angles.put(result.getPageNumber(), result.getAngle())); diff --git a/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/ImageProcessingPipelineTest.java b/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/ImageProcessingPipelineTest.java index 5c79d77..0c6f832 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/ImageProcessingPipelineTest.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/ImageProcessingPipelineTest.java @@ -38,8 +38,9 @@ class ImageProcessingPipelineTest { new NativeLibrariesInitializer("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a").init(); - ImageProcessingService imageProcessingService = new ImageProcessingService(); - GhostScriptService ghostScriptService = new GhostScriptService(); + OcrServiceSettings settings = new OcrServiceSettings(); + ImageProcessingService imageProcessingService = new ImageProcessingService(settings); + GhostScriptService ghostScriptService = new GhostScriptService(settings); imageProcessingPipeline = new ImageProcessingPipeline(ghostScriptService, imageProcessingService); } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/SnugBoxesTest.java b/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/SnugBoxesTest.java index 1a9f079..a2adafe 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/SnugBoxesTest.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/SnugBoxesTest.java @@ -1,17 +1,12 @@ package com.knecon.fforesight.service.ocr.processor.service; -import static com.knecon.fforesight.service.ocr.processor.service.OCRService.IMAGE_PIPELINE_DIR; -import static com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline.PROCESSED_DIR; - import java.awt.Color; import java.awt.geom.AffineTransform; import java.awt.geom.Line2D; -import java.awt.geom.Rectangle2D; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; @@ -32,7 +27,6 @@ import com.knecon.fforesight.service.ocr.processor.model.ImageFile; import com.knecon.fforesight.service.ocr.processor.model.PageInformation; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult; -import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory; import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrDebugLayerFactory; import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility; import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint; @@ -44,15 +38,8 @@ import com.pdftron.pdf.ElementBuilder; import com.pdftron.pdf.ElementWriter; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; -import com.sun.jna.Memory; -import com.sun.jna.Native; -import com.sun.jna.Pointer; -import com.sun.jna.ptr.PointerByReference; import lombok.SneakyThrows; -import net.sourceforge.lept4j.Box; -import net.sourceforge.lept4j.Boxa; -import net.sourceforge.lept4j.util.LeptUtils; @Disabled // leptonica is not available in build server public class SnugBoxesTest { @@ -60,9 +47,8 @@ public class SnugBoxesTest { public static final int PAGE_NUMBER = 41; public static final Path ORIGIN_FILE = Path.of("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf"); public static final Path TEST_FOLDER = Path.of("/tmp/OCR_TEST/").resolve(ORIGIN_FILE.getFileName()); - public static final Path PROCESSED_FOLDER = TEST_FOLDER.resolve(IMAGE_PIPELINE_DIR).resolve(PROCESSED_DIR); - public static final Path DESTINATION_FILE = TEST_FOLDER.resolve("SnugBoxesTest.pdf"); - public static final Path RESULT_FILE = TEST_FOLDER.resolve(IMAGE_PIPELINE_DIR).resolve("azure_result_0.json"); + public static final Path BATCH_FOLDER = TEST_FOLDER.resolve("batch_0"); + public static final Path DESTINATION_FILE = BATCH_FOLDER.resolve("SnugBoxesTest.pdf"); PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null); @@ -79,24 +65,24 @@ public class SnugBoxesTest { public void snugBoxes() { String filePath = ORIGIN_FILE.toFile().toString(); - File file = PROCESSED_FOLDER.resolve("output_0.%04d.tiff".formatted(PAGE_NUMBER)).toFile(); + File file = new File(filePath); assert file.exists(); ImageFile imageFile = new ImageFile(PAGE_NUMBER, file.toString()); AnalyzeResult result = null; - try (var in = new FileInputStream(RESULT_FILE.toFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) { + try (var in = new FileInputStream(BATCH_FOLDER.resolve("analyzeResult.json").toFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) { result = AnalyzeResult.fromJson(reader); } var resultPage = result.getPages() .get(PAGE_NUMBER - 1); - WritableOcrResultFactory writableOcrResultFactory = new WritableOcrResultFactory(null, null, new OcrServiceSettings(), Set.of()); + OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(null, null, new OcrServiceSettings(), Set.of()); OcrDebugLayerFactory debugLayerFactory = new OcrDebugLayerFactory(); InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService(); try (var in = new FileInputStream(ORIGIN_FILE.toFile()); var out = new FileOutputStream(DESTINATION_FILE.toFile())) { invisibleElementRemovalService.removeInvisibleElements(in, out, false); } PageInformation pageInformation = getPageInformation(PAGE_NUMBER, DESTINATION_FILE.toFile().toString()); - WritableOcrResultFactory.Lookups empty = WritableOcrResultFactory.Lookups.empty(); + OcrResultPostProcessingPipeline.Lookups empty = OcrResultPostProcessingPipeline.Lookups.empty(); AffineTransform pageCtm = getPageCtm(PAGE_NUMBER, filePath, resultPage.getWidth()); // pageCtm.preConcatenate(rotationCorrection); @@ -117,7 +103,7 @@ public class SnugBoxesTest { // - List words = writableOcrResultFactory.buildTextWithSnugBBoxes(resultPage, imageFile, pageCtm, empty, pageInformation); + List words = ocrResultPostProcessingPipeline.buildTextWithSnugBBoxes(resultPage, imageFile, pageCtm, empty, pageInformation); var results = new WritableOcrResult(PAGE_NUMBER, -resultPage.getAngle(), words, Collections.emptyList()); debugLayerFactory.addAnalysisResult(List.of(results)); @@ -231,7 +217,7 @@ public class SnugBoxesTest { @SneakyThrows private static AffineTransform getPageCtm(int pageNumber, String file, double imageWidh) { - return WritableOcrResultFactory.buildResultToPageTransform(getPageInformation(pageNumber, file), imageWidh); + return OcrResultPostProcessingPipeline.buildResultToPageTransform(getPageInformation(pageNumber, file), imageWidh); } diff --git a/azure-ocr-service/azure-ocr-service-server/build.gradle.kts b/azure-ocr-service/azure-ocr-service-server/build.gradle.kts index 9138cf7..d273ad8 100644 --- a/azure-ocr-service/azure-ocr-service-server/build.gradle.kts +++ b/azure-ocr-service/azure-ocr-service-server/build.gradle.kts @@ -31,12 +31,17 @@ dependencies { implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1") implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}") + implementation("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0") + implementation("com.knecon.fforesight:tenant-commons:0.31.0") + implementation("com.iqser.red.commons:storage-commons:2.50.0") + implementation("net.logstash.logback:logstash-logback-encoder:7.4") implementation("ch.qos.logback:logback-classic") testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}") testImplementation("com.iqser.red.commons:test-commons:2.1.0") testImplementation("org.springframework.amqp:spring-rabbit-test:3.0.2") + testImplementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0") } tasks.named("bootBuildImage") { diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/Application.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/Application.java index 79dd03f..89464cb 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/Application.java +++ b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/Application.java @@ -9,11 +9,9 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Import; import org.springframework.scheduling.annotation.EnableAsync; -import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; -import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService; +import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration; import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration; -import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration; import io.micrometer.core.aop.TimedAspect; @@ -43,17 +41,4 @@ public class Application { } - @Bean - public InvisibleElementRemovalService invisibleElementRemovalService() { - - return new InvisibleElementRemovalService(); - } - - - @Bean - public WatermarkRemovalService watermarkRemovalService() { - - return new WatermarkRemovalService(); - } - } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/FileStorageService.java similarity index 97% rename from azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java rename to azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/FileStorageService.java index 5128421..3a53cf9 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/FileStorageService.java +++ b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/FileStorageService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.ocr.processor.service; +package com.knecon.fforesight.service.ocr.v1.server; import java.io.File; import java.io.FileInputStream; diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantMessagingConfigurationImpl.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantMessagingConfigurationImpl.java deleted file mode 100644 index 25804fb..0000000 --- a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantMessagingConfigurationImpl.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.knecon.fforesight.service.ocr.v1.server.configuration; - -import org.springframework.context.annotation.Configuration; - -import com.knecon.fforesight.tenantcommons.queue.TenantMessagingConfiguration; - -@Configuration -public class TenantMessagingConfigurationImpl extends TenantMessagingConfiguration { - - -} diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantQueueProviderConfig.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantQueueProviderConfig.java new file mode 100644 index 0000000..e774183 --- /dev/null +++ b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/configuration/TenantQueueProviderConfig.java @@ -0,0 +1,32 @@ +package com.knecon.fforesight.service.ocr.v1.server.configuration; + +import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_DLQ; +import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_EXCHANGE; +import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_QUEUE_PREFIX; + +import java.util.Map; +import java.util.Set; + +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +import com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver; +import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration; +import com.knecon.fforesight.tenantcommons.model.TenantQueueProvider; + +@Configuration +public class TenantQueueProviderConfig { + + @Bean + protected TenantQueueProvider getTenantQueueConfigs() { + + return new TenantQueueProvider(Set.of(TenantQueueConfiguration.builder() + .listenerId(OcrMessageReceiver.OCR_REQUEST_LISTENER_ID) + .exchangeName(OCR_REQUEST_EXCHANGE) + .queuePrefix(OCR_REQUEST_QUEUE_PREFIX) + .dlqName(OCR_DLQ) + .arguments(Map.of("x-max-priority", 2)) + .build())); + } + +} diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java index a70c724..03ae27d 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java +++ b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageReceiver.java @@ -2,7 +2,6 @@ package com.knecon.fforesight.service.ocr.v1.server.queue; import java.io.File; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.time.OffsetDateTime; import java.time.temporal.ChronoUnit; @@ -16,7 +15,8 @@ import org.springframework.stereotype.Service; import org.springframework.util.FileSystemUtils; import com.fasterxml.jackson.databind.ObjectMapper; -import com.knecon.fforesight.service.ocr.processor.service.FileStorageService; +import com.knecon.fforesight.service.ocr.processor.utils.OsUtils; +import com.knecon.fforesight.service.ocr.v1.server.FileStorageService; import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender; import com.knecon.fforesight.service.ocr.processor.service.OCRService; import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest; @@ -52,7 +52,7 @@ public class OcrMessageReceiver { DocumentRequest request = objectMapper.readValue(in.getBody(), DocumentRequest.class); String dossierId = request.getDossierId(); String fileId = request.getFileId(); - Path tmpDir = Files.createTempDirectory(null); + Path runDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(request.getDossierId()).resolve(request.getFileId()); try { MDC.put("fileId", fileId); @@ -60,13 +60,13 @@ public class OcrMessageReceiver { ocrMessageSender.sendOCRStarted(fileId); - File documentFile = tmpDir.resolve("document.pdf").toFile(); - File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile(); - File analyzeResultFile = tmpDir.resolve("azureAnalysisResult.json").toFile(); + File documentFile = runDir.resolve("document.pdf").toFile(); + File viewerDocumentFile = runDir.resolve("viewerDocument.pdf").toFile(); + File analyzeResultFile = runDir.resolve("azureAnalysisResult.json").toFile(); fileStorageService.downloadFiles(request, documentFile); - ocrService.runOcrOnDocument(dossierId, fileId, request.getFeatures(), tmpDir, documentFile, viewerDocumentFile, analyzeResultFile); + ocrService.runOcrOnDocument(dossierId, fileId, request.getFeatures(), runDir, documentFile, viewerDocumentFile, analyzeResultFile); fileStorageService.storeFiles(request, documentFile, viewerDocumentFile, analyzeResultFile); @@ -79,7 +79,7 @@ public class OcrMessageReceiver { } finally { log.info("Done"); MDC.remove("fileId"); - FileSystemUtils.deleteRecursively(tmpDir); + FileSystemUtils.deleteRecursively(runDir); } } diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageSender.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageSender.java index 2f86d1d..0b694e7 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageSender.java +++ b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/OcrMessageSender.java @@ -29,7 +29,6 @@ public class OcrMessageSender implements IOcrMessageSender { rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE, TenantContext.getTenantId(), - OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(totalImages).ocrFinished(true).build()); } @@ -38,7 +37,6 @@ public class OcrMessageSender implements IOcrMessageSender { rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE, TenantContext.getTenantId(), - OCRStatusUpdateResponse.builder().fileId(fileId).ocrStarted(true).build()); } @@ -48,7 +46,6 @@ public class OcrMessageSender implements IOcrMessageSender { rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE, TenantContext.getTenantId(), - OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(finishedImages).build()); } diff --git a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/TenantExchangeMessageReceiverImpl.java b/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/TenantExchangeMessageReceiverImpl.java deleted file mode 100644 index 7e9bc76..0000000 --- a/azure-ocr-service/azure-ocr-service-server/src/main/java/com/knecon/fforesight/service/ocr/v1/server/queue/TenantExchangeMessageReceiverImpl.java +++ /dev/null @@ -1,70 +0,0 @@ -package com.knecon.fforesight.service.ocr.v1.server.queue; - -import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_DLQ; -import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_EXCHANGE; -import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_QUEUE_PREFIX; - -import java.util.Map; -import java.util.Set; - -import org.springframework.amqp.rabbit.annotation.RabbitHandler; -import org.springframework.amqp.rabbit.annotation.RabbitListener; -import org.springframework.boot.context.event.ApplicationReadyEvent; -import org.springframework.context.event.EventListener; -import org.springframework.stereotype.Service; - -import com.knecon.fforesight.tenantcommons.TenantProvider; -import com.knecon.fforesight.tenantcommons.model.TenantCreatedEvent; -import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration; -import com.knecon.fforesight.tenantcommons.model.TenantResponse; -import com.knecon.fforesight.tenantcommons.queue.RabbitQueueFromExchangeService; -import com.knecon.fforesight.tenantcommons.queue.TenantExchangeMessageReceiver; - -@Service -public class TenantExchangeMessageReceiverImpl extends TenantExchangeMessageReceiver { - - public TenantExchangeMessageReceiverImpl(RabbitQueueFromExchangeService rabbitQueueService, TenantProvider tenantProvider) { - - super(rabbitQueueService, tenantProvider); - } - - - @Override - protected Set getTenantQueueConfigs() { - - return Set.of(TenantQueueConfiguration.builder() - .listenerId(OcrMessageReceiver.OCR_REQUEST_LISTENER_ID) - .exchangeName(OCR_REQUEST_EXCHANGE) - .queuePrefix(OCR_REQUEST_QUEUE_PREFIX) - .dlqName(OCR_DLQ) - .arguments(Map.of("x-max-priority", 2)) - .build()); - } - - - @EventListener(ApplicationReadyEvent.class) - public void onApplicationReady() { - - System.out.println("application ready invoked"); - super.initializeQueues(); - } - - - @RabbitHandler - @RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantCreatedQueueName()}") - public void reactToTenantCreation(TenantCreatedEvent tenantCreatedEvent) { - - super.reactToTenantCreation(tenantCreatedEvent); - } - - - @RabbitHandler - @RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantDeletedQueueName()}") - public void reactToTenantDeletion(TenantResponse tenantResponse) { - - super.reactToTenantDeletion(tenantResponse); - - } - -} - diff --git a/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java b/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java index 9b323fa..af1b7a1 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java +++ b/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java @@ -1,6 +1,9 @@ package com.knecon.fforesight.service.ocr.v1.server; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; @@ -10,6 +13,7 @@ import org.mockito.MockitoAnnotations; import org.mockito.junit.jupiter.MockitoExtension; import org.springframework.amqp.rabbit.core.RabbitAdmin; import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.amqp.rabbit.listener.MessageListenerContainer; import org.springframework.amqp.rabbit.listener.RabbitListenerEndpointRegistry; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.autoconfigure.EnableAutoConfiguration; @@ -57,11 +61,9 @@ public class AbstractTest { @MockBean private RabbitAdmin rabbitAdmin; - @MockBean - private RabbitListenerEndpointRegistry rabbitListenerEndpointRegistry; - private static String pdftronLicense; + @BeforeEach public void openMocks() { @@ -107,6 +109,16 @@ public class AbstractTest { @ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)}) public static class TestConfiguration { + @Bean + public RabbitListenerEndpointRegistry rabbitListenerEndpointRegistry() { + + var mock = mock(RabbitListenerEndpointRegistry.class); + when(mock.getListenerContainer(any())).thenReturn(mock(MessageListenerContainer.class)); + + return mock; + } + + @Bean @Primary public StorageService inMemoryStorage() { diff --git a/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 21ef3c9..684de3f 100644 --- a/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/azure-ocr-service/azure-ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -25,8 +25,8 @@ import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature; import lombok.SneakyThrows; -@Disabled // in order to run, the azure.key must be set first in the application.yml and you must set the env variable VCPKG_DYNAMIC_LIB to your tesseract and leptonica installation folder +@Disabled @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -55,7 +55,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcrWithFile() { - testOCR("/home/kschuettler/Dokumente/LayoutparsingEvaluation/RAW_FILES/Difficult Headlines/VV-284053.pdf/VV-284053.pdf.ORIGIN.pdf"); + testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340.pdf"); }