Merge branch 'RED-10477' into 'main'
RED-10477: update api version. See merge request fforesight/azure-ocr-service!19
This commit is contained in:
commit
eac6a49100
@ -10,19 +10,18 @@ configurations {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
api(project(":azure-ocr-service-api"))
|
||||
api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
|
||||
api("net.sourceforge.tess4j:tess4j:5.8.0")
|
||||
api("com.iqser.red.commons:metric-commons:2.1.0")
|
||||
api("com.iqser.red.commons:storage-commons:2.49.0")
|
||||
api("com.knecon.fforesight:tenant-commons:0.30.0")
|
||||
api("com.pdftron:PDFNet:10.7.0")
|
||||
api("org.apache.pdfbox:pdfbox:3.0.0")
|
||||
api("org.apache.commons:commons-math3:3.6.1")
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.177.0")
|
||||
api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
||||
implementation(project(":azure-ocr-service-api"))
|
||||
implementation("net.sourceforge.tess4j:tess4j:5.8.0")
|
||||
implementation("com.iqser.red.commons:metric-commons:2.1.0")
|
||||
implementation("com.pdftron:PDFNet:11.0.0")
|
||||
implementation("org.apache.pdfbox:pdfbox:3.0.0")
|
||||
implementation("org.apache.commons:commons-math3:3.6.1")
|
||||
implementation("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
implementation("com.google.guava:guava:31.1-jre")
|
||||
implementation("com.knecon.fforesight:viewer-doc-processor:0.193.0")
|
||||
implementation("com.azure:azure-ai-documentintelligence:1.0.0-beta.4")
|
||||
|
||||
implementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
||||
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
}
|
||||
|
||||
@ -6,6 +6,8 @@ import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
@ -22,4 +24,18 @@ public class OcrServiceProcessorConfiguration {
|
||||
return new PDFTronViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public InvisibleElementRemovalService invisibleElementRemovalService() {
|
||||
|
||||
return new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public WatermarkRemovalService watermarkRemovalService() {
|
||||
|
||||
return new WatermarkRemovalService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -11,14 +11,16 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class OcrServiceSettings {
|
||||
|
||||
// Limits the number of concurrent calls to the azure API. In my very rudimentary testing, azure starts throwing "too many requests" errors at around 80/s. Higher numbers greatly improve the speed.
|
||||
int concurrency = 8;
|
||||
// Limits the number of concurrent calls to azure
|
||||
int concurrency = 2;
|
||||
// Limits the number of pages per call.
|
||||
int batchSize = 128;
|
||||
int batchSize = 32;
|
||||
|
||||
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
|
||||
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
|
||||
boolean snuggify = true;
|
||||
boolean snuggify = true; // attempts to shrink the word boxes returned by azure to fit the actual word pixels snug
|
||||
boolean useCaches = true; // skips azure api, pdf rendering and image processing, when the files are already present
|
||||
boolean azureFontStyleDetection; // omits all image processing and uses azures FONT_STYLE feature (costs 0.6ct per page)
|
||||
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@ -10,4 +12,10 @@ public record ImageFile(int pageNumber, String absoluteFilePath) {
|
||||
return Leptonica1.pixRead(absoluteFilePath);
|
||||
}
|
||||
|
||||
|
||||
public boolean exists() {
|
||||
|
||||
return new File(absoluteFilePath).exists();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,16 +2,25 @@ package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.core.util.BinaryData;
|
||||
import com.azure.json.JsonOptions;
|
||||
import com.azure.json.JsonReader;
|
||||
import com.azure.json.implementation.DefaultJsonReader;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.GhostScriptService;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@ -21,23 +30,101 @@ public final class PageBatch implements Comparable<PageBatch> {
|
||||
@Getter
|
||||
int index;
|
||||
@NonNull
|
||||
List<Integer> lookup;
|
||||
List<Integer> batchPageToOriginPageLookup;
|
||||
@NonNull
|
||||
@Getter
|
||||
Path batchDoc;
|
||||
@NonNull
|
||||
@Getter
|
||||
Path imagePipelineDir;
|
||||
Path batchDir;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public AnalyzeResult getAzureResultCache() {
|
||||
|
||||
try (var in = new FileInputStream(getAzureResultCacheFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) {
|
||||
return AnalyzeResult.fromJson(reader);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public File getAzureResultCacheFile() {
|
||||
|
||||
return batchDir.resolve("analyzeResult.json").toFile();
|
||||
}
|
||||
|
||||
|
||||
public List<ImageFile> getRenderedImageFiles() {
|
||||
|
||||
List<ImageFile> renderedImageFiles = new ArrayList<>();
|
||||
for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) {
|
||||
renderedImageFiles.add(getRenderedImageFile(batchPageToOriginPageLookup.get(i), i + 1));
|
||||
}
|
||||
return renderedImageFiles;
|
||||
}
|
||||
|
||||
|
||||
public ImageFile getRenderedImageFile(int pageNumber, int numberInBatch) {
|
||||
|
||||
return new ImageFile(pageNumber, getRenderedImageNameFormat().formatted(numberInBatch));
|
||||
}
|
||||
|
||||
|
||||
public ImageFile getProcessedImageFile(int pageNumber, int numberInBatch) {
|
||||
|
||||
return new ImageFile(pageNumber, getProcessedImageNameFormat().formatted(numberInBatch));
|
||||
}
|
||||
|
||||
|
||||
public List<ImageFile> getProcessedImageFiles() {
|
||||
|
||||
List<ImageFile> processedImageFiles = new ArrayList<>();
|
||||
for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) {
|
||||
processedImageFiles.add(getProcessedImageFile(batchPageToOriginPageLookup.get(i), i + 1));
|
||||
}
|
||||
return processedImageFiles;
|
||||
}
|
||||
|
||||
|
||||
public String getRenderedImageNameFormat() {
|
||||
|
||||
return getRenderedImageDir().resolve(getImageFormat()).toFile().toString();
|
||||
}
|
||||
|
||||
|
||||
public String getProcessedImageNameFormat() {
|
||||
|
||||
return getProcessedImageDir().resolve(getImageFormat()).toFile().toString();
|
||||
}
|
||||
|
||||
|
||||
private String getImageFormat() {
|
||||
|
||||
return "output_" + index + ".%04d" + GhostScriptService.FORMAT;
|
||||
}
|
||||
|
||||
|
||||
public Path getRenderedImageDir() {
|
||||
|
||||
return batchDir.resolve("rendered");
|
||||
}
|
||||
|
||||
|
||||
public Path getProcessedImageDir() {
|
||||
|
||||
return batchDir.resolve("processed");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
if (size() == 1) {
|
||||
return String.format("%d", lookup.get(0));
|
||||
return String.format("%d", batchPageToOriginPageLookup.get(0));
|
||||
}
|
||||
|
||||
List<String> intervals = formatIntervals(lookup);
|
||||
List<String> intervals = formatIntervals(batchPageToOriginPageLookup);
|
||||
if (intervals.size() > 4) {
|
||||
intervals = intervals.subList(0, 4);
|
||||
intervals.add("...");
|
||||
@ -49,46 +136,46 @@ public final class PageBatch implements Comparable<PageBatch> {
|
||||
|
||||
public void forEach(Consumer<? super Integer> consumer) {
|
||||
|
||||
lookup.forEach(consumer);
|
||||
batchPageToOriginPageLookup.forEach(consumer);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> getAllPageNumbers() {
|
||||
|
||||
return lookup;
|
||||
return batchPageToOriginPageLookup;
|
||||
}
|
||||
|
||||
|
||||
public int size() {
|
||||
|
||||
return lookup.size();
|
||||
return batchPageToOriginPageLookup.size();
|
||||
}
|
||||
|
||||
|
||||
public boolean isEmpty() {
|
||||
|
||||
return lookup.isEmpty();
|
||||
return batchPageToOriginPageLookup.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public int getPageNumber(int pageNumber) {
|
||||
|
||||
return lookup.get(pageNumber - 1);
|
||||
return batchPageToOriginPageLookup.get(pageNumber - 1);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(PageBatch o) {
|
||||
|
||||
if (lookup.isEmpty() && o.lookup.isEmpty()) {
|
||||
if (batchPageToOriginPageLookup.isEmpty() && o.batchPageToOriginPageLookup.isEmpty()) {
|
||||
return 0;
|
||||
} else if (lookup.isEmpty()) {
|
||||
} else if (batchPageToOriginPageLookup.isEmpty()) {
|
||||
return 1;
|
||||
} else if (o.lookup.isEmpty()) {
|
||||
} else if (o.batchPageToOriginPageLookup.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return Integer.compare(lookup.get(0), o.lookup.get(0));
|
||||
return Integer.compare(batchPageToOriginPageLookup.get(0), o.batchPageToOriginPageLookup.get(0));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -162,7 +162,7 @@ public class Statistics {
|
||||
|
||||
return batchStats.values()
|
||||
.stream()
|
||||
.mapToLong(BatchStats::getWritingTextDuration)
|
||||
.mapToLong(BatchStats::getMappingResultDuration)
|
||||
.toArray();
|
||||
}
|
||||
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import org.slf4j.MDC;
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -22,6 +23,7 @@ import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import reactor.core.publisher.Mono;
|
||||
@ -67,6 +69,10 @@ public class AsyncOcrService {
|
||||
|
||||
private void beginAnalysis(BinaryData data, BatchContext batchContext, Set<AzureOcrFeature> features) throws InterruptedException {
|
||||
|
||||
if (settings.isUseCaches() && batchContext.batch().getAzureResultCacheFile().exists()) {
|
||||
handleCached(batchContext);
|
||||
}
|
||||
|
||||
batchContext.supervisor.enterConcurrency(batchContext.batch);
|
||||
|
||||
batchContext.supervisor.logUploadStart(batchContext.batch, data.getLength());
|
||||
@ -85,13 +91,36 @@ public class AsyncOcrService {
|
||||
if (LongRunningOperationStatus.SUCCESSFULLY_COMPLETED == response.getStatus()) {
|
||||
return response.getFinalResult();
|
||||
}
|
||||
return Mono.error(new IllegalStateException("Polling completed unsuccessfully with status: " + response.getStatus()));
|
||||
String message = "Polling completed unsuccessfully with status: " + response.getStatus();
|
||||
log.error(message);
|
||||
return Mono.error(new IllegalStateException(message));
|
||||
}).subscribe(finalResult -> handleSuccessful(finalResult, batchContext),//
|
||||
ex -> handleError(ex, batchContext),//
|
||||
() -> handleCompleted(batchContext));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void handleCached(BatchContext batchContext) {
|
||||
|
||||
var mdcContext = MDC.getCopyOfContextMap();
|
||||
Thread thread = new Thread(() -> {
|
||||
MDC.setContextMap(mdcContext);
|
||||
log.info("Batch {}: Using cached ocr result", batchContext.batch.getIndex());
|
||||
batchContext.batchStats().finishUpload();
|
||||
batchContext.batchStats().finishApiWait();
|
||||
batchContext.supervisor.logPageSuccess(batchContext.batch());
|
||||
try {
|
||||
batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), batchContext.batch().getAzureResultCache());
|
||||
} catch (InterruptedException e) {
|
||||
batchContext.supervisor.logPageError(batchContext.batch, e);
|
||||
}
|
||||
|
||||
});
|
||||
thread.start();
|
||||
}
|
||||
|
||||
|
||||
private static void handleCompleted(BatchContext batchContext) {
|
||||
|
||||
log.info("Completed batch {} with pages {}", batchContext.batch.getIndex(), batchContext.batch);
|
||||
@ -109,10 +138,8 @@ public class AsyncOcrService {
|
||||
|
||||
batchContext.supervisor.leaveConcurrency(batchContext.batch);
|
||||
try {
|
||||
mapper.writeValue(batchContext.batch().getAzureResultCacheFile(), finalResult);
|
||||
batchContext.supervisor.logPageSuccess(batchContext.batch());
|
||||
if (settings.isDebug()) {
|
||||
mapper.writeValue(batchContext.batch().getImagePipelineDir().resolve("azure_result_%d.json" .formatted(batchContext.batch().getIndex())).toFile(), finalResult);
|
||||
}
|
||||
batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), finalResult);
|
||||
} catch (Exception e) {
|
||||
handleError(e, batchContext);
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@ -56,6 +57,7 @@ public class AzureOcrResource {
|
||||
buildFeatures(features),
|
||||
null,
|
||||
buildContentFormat(),
|
||||
Collections.emptyList(),
|
||||
analyzeRequest);
|
||||
|
||||
}
|
||||
@ -86,6 +88,9 @@ public class AzureOcrResource {
|
||||
if (features.contains(AzureOcrFeature.IDP)) {
|
||||
azureFeatures.add(DocumentAnalysisFeature.KEY_VALUE_PAIRS);
|
||||
}
|
||||
if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
|
||||
azureFeatures.add(DocumentAnalysisFeature.STYLE_FONT);
|
||||
}
|
||||
azureFeatures.add(DocumentAnalysisFeature.BARCODES);
|
||||
|
||||
return azureFeatures;
|
||||
|
||||
@ -1,7 +1,12 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@ -18,9 +23,10 @@ import com.pdftron.sdf.SDFDoc;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@ -29,26 +35,39 @@ public class BatchFactory {
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
public static String formatBatchFilename(int number) {
|
||||
@SneakyThrows
|
||||
public List<PageBatch> splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<AzureOcrFeature> features, Path runDir) {
|
||||
|
||||
return "batch_%d.pdf".formatted(number);
|
||||
Set<Integer> pagesToProcess = findPagesToProcess(pdfDoc, features);
|
||||
supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesToProcess.size());
|
||||
|
||||
List<PageBatch> batches = buildBatches(pdfDoc, supervisor, features, runDir, pagesToProcess);
|
||||
if (batches.size() > 1) {
|
||||
log.info("Split {} pages to process into {} batches", pagesToProcess.size(), batches.size());
|
||||
}
|
||||
return batches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<PageBatch> splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<AzureOcrFeature> features, Path fileDir) {
|
||||
public Set<Integer> findPagesToProcess(PDFDoc pdfDoc, Set<AzureOcrFeature> features) {
|
||||
|
||||
Set<Integer> pagesWithImages = ImageDetectionService.findPagesToProcess(pdfDoc, features);
|
||||
supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesWithImages.size());
|
||||
if (features.contains(AzureOcrFeature.ALL_PAGES)) {
|
||||
Set<Integer> pages = new HashSet<>();
|
||||
for (int i = 1; i <= pdfDoc.getPageCount(); i++) {
|
||||
pages.add(i);
|
||||
}
|
||||
return Collections.unmodifiableSet(pages);
|
||||
}
|
||||
|
||||
return buildBatches(pdfDoc, supervisor, features, fileDir, pagesWithImages);
|
||||
return ImageDetectionService.findPagesWithImages(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
public List<PageBatch> buildBatches(PDFDoc pdfDoc,
|
||||
OcrExecutionSupervisor supervisor,
|
||||
Set<AzureOcrFeature> features,
|
||||
Path fileDir,
|
||||
Path runDir,
|
||||
Set<Integer> pagesWithImages) throws PDFNetException {
|
||||
|
||||
List<PageBatch> batches = new ArrayList<>();
|
||||
@ -60,30 +79,48 @@ public class BatchFactory {
|
||||
}
|
||||
numbersForCurrentBatch.add(pageNumber);
|
||||
if (numbersForCurrentBatch.size() == settings.getBatchSize()) {
|
||||
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, fileDir));
|
||||
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir));
|
||||
numbersForCurrentBatch = new ArrayList<>();
|
||||
}
|
||||
}
|
||||
if (!numbersForCurrentBatch.isEmpty()) {
|
||||
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, fileDir));
|
||||
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir));
|
||||
}
|
||||
return batches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static PageBatch create(int number, PDFDoc pdfDoc, List<Integer> pageNumbers, Path fileDir) {
|
||||
public static PageBatch create(int number, PDFDoc pdfDoc, List<Integer> pageNumbers, Path runDir) {
|
||||
|
||||
if (pageNumbers.isEmpty()) {
|
||||
throw new IllegalArgumentException("pageNumbers must not be empty");
|
||||
}
|
||||
Path batchDir = formatBatchDir(number, pageNumbers, runDir);
|
||||
Files.createDirectories(batchDir);
|
||||
|
||||
Path batchDocPath = fileDir.resolve(formatBatchFilename(number));
|
||||
Path batchDocPath = batchDir.resolve("batch.pdf");
|
||||
try (var batchDoc = extractBatchDocument(pdfDoc, pageNumbers)) {
|
||||
Optimizer.optimize(batchDoc);
|
||||
batchDoc.save(batchDocPath.toFile().toString(), SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
return new PageBatch(number, pageNumbers, batchDocPath, fileDir);
|
||||
PageBatch batch = new PageBatch(number, pageNumbers, batchDocPath, batchDir);
|
||||
Files.createDirectories(batch.getRenderedImageDir());
|
||||
Files.createDirectories(batch.getProcessedImageDir());
|
||||
return batch;
|
||||
}
|
||||
|
||||
|
||||
private static Path formatBatchDir(int number, List<Integer> pageNumbers, Path runDir) {
|
||||
|
||||
List<String> intervals = formatIntervals(pageNumbers);
|
||||
if (intervals.size() > 4) {
|
||||
intervals = intervals.subList(0, 4);
|
||||
intervals.add("...");
|
||||
}
|
||||
|
||||
String batchName = String.join(", ", intervals);
|
||||
return runDir.resolve("batch_%04d_%s".formatted(number, batchName));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ public class BatchStats {
|
||||
|
||||
private long apiWaitTimestamp = -1;
|
||||
private long imageUploadTimestamp = -1;
|
||||
private long writingTextTimestamp = -1;
|
||||
private long mappingResultTimestamp = -1;
|
||||
private long batchRenderTimestamp = -1;
|
||||
|
||||
|
||||
@ -38,9 +38,9 @@ public class BatchStats {
|
||||
}
|
||||
|
||||
|
||||
public void finishWritingText() {
|
||||
public void finishMappingResult() {
|
||||
|
||||
writingTextTimestamp = System.currentTimeMillis();
|
||||
mappingResultTimestamp = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
|
||||
@ -50,15 +50,33 @@ public class BatchStats {
|
||||
}
|
||||
|
||||
|
||||
public boolean isApiWaitFinished() {
|
||||
|
||||
return apiWaitTimestamp > 0;
|
||||
}
|
||||
|
||||
|
||||
public boolean isMappingResultFinished() {
|
||||
|
||||
return mappingResultTimestamp > 0;
|
||||
}
|
||||
|
||||
|
||||
public boolean isBatchRenderFinished() {
|
||||
|
||||
return batchRenderTimestamp > 0;
|
||||
}
|
||||
|
||||
|
||||
public long getApiWaitDuration() {return this.apiWaitTimestamp - imageUploadTimestamp;}
|
||||
|
||||
|
||||
public long getImageUploadDuration() {return this.imageUploadTimestamp - batchRenderTimestamp;}
|
||||
|
||||
|
||||
public long getWritingTextDuration() {return this.writingTextTimestamp - apiWaitTimestamp;}
|
||||
public long getMappingResultDuration() {return this.mappingResultTimestamp - apiWaitTimestamp;}
|
||||
|
||||
|
||||
public long getBatchRenderDuration() {return this.batchRenderTimestamp - startTimestamp;}
|
||||
public long getBatchRenderDuration() {return startTimestamp - this.batchRenderTimestamp;}
|
||||
|
||||
}
|
||||
|
||||
@ -24,21 +24,7 @@ public class ImageDetectionService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Set<Integer> findPagesToProcess(PDFDoc pdfDoc, Set<AzureOcrFeature> features) {
|
||||
|
||||
if (features.contains(AzureOcrFeature.ALL_PAGES)) {
|
||||
Set<Integer> pages = new HashSet<>();
|
||||
for (int i = 1; i <= pdfDoc.getPageCount(); i++) {
|
||||
pages.add(i);
|
||||
}
|
||||
return Collections.unmodifiableSet(pages);
|
||||
}
|
||||
|
||||
return findPagesWithImages(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
private Set<Integer> findPagesWithImages(PDFDoc pdfDoc) throws PDFNetException {
|
||||
public Set<Integer> findPagesWithImages(PDFDoc pdfDoc) {
|
||||
|
||||
Set<Integer> pagesWithImages = new HashSet<>();
|
||||
try (ElementReader reader = new ElementReader()) {
|
||||
|
||||
@ -38,8 +38,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OCRService {
|
||||
|
||||
public static final String IMAGE_PIPELINE_DIR = "image_pipeline";
|
||||
public static final String AZURE_OUTPUT_DIR = "azure_output";
|
||||
IOcrMessageSender ocrMessageSender;
|
||||
WatermarkRemovalService watermarkRemovalService;
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
@ -120,7 +118,7 @@ public class OCRService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public OcrExecutionSupervisor runOcr(Path tmpDir,
|
||||
public OcrExecutionSupervisor runOcr(Path runDir,
|
||||
File documentFile,
|
||||
File viewerDocumentFile,
|
||||
String fileId,
|
||||
@ -128,12 +126,6 @@ public class OCRService {
|
||||
File analyzeResultFile,
|
||||
Set<AzureOcrFeature> features) {
|
||||
|
||||
Path tmpImageDir = tmpDir.resolve(IMAGE_PIPELINE_DIR);
|
||||
Path azureOutputDir = tmpDir.resolve(AZURE_OUTPUT_DIR);
|
||||
|
||||
Files.createDirectories(azureOutputDir);
|
||||
Files.createDirectories(tmpImageDir);
|
||||
|
||||
try (var in = new FileInputStream(documentFile); PDFDoc pdfDoc = new PDFDoc(in)) {
|
||||
|
||||
OCGWatermarkRemovalService.removeWatermarks(pdfDoc);
|
||||
@ -141,7 +133,7 @@ public class OCRService {
|
||||
OcrExecutionSupervisor supervisor = new OcrExecutionSupervisor(pdfDoc.getPageCount(), ocrMessageSender, fileId, settings);
|
||||
supervisor.getStatistics().setStart();
|
||||
|
||||
List<PageBatch> batches = batchFactory.splitIntoBatches(pdfDoc, supervisor, features, tmpImageDir);
|
||||
List<PageBatch> batches = batchFactory.splitIntoBatches(pdfDoc, supervisor, features, runDir);
|
||||
|
||||
OcrResult ocrResult = asyncOcrService.awaitOcr(pdfDoc, supervisor, features, batches);
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import static com.knecon.fforesight.service.ocr.processor.model.Statistics.human
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
@ -65,10 +66,10 @@ public class OcrExecutionSupervisor {
|
||||
}
|
||||
|
||||
|
||||
public void logImageExtractionFinished(int numberOfPages, int numberOfImages) {
|
||||
public void logImageExtractionFinished(int numberOfPages, int numberOfPagesToProcess) {
|
||||
|
||||
statistics.imageExtractionFinished();
|
||||
log.info("Images found on {}/{} pages in {}", numberOfImages, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration()));
|
||||
log.info("Images found on {}/{} pages in {}", numberOfPagesToProcess, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration()));
|
||||
}
|
||||
|
||||
|
||||
@ -95,7 +96,7 @@ public class OcrExecutionSupervisor {
|
||||
public void finishMappingResult(PageBatch batch) {
|
||||
|
||||
batch.forEach(pageIndex -> countDownPagesToProcess.countDown());
|
||||
statistics.getBatchStats(batch).finishWritingText();
|
||||
statistics.getBatchStats(batch).finishMappingResult();
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount());
|
||||
}
|
||||
|
||||
@ -127,7 +128,29 @@ public class OcrExecutionSupervisor {
|
||||
|
||||
private int processedPages() {
|
||||
|
||||
return (int) (totalPageCount - countDownPagesToProcess.getCount());
|
||||
if (countDownPagesToProcess.getCount() == 0) {
|
||||
return totalPageCount;
|
||||
}
|
||||
int processedPages = 0;
|
||||
for (Map.Entry<PageBatch, BatchStats> entry : statistics.getBatchStats().entrySet()) {
|
||||
PageBatch pageBatch = entry.getKey();
|
||||
BatchStats batchStats = entry.getValue();
|
||||
float percentage = 0;
|
||||
if (batchStats.isBatchRenderFinished()) {
|
||||
percentage += 0.1f;
|
||||
}
|
||||
if (batchStats.isUploadFinished()) {
|
||||
percentage += 0.3f;
|
||||
}
|
||||
if (batchStats.isApiWaitFinished()) {
|
||||
percentage += 0.3f;
|
||||
}
|
||||
if (batchStats.isMappingResultFinished()) {
|
||||
percentage += 0.3f;
|
||||
}
|
||||
processedPages += (int) (pageBatch.size() * percentage);
|
||||
}
|
||||
return processedPages;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations;
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
@ -37,6 +37,7 @@ import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.Image
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.BBoxSnuggificationService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.StrokeWidthCalculator;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.Type0FontMetricsProvider;
|
||||
@ -56,7 +57,7 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Slf4j
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class WritableOcrResultFactory {
|
||||
public class OcrResultPostProcessingPipeline {
|
||||
|
||||
@Getter
|
||||
Map<Integer, AffineTransform> resultToPageTransforms;
|
||||
@ -67,10 +68,10 @@ public class WritableOcrResultFactory {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public WritableOcrResultFactory(Map<Integer, PageInformation> pageInformation,
|
||||
ImageProcessingPipeline imageProcessingPipeline,
|
||||
OcrServiceSettings settings,
|
||||
Set<AzureOcrFeature> features) {
|
||||
public OcrResultPostProcessingPipeline(Map<Integer, PageInformation> pageInformation,
|
||||
ImageProcessingPipeline imageProcessingPipeline,
|
||||
OcrServiceSettings settings,
|
||||
Set<AzureOcrFeature> features) {
|
||||
|
||||
this.imageProcessingPipeline = imageProcessingPipeline;
|
||||
this.pageInformation = pageInformation;
|
||||
@ -80,14 +81,9 @@ public class WritableOcrResultFactory {
|
||||
}
|
||||
|
||||
|
||||
public List<WritableOcrResult> buildOcrResultToWrite(AnalyzeResult analyzeResult, PageBatch batch) throws InterruptedException {
|
||||
public List<WritableOcrResult> processAnalyzeResult(AnalyzeResult analyzeResult, PageBatch batch) throws InterruptedException {
|
||||
|
||||
Map<Integer, Double> anglesPerPage = analyzeResult.getPages()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(DocumentPage::getPageNumber, documentPage -> -documentPage.getAngle()));
|
||||
RotationCorrectionUtility.rotatePages(batch.getBatchDoc(), batch.getBatchDoc(), anglesPerPage);
|
||||
|
||||
ImageProcessingSupervisor imageSupervisor = imageProcessingPipeline.addToPipeline(batch);
|
||||
ImageProcessingSupervisor imageSupervisor = renderImagesIfNecessary(analyzeResult, batch);
|
||||
|
||||
List<WritableOcrResult> writableOcrResultList = new ArrayList<>();
|
||||
|
||||
@ -114,6 +110,30 @@ public class WritableOcrResultFactory {
|
||||
}
|
||||
|
||||
|
||||
private ImageProcessingSupervisor renderImagesIfNecessary(AnalyzeResult analyzeResult, PageBatch batch) {
|
||||
|
||||
ImageProcessingSupervisor imageSupervisor = null;
|
||||
if (useRenderedImages()) {
|
||||
|
||||
Map<Integer, Double> anglesPerPage = analyzeResult.getPages()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(DocumentPage::getPageNumber, documentPage -> -documentPage.getAngle()));
|
||||
RotationCorrectionUtility.rotatePages(batch.getBatchDoc(), batch.getBatchDoc(), anglesPerPage);
|
||||
imageSupervisor = imageProcessingPipeline.addToPipeline(batch);
|
||||
}
|
||||
return imageSupervisor;
|
||||
}
|
||||
|
||||
|
||||
private boolean useRenderedImages() {
|
||||
|
||||
if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
|
||||
return false;
|
||||
}
|
||||
return settings.isSnuggify() || features.contains(AzureOcrFeature.FONT_STYLE_DETECTION);
|
||||
}
|
||||
|
||||
|
||||
private List<TextPositionInImage> buildTextPositionsInImage(PageBatch pageOffset,
|
||||
DocumentPage resultPage,
|
||||
AffineTransform resultToPageTransform,
|
||||
@ -121,7 +141,7 @@ public class WritableOcrResultFactory {
|
||||
PageInformation pageInformation,
|
||||
ImageProcessingSupervisor imageSupervisor) throws InterruptedException {
|
||||
|
||||
if (!settings.isSnuggify() && !features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
|
||||
if (!useRenderedImages()) {
|
||||
return buildText(resultPage, resultToPageTransform, lookups, pageInformation);
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ import java.util.Optional;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.DocumentPage;
|
||||
import com.azure.ai.documentintelligence.models.DocumentWord;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.sun.jna.Pointer;
|
||||
@ -58,7 +58,7 @@ public class BBoxSnuggificationService {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
Pix wordImage = WritableOcrResultFactory.extractWordImage(originTransformed, pageImage);
|
||||
Pix wordImage = OcrResultPostProcessingPipeline.extractWordImage(originTransformed, pageImage);
|
||||
|
||||
if (wordImage == null) {
|
||||
log.debug("Unable to extract word image! wordImage: {}, pageImage {}", originTransformed.getBounds2D(), new Rectangle2D.Float(0, 0, pageImage.w, pageImage.h));
|
||||
|
||||
@ -15,6 +15,7 @@ import java.util.regex.Pattern;
|
||||
import org.slf4j.MDC;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -5,10 +5,12 @@ import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
|
||||
@ -25,35 +27,43 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 72/74
|
||||
public class GhostScriptService {
|
||||
|
||||
static String FORMAT = ".tiff";
|
||||
private OcrServiceSettings ocrServiceSettings;
|
||||
public static String FORMAT = ".tiff";
|
||||
static String DEVICE = "tiffgray";
|
||||
static int DPI = 300;
|
||||
private Semaphore concurrencySemaphore = new Semaphore(3);
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void startBatchRender(PageBatch batch, ImageProcessingSupervisor supervisor, Path renderedImageDir, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
|
||||
public void startBatchRender(PageBatch batch, ImageProcessingSupervisor supervisor, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
|
||||
|
||||
supervisor.requireNoErrors();
|
||||
|
||||
List<ImageFile> renderedImageFiles = batch.getRenderedImageFiles();
|
||||
if (ocrServiceSettings.isUseCaches() && renderedImageFiles.stream()
|
||||
.allMatch(ImageFile::exists)) {
|
||||
log.info("Batch {}: Using cached GhostScript rendering with page(s) {}", batch.getIndex(), batch);
|
||||
renderedImageFiles.forEach(successHandler);
|
||||
return;
|
||||
}
|
||||
|
||||
concurrencySemaphore.acquire();
|
||||
log.info("Batch {}: starting GhostScript rendering with page(s) {}", batch.getIndex(), batch);
|
||||
executeProcess(batch.getIndex(), buildCmdArgs(batch, renderedImageDir, batch.getBatchDoc()), successHandler, errorHandler);
|
||||
executeProcess(batch.getIndex(), buildCmdArgs(batch, batch.getBatchDoc()), successHandler, errorHandler);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(PageBatch batch, Path outputDir, Path document) {
|
||||
|
||||
String imagePathFormat = outputDir.resolve("output_" + batch.getIndex() + ".%04d" + FORMAT).toFile().toString();
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(PageBatch batch, Path document) {
|
||||
|
||||
Map<Integer, ImageFile> fullPageImages = new HashMap<>();
|
||||
List<Integer> allPageNumbers = batch.getAllPageNumbers();
|
||||
|
||||
for (int i = 0; i < allPageNumbers.size(); i++) {
|
||||
Integer pageNumber = allPageNumbers.get(i);
|
||||
fullPageImages.put(i + 1, new ImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
List<ImageFile> renderedImageFiles = batch.getRenderedImageFiles();
|
||||
for (int i = 1; i <= renderedImageFiles.size(); i++) {
|
||||
ImageFile renderedImageFile = renderedImageFiles.get(i - 1);
|
||||
fullPageImages.put(i, renderedImageFile);
|
||||
}
|
||||
|
||||
String[] cmdArgs = buildCmdArgs(document, imagePathFormat);
|
||||
String[] cmdArgs = buildCmdArgs(document, batch.getRenderedImageNameFormat());
|
||||
|
||||
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
|
||||
}
|
||||
@ -76,6 +86,22 @@ public class GhostScriptService {
|
||||
|
||||
stdOutLogger.start();
|
||||
stdErrorLogger.start();
|
||||
handleFinished(p);
|
||||
}
|
||||
|
||||
|
||||
private void handleFinished(Process p) {
|
||||
|
||||
Thread finishedThread = new Thread(() -> {
|
||||
try {
|
||||
p.waitFor();
|
||||
} catch (InterruptedException e) {
|
||||
log.error("GhostScript process was interrupted", e);
|
||||
} finally {
|
||||
concurrencySemaphore.release();
|
||||
}
|
||||
});
|
||||
finishedThread.start();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@ -20,9 +18,6 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageProcessingPipeline {
|
||||
|
||||
public static final String PROCESSED_DIR = "processed";
|
||||
public static final String RENDERED_DIR = "rendered";
|
||||
|
||||
GhostScriptService ghostScriptService;
|
||||
ImageProcessingService imageProcessingService;
|
||||
|
||||
@ -30,20 +25,14 @@ public class ImageProcessingPipeline {
|
||||
@SneakyThrows
|
||||
public ImageProcessingSupervisor addToPipeline(PageBatch batch) {
|
||||
|
||||
Path processedImageDir = batch.getImagePipelineDir().resolve(PROCESSED_DIR);
|
||||
Path renderedImageDir = batch.getImagePipelineDir().resolve(RENDERED_DIR);
|
||||
|
||||
Files.createDirectories(renderedImageDir);
|
||||
Files.createDirectories(processedImageDir);
|
||||
|
||||
List<Integer> pageNumbers = batch.getAllPageNumbers();
|
||||
|
||||
ImageProcessingSupervisor supervisor = new ImageProcessingSupervisor(pageNumbers);
|
||||
|
||||
Consumer<ImageFile> renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, processedImageDir, supervisor);
|
||||
Consumer<ImageFile> renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, batch.getProcessedImageDir(), supervisor);
|
||||
Consumer<String> renderingErrorConsumer = supervisor::markError;
|
||||
|
||||
ghostScriptService.startBatchRender(batch, supervisor, renderedImageDir, renderingSuccessConsumer, renderingErrorConsumer);
|
||||
ghostScriptService.startBatchRender(batch, supervisor, renderingSuccessConsumer, renderingErrorConsumer);
|
||||
|
||||
return supervisor;
|
||||
}
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -23,9 +25,10 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
public class ImageProcessingService {
|
||||
|
||||
BlockingQueue<ProcessParams> queue = new LinkedBlockingQueue<>();
|
||||
private final OcrServiceSettings ocrServiceSettings;
|
||||
|
||||
|
||||
public ImageProcessingService() {
|
||||
public ImageProcessingService(OcrServiceSettings ocrServiceSettings) {
|
||||
|
||||
Thread queueConsumerThread = new Thread(() -> {
|
||||
while (true) {
|
||||
@ -44,7 +47,7 @@ public class ImageProcessingService {
|
||||
}
|
||||
});
|
||||
queueConsumerThread.start();
|
||||
|
||||
this.ocrServiceSettings = ocrServiceSettings;
|
||||
}
|
||||
|
||||
|
||||
@ -58,24 +61,35 @@ public class ImageProcessingService {
|
||||
private void process(ImageFile unprocessedImage, Path outputDir, ImageProcessingSupervisor supervisor) {
|
||||
|
||||
String absoluteFilePath = outputDir.resolve(Path.of(unprocessedImage.absoluteFilePath()).getFileName()).toFile().toString();
|
||||
ImageFile imageFile = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath);
|
||||
ImageFile processedImage = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath);
|
||||
|
||||
if (ocrServiceSettings.isUseCaches() && processedImage.exists()) {
|
||||
supervisor.markPageFinished(processedImage);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!unprocessedImage.exists()) {
|
||||
log.error("ERROR, rendered image {} does not exist", unprocessedImage.absoluteFilePath());
|
||||
throw new AssertionError();
|
||||
}
|
||||
synchronized (ImageProcessingSupervisor.class) {
|
||||
// Leptonica is not thread safe, but is being called in WritableOcrResultFactory as well
|
||||
Pix processedPix;
|
||||
Pix pix = unprocessedImage.readPix();
|
||||
|
||||
assert pix != null;
|
||||
|
||||
processedPix = processPix(pix);
|
||||
Leptonica1.pixWrite(absoluteFilePath, processedPix, ILeptonica.IFF_TIFF_PACKBITS);
|
||||
Leptonica1.pixWrite(processedImage.absoluteFilePath(), processedPix, ILeptonica.IFF_TIFF_PACKBITS);
|
||||
|
||||
LeptUtils.disposePix(pix);
|
||||
LeptUtils.disposePix(processedPix);
|
||||
|
||||
}
|
||||
} catch (Exception e) {
|
||||
supervisor.markError(e.getMessage());
|
||||
} finally {
|
||||
supervisor.markPageFinished(imageFile);
|
||||
supervisor.markPageFinished(processedImage);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,25 +1,26 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import java.util.Locale;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public final class OsUtils {
|
||||
|
||||
private static final String SERVICE_NAME = "azure-ocr-service";
|
||||
|
||||
|
||||
private static boolean isWindows() {
|
||||
|
||||
return StringUtils.containsIgnoreCase(System.getProperty("os.name"), "Windows");
|
||||
String osName = System.getProperty("os.name");
|
||||
if (osName == null) {
|
||||
return false;
|
||||
}
|
||||
return osName.toLowerCase(Locale.ENGLISH).contains("windows");
|
||||
}
|
||||
|
||||
|
||||
public static String getTemporaryDirectory() {
|
||||
|
||||
String tmpdir = System.getProperty("java.io.tmpdir");
|
||||
if (isWindows() && StringUtils.isNotBlank(tmpdir)) {
|
||||
if (isWindows() && !tmpdir.isBlank()) {
|
||||
return tmpdir;
|
||||
}
|
||||
return "/tmp";
|
||||
|
||||
@ -65,10 +65,6 @@ public class IdpLayerFactory {
|
||||
analyzeResult.getTables()
|
||||
.forEach(documentTable -> idpLayer.addTable(documentTable, pageOffset));
|
||||
}
|
||||
if (analyzeResult.getLists() != null) {
|
||||
analyzeResult.getLists()
|
||||
.forEach(list -> idpLayer.addList(list, pageOffset));
|
||||
}
|
||||
if (analyzeResult.getKeyValuePairs() != null) {
|
||||
analyzeResult.getKeyValuePairs()
|
||||
.forEach(keyValue -> idpLayer.addKeyValue(keyValue, pageOffset));
|
||||
|
||||
@ -14,7 +14,7 @@ import com.knecon.fforesight.service.ocr.processor.service.OcrExecutionSuperviso
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
|
||||
@ -25,7 +25,7 @@ import lombok.experimental.FieldDefaults;
|
||||
public class LayerFactory {
|
||||
|
||||
OcrExecutionSupervisor supervisor;
|
||||
WritableOcrResultFactory writableOcrResultFactory;
|
||||
OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline;
|
||||
IdpLayerFactory idpLayerFactory;
|
||||
OcrDebugLayerFactory ocrDebugLayerFactory;
|
||||
OcrTextLayerFactory ocrTextLayerFactory;
|
||||
@ -40,8 +40,8 @@ public class LayerFactory {
|
||||
Map<Integer, PageInformation> pageInformation,
|
||||
ImageProcessingPipeline imageProcessingPipeline) {
|
||||
|
||||
this.writableOcrResultFactory = new WritableOcrResultFactory(pageInformation, imageProcessingPipeline, settings, features);
|
||||
this.idpLayerFactory = new IdpLayerFactory(writableOcrResultFactory.getResultToPageTransforms());
|
||||
this.ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(pageInformation, imageProcessingPipeline, settings, features);
|
||||
this.idpLayerFactory = new IdpLayerFactory(ocrResultPostProcessingPipeline.getResultToPageTransforms());
|
||||
this.ocrDebugLayerFactory = new OcrDebugLayerFactory();
|
||||
this.ocrTextLayerFactory = new OcrTextLayerFactory();
|
||||
this.settings = settings;
|
||||
@ -53,7 +53,7 @@ public class LayerFactory {
|
||||
|
||||
public void processAnalyzeResult(PageBatch batch, AnalyzeResult analyzeResult) throws InterruptedException {
|
||||
|
||||
List<WritableOcrResult> results = writableOcrResultFactory.buildOcrResultToWrite(analyzeResult, batch);
|
||||
List<WritableOcrResult> results = ocrResultPostProcessingPipeline.processAnalyzeResult(analyzeResult, batch);
|
||||
|
||||
results.forEach(result -> angles.put(result.getPageNumber(), result.getAngle()));
|
||||
|
||||
|
||||
@ -38,8 +38,9 @@ class ImageProcessingPipelineTest {
|
||||
|
||||
new NativeLibrariesInitializer("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a").init();
|
||||
|
||||
ImageProcessingService imageProcessingService = new ImageProcessingService();
|
||||
GhostScriptService ghostScriptService = new GhostScriptService();
|
||||
OcrServiceSettings settings = new OcrServiceSettings();
|
||||
ImageProcessingService imageProcessingService = new ImageProcessingService(settings);
|
||||
GhostScriptService ghostScriptService = new GhostScriptService(settings);
|
||||
imageProcessingPipeline = new ImageProcessingPipeline(ghostScriptService, imageProcessingService);
|
||||
}
|
||||
|
||||
|
||||
@ -1,17 +1,12 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.service.OCRService.IMAGE_PIPELINE_DIR;
|
||||
import static com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline.PROCESSED_DIR;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -32,7 +27,6 @@ import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrDebugLayerFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
@ -44,15 +38,8 @@ import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.sun.jna.Memory;
|
||||
import com.sun.jna.Native;
|
||||
import com.sun.jna.Pointer;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import net.sourceforge.lept4j.Box;
|
||||
import net.sourceforge.lept4j.Boxa;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Disabled // leptonica is not available in build server
|
||||
public class SnugBoxesTest {
|
||||
@ -60,9 +47,8 @@ public class SnugBoxesTest {
|
||||
public static final int PAGE_NUMBER = 41;
|
||||
public static final Path ORIGIN_FILE = Path.of("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf");
|
||||
public static final Path TEST_FOLDER = Path.of("/tmp/OCR_TEST/").resolve(ORIGIN_FILE.getFileName());
|
||||
public static final Path PROCESSED_FOLDER = TEST_FOLDER.resolve(IMAGE_PIPELINE_DIR).resolve(PROCESSED_DIR);
|
||||
public static final Path DESTINATION_FILE = TEST_FOLDER.resolve("SnugBoxesTest.pdf");
|
||||
public static final Path RESULT_FILE = TEST_FOLDER.resolve(IMAGE_PIPELINE_DIR).resolve("azure_result_0.json");
|
||||
public static final Path BATCH_FOLDER = TEST_FOLDER.resolve("batch_0");
|
||||
public static final Path DESTINATION_FILE = BATCH_FOLDER.resolve("SnugBoxesTest.pdf");
|
||||
|
||||
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
|
||||
|
||||
@ -79,24 +65,24 @@ public class SnugBoxesTest {
|
||||
public void snugBoxes() {
|
||||
|
||||
String filePath = ORIGIN_FILE.toFile().toString();
|
||||
File file = PROCESSED_FOLDER.resolve("output_0.%04d.tiff".formatted(PAGE_NUMBER)).toFile();
|
||||
File file = new File(filePath);
|
||||
assert file.exists();
|
||||
ImageFile imageFile = new ImageFile(PAGE_NUMBER, file.toString());
|
||||
AnalyzeResult result = null;
|
||||
try (var in = new FileInputStream(RESULT_FILE.toFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) {
|
||||
try (var in = new FileInputStream(BATCH_FOLDER.resolve("analyzeResult.json").toFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) {
|
||||
result = AnalyzeResult.fromJson(reader);
|
||||
}
|
||||
|
||||
var resultPage = result.getPages()
|
||||
.get(PAGE_NUMBER - 1);
|
||||
WritableOcrResultFactory writableOcrResultFactory = new WritableOcrResultFactory(null, null, new OcrServiceSettings(), Set.of());
|
||||
OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(null, null, new OcrServiceSettings(), Set.of());
|
||||
OcrDebugLayerFactory debugLayerFactory = new OcrDebugLayerFactory();
|
||||
InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
try (var in = new FileInputStream(ORIGIN_FILE.toFile()); var out = new FileOutputStream(DESTINATION_FILE.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
PageInformation pageInformation = getPageInformation(PAGE_NUMBER, DESTINATION_FILE.toFile().toString());
|
||||
WritableOcrResultFactory.Lookups empty = WritableOcrResultFactory.Lookups.empty();
|
||||
OcrResultPostProcessingPipeline.Lookups empty = OcrResultPostProcessingPipeline.Lookups.empty();
|
||||
|
||||
AffineTransform pageCtm = getPageCtm(PAGE_NUMBER, filePath, resultPage.getWidth());
|
||||
// pageCtm.preConcatenate(rotationCorrection);
|
||||
@ -117,7 +103,7 @@ public class SnugBoxesTest {
|
||||
|
||||
//
|
||||
|
||||
List<TextPositionInImage> words = writableOcrResultFactory.buildTextWithSnugBBoxes(resultPage, imageFile, pageCtm, empty, pageInformation);
|
||||
List<TextPositionInImage> words = ocrResultPostProcessingPipeline.buildTextWithSnugBBoxes(resultPage, imageFile, pageCtm, empty, pageInformation);
|
||||
var results = new WritableOcrResult(PAGE_NUMBER, -resultPage.getAngle(), words, Collections.emptyList());
|
||||
debugLayerFactory.addAnalysisResult(List.of(results));
|
||||
|
||||
@ -231,7 +217,7 @@ public class SnugBoxesTest {
|
||||
@SneakyThrows
|
||||
private static AffineTransform getPageCtm(int pageNumber, String file, double imageWidh) {
|
||||
|
||||
return WritableOcrResultFactory.buildResultToPageTransform(getPageInformation(pageNumber, file), imageWidh);
|
||||
return OcrResultPostProcessingPipeline.buildResultToPageTransform(getPageInformation(pageNumber, file), imageWidh);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -31,12 +31,17 @@ dependencies {
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1")
|
||||
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
||||
|
||||
implementation("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.31.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||
|
||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||
implementation("ch.qos.logback:logback-classic")
|
||||
|
||||
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")
|
||||
testImplementation("com.iqser.red.commons:test-commons:2.1.0")
|
||||
testImplementation("org.springframework.amqp:spring-rabbit-test:3.0.2")
|
||||
testImplementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
||||
}
|
||||
|
||||
tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
|
||||
@ -9,11 +9,9 @@ import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||
|
||||
import io.micrometer.core.aop.TimedAspect;
|
||||
@ -43,17 +41,4 @@ public class Application {
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public InvisibleElementRemovalService invisibleElementRemovalService() {
|
||||
|
||||
return new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public WatermarkRemovalService watermarkRemovalService() {
|
||||
|
||||
return new WatermarkRemovalService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
@ -1,11 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.configuration;
|
||||
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.tenantcommons.queue.TenantMessagingConfiguration;
|
||||
|
||||
@Configuration
|
||||
public class TenantMessagingConfigurationImpl extends TenantMessagingConfiguration {
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,32 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.configuration;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_DLQ;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_EXCHANGE;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_QUEUE_PREFIX;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantQueueProvider;
|
||||
|
||||
@Configuration
|
||||
public class TenantQueueProviderConfig {
|
||||
|
||||
@Bean
|
||||
protected TenantQueueProvider getTenantQueueConfigs() {
|
||||
|
||||
return new TenantQueueProvider(Set.of(TenantQueueConfiguration.builder()
|
||||
.listenerId(OcrMessageReceiver.OCR_REQUEST_LISTENER_ID)
|
||||
.exchangeName(OCR_REQUEST_EXCHANGE)
|
||||
.queuePrefix(OCR_REQUEST_QUEUE_PREFIX)
|
||||
.dlqName(OCR_DLQ)
|
||||
.arguments(Map.of("x-max-priority", 2))
|
||||
.build()));
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,7 +2,6 @@ package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
@ -16,7 +15,8 @@ import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||
@ -52,7 +52,7 @@ public class OcrMessageReceiver {
|
||||
DocumentRequest request = objectMapper.readValue(in.getBody(), DocumentRequest.class);
|
||||
String dossierId = request.getDossierId();
|
||||
String fileId = request.getFileId();
|
||||
Path tmpDir = Files.createTempDirectory(null);
|
||||
Path runDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(request.getDossierId()).resolve(request.getFileId());
|
||||
|
||||
try {
|
||||
MDC.put("fileId", fileId);
|
||||
@ -60,13 +60,13 @@ public class OcrMessageReceiver {
|
||||
|
||||
ocrMessageSender.sendOCRStarted(fileId);
|
||||
|
||||
File documentFile = tmpDir.resolve("document.pdf").toFile();
|
||||
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();
|
||||
File analyzeResultFile = tmpDir.resolve("azureAnalysisResult.json").toFile();
|
||||
File documentFile = runDir.resolve("document.pdf").toFile();
|
||||
File viewerDocumentFile = runDir.resolve("viewerDocument.pdf").toFile();
|
||||
File analyzeResultFile = runDir.resolve("azureAnalysisResult.json").toFile();
|
||||
|
||||
fileStorageService.downloadFiles(request, documentFile);
|
||||
|
||||
ocrService.runOcrOnDocument(dossierId, fileId, request.getFeatures(), tmpDir, documentFile, viewerDocumentFile, analyzeResultFile);
|
||||
ocrService.runOcrOnDocument(dossierId, fileId, request.getFeatures(), runDir, documentFile, viewerDocumentFile, analyzeResultFile);
|
||||
|
||||
fileStorageService.storeFiles(request, documentFile, viewerDocumentFile, analyzeResultFile);
|
||||
|
||||
@ -79,7 +79,7 @@ public class OcrMessageReceiver {
|
||||
} finally {
|
||||
log.info("Done");
|
||||
MDC.remove("fileId");
|
||||
FileSystemUtils.deleteRecursively(tmpDir);
|
||||
FileSystemUtils.deleteRecursively(runDir);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -29,7 +29,6 @@ public class OcrMessageSender implements IOcrMessageSender {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
|
||||
TenantContext.getTenantId(),
|
||||
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(totalImages).ocrFinished(true).build());
|
||||
}
|
||||
|
||||
@ -38,7 +37,6 @@ public class OcrMessageSender implements IOcrMessageSender {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
|
||||
TenantContext.getTenantId(),
|
||||
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).ocrStarted(true).build());
|
||||
|
||||
}
|
||||
@ -48,7 +46,6 @@ public class OcrMessageSender implements IOcrMessageSender {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
|
||||
TenantContext.getTenantId(),
|
||||
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(finishedImages).build());
|
||||
|
||||
}
|
||||
|
||||
@ -1,70 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_DLQ;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_EXCHANGE;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_QUEUE_PREFIX;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.boot.context.event.ApplicationReadyEvent;
|
||||
import org.springframework.context.event.EventListener;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.tenantcommons.TenantProvider;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantCreatedEvent;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantResponse;
|
||||
import com.knecon.fforesight.tenantcommons.queue.RabbitQueueFromExchangeService;
|
||||
import com.knecon.fforesight.tenantcommons.queue.TenantExchangeMessageReceiver;
|
||||
|
||||
@Service
|
||||
public class TenantExchangeMessageReceiverImpl extends TenantExchangeMessageReceiver {
|
||||
|
||||
public TenantExchangeMessageReceiverImpl(RabbitQueueFromExchangeService rabbitQueueService, TenantProvider tenantProvider) {
|
||||
|
||||
super(rabbitQueueService, tenantProvider);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected Set<TenantQueueConfiguration> getTenantQueueConfigs() {
|
||||
|
||||
return Set.of(TenantQueueConfiguration.builder()
|
||||
.listenerId(OcrMessageReceiver.OCR_REQUEST_LISTENER_ID)
|
||||
.exchangeName(OCR_REQUEST_EXCHANGE)
|
||||
.queuePrefix(OCR_REQUEST_QUEUE_PREFIX)
|
||||
.dlqName(OCR_DLQ)
|
||||
.arguments(Map.of("x-max-priority", 2))
|
||||
.build());
|
||||
}
|
||||
|
||||
|
||||
@EventListener(ApplicationReadyEvent.class)
|
||||
public void onApplicationReady() {
|
||||
|
||||
System.out.println("application ready invoked");
|
||||
super.initializeQueues();
|
||||
}
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantCreatedQueueName()}")
|
||||
public void reactToTenantCreation(TenantCreatedEvent tenantCreatedEvent) {
|
||||
|
||||
super.reactToTenantCreation(tenantCreatedEvent);
|
||||
}
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = "#{tenantMessagingConfigurationImpl.getTenantDeletedQueueName()}")
|
||||
public void reactToTenantDeletion(TenantResponse tenantResponse) {
|
||||
|
||||
super.reactToTenantDeletion(tenantResponse);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
@ -10,6 +13,7 @@ import org.mockito.MockitoAnnotations;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
import org.springframework.amqp.rabbit.core.RabbitAdmin;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.amqp.rabbit.listener.MessageListenerContainer;
|
||||
import org.springframework.amqp.rabbit.listener.RabbitListenerEndpointRegistry;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
@ -57,11 +61,9 @@ public class AbstractTest {
|
||||
@MockBean
|
||||
private RabbitAdmin rabbitAdmin;
|
||||
|
||||
@MockBean
|
||||
private RabbitListenerEndpointRegistry rabbitListenerEndpointRegistry;
|
||||
|
||||
private static String pdftronLicense;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void openMocks() {
|
||||
|
||||
@ -107,6 +109,16 @@ public class AbstractTest {
|
||||
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
|
||||
public static class TestConfiguration {
|
||||
|
||||
@Bean
|
||||
public RabbitListenerEndpointRegistry rabbitListenerEndpointRegistry() {
|
||||
|
||||
var mock = mock(RabbitListenerEndpointRegistry.class);
|
||||
when(mock.getListenerContainer(any())).thenReturn(mock(MessageListenerContainer.class));
|
||||
|
||||
return mock;
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inMemoryStorage() {
|
||||
|
||||
@ -25,8 +25,8 @@ import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled
|
||||
// in order to run, the azure.key must be set first in the application.yml and you must set the env variable VCPKG_DYNAMIC_LIB to your tesseract and leptonica installation folder
|
||||
@Disabled
|
||||
@SpringBootTest()
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
@ -55,7 +55,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcrWithFile() {
|
||||
|
||||
testOCR("/home/kschuettler/Dokumente/LayoutparsingEvaluation/RAW_FILES/Difficult Headlines/VV-284053.pdf/VV-284053.pdf.ORIGIN.pdf");
|
||||
testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340.pdf");
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user