From 6fe95c6940a3877c11afd900f4b7d8915d5ee911 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Tue, 28 Nov 2023 10:04:56 +0100 Subject: [PATCH] RED-7669: optimize OCR-module performance * don't interrupt threads, use a boolean flag instead --- .../processor/service/GhostScriptService.java | 4 +- .../processor/service/OcrImageFactory.java | 4 +- .../service/threads/BlockingQueueFiller.java | 44 ++++++------- .../threads/ImageProcessingThread.java | 64 ++++++++----------- .../settings/OcrServiceSettings.java | 1 + 5 files changed, 53 insertions(+), 64 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java index a767f91..cc2c84e 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java @@ -55,7 +55,7 @@ public class GhostScriptService { Statistics stats) { BlockingQueue imageFileCollectorQueue = new LinkedBlockingDeque<>(); - Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue); + BlockingQueueFiller asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue); asyncTransferThread.start(); int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size()); @@ -86,7 +86,7 @@ public class GhostScriptService { stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp); log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx); } - asyncTransferThread.interrupt(); + asyncTransferThread.setAllImagesQueued(true); } diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java index 2e913e2..0bf3f7a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java @@ -94,8 +94,8 @@ public class OcrImageFactory { if (!stitchedPageNumbers.isEmpty()) { ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats); } - imageProcessingThread.interrupt(); - log.info("All images extracted, interrupting processing thread."); + + imageProcessingThread.setAllImagesExtracted(true); imageProcessingThread.join(); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java index 554e190..b46a1fe 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import java.util.ArrayList; import java.util.List; +import java.util.NoSuchElementException; import java.util.concurrent.BlockingQueue; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; @@ -10,6 +11,7 @@ import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import lombok.AccessLevel; import 
lombok.RequiredArgsConstructor; +import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; @@ -21,41 +23,35 @@ This just moves the Elements from the GhostScriptOutputListener into the ImagePr */ @Slf4j @RequiredArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +@FieldDefaults(level = AccessLevel.PRIVATE) public class BlockingQueueFiller extends Thread { - BlockingQueue imageInputQueue; - BlockingQueue imageOutputQueue; + final BlockingQueue imageInputQueue; + final BlockingQueue imageOutputQueue; + + @Setter + boolean allImagesQueued; + @SneakyThrows @Override public void run() { // Interrupting signals that the image extraction has finished - while (true) { - try { + while (!allImagesQueued) { final UnprocessedImage image = imageInputQueue.take(); - try { - imageOutputQueue.put(image); - } catch (InterruptedException e) { - imageOutputQueue.put(image); - break; - } - - } catch (InterruptedException e) { - break; - } - } - // empty the queue - List remainingImages = new ArrayList<>(imageInputQueue.size()); - imageInputQueue.drainTo(remainingImages); - remainingImages.forEach(image -> { - try { imageOutputQueue.put(image); - } catch (InterruptedException e) { - log.error(e.getMessage()); + } + + // empty the queue + try { + while (true) { + final UnprocessedImage image = imageInputQueue.remove(); + imageOutputQueue.put(image); } - }); + } catch (NoSuchElementException e) { + log.debug("No images left in queue, stopping."); + } } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java index 5dfe1eb..6fec116 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -6,6 +6,7 @@ import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.util.ArrayList; import java.util.List; +import java.util.NoSuchElementException; import java.util.concurrent.BlockingQueue; import org.apache.pdfbox.pdmodel.PDDocument; @@ -24,6 +25,7 @@ import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; +import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; @@ -37,48 +39,40 @@ import net.sourceforge.tess4j.TessAPI1; */ @Slf4j @RequiredArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +@FieldDefaults(level = AccessLevel.PRIVATE) public class ImageProcessingThread extends Thread { - BlockingQueue imageInputQueue; - BlockingQueue imageOutputQueue; - ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); - Statistics stats; - OcrServiceSettings settings; - PDDocument document; + final BlockingQueue imageInputQueue; + final BlockingQueue imageOutputQueue; + final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); + final Statistics stats; + final OcrServiceSettings settings; + final PDDocument document; + + @Setter + boolean allImagesExtracted; @SneakyThrows @Override public void run() { - // Interrupting signals that the image extraction has finished - while (true) { - try { - final UnprocessedImage image = imageInputQueue.take(); - OcrImage extractedOcrImage = this.process(image); - try { - imageOutputQueue.put(extractedOcrImage); - } catch (InterruptedException e) { - imageOutputQueue.put(extractedOcrImage); - break; - } - - } catch (InterruptedException e) { - break; - } + 
while (!allImagesExtracted) { + final UnprocessedImage image = imageInputQueue.take(); + var ocrImage = this.process(image); + imageOutputQueue.put(ocrImage); } - // empty the queue - List remainingImages = new ArrayList<>(imageInputQueue.size()); - imageInputQueue.drainTo(remainingImages); - remainingImages.forEach(image -> { - OcrImage ocrImage = this.process(image); - try { + + + try { + while (true) { + final UnprocessedImage image = imageInputQueue.remove(); + OcrImage ocrImage = this.process(image); imageOutputQueue.put(ocrImage); - } catch (InterruptedException e) { - log.error(e.getMessage()); } - }); + } catch (NoSuchElementException e) { + log.debug("No images left in processing queue, stopping."); + } TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); } @@ -150,9 +144,7 @@ public class ImageProcessingThread extends Thread { } - - - static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) { + public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) { TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix); TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi); @@ -173,7 +165,7 @@ public class ImageProcessingThread extends Thread { orientationDegreeConfidenceBuffer, scriptureNameBuffer, scriptureConfidenceBuffer); - if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) { + if (result == TRUE && orientationDegreeConfidenceBuffer.get() > settings.getMinRotationConfidence()) { orientationDegree = orientationDegreeResultBuffer.get(); } @@ -198,7 +190,7 @@ public class ImageProcessingThread extends Thread { ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); String datapath = System.getenv("TESSDATA_PREFIX"); TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); - + TessAPI1.TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); return handle; } diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index d8e3665..227c9c3 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -19,6 +19,7 @@ public class OcrServiceSettings { int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed int minImageWidth = 20; // Minimum width for images to be processed + float minRotationConfidence = 2; // boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes boolean removeWatermark; // If true, watermarks will be removed String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment