Merge branch 'RED-7669' into 'master'

RED-7669: optimize OCR-module performance

Closes RED-7669

See merge request redactmanager/ocr-service!24
This commit is contained in:
Kilian Schüttler 2023-11-28 12:35:22 +01:00
commit 65d818200f
5 changed files with 53 additions and 64 deletions

View File

@ -55,7 +55,7 @@ public class GhostScriptService {
Statistics stats) {
BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
BlockingQueueFiller asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
asyncTransferThread.start();
int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());
@ -86,7 +86,7 @@ public class GhostScriptService {
stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx);
}
asyncTransferThread.interrupt();
asyncTransferThread.setAllImagesQueued(true);
}

View File

@ -94,8 +94,8 @@ public class OcrImageFactory {
if (!stitchedPageNumbers.isEmpty()) {
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
}
imageProcessingThread.interrupt();
log.info("All images extracted, interrupting processing thread.");
imageProcessingThread.setAllImagesExtracted(true);
imageProcessingThread.join();

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
@ -10,6 +11,7 @@ import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@ -21,41 +23,35 @@ This just moves the Elements from the GhostScriptOutputListener into the ImagePr
*/
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class BlockingQueueFiller extends Thread {
BlockingQueue<RenderedPageImageFile> imageInputQueue;
BlockingQueue<UnprocessedImage> imageOutputQueue;
final BlockingQueue<RenderedPageImageFile> imageInputQueue;
final BlockingQueue<UnprocessedImage> imageOutputQueue;
@Setter
boolean allImagesQueued;
@SneakyThrows
@Override
public void run() {
// Interrupting signals that the image extraction has finished
while (true) {
try {
while (!allImagesQueued) {
final UnprocessedImage image = imageInputQueue.take();
try {
imageOutputQueue.put(image);
} catch (InterruptedException e) {
imageOutputQueue.put(image);
break;
}
} catch (InterruptedException e) {
break;
}
}
// empty the queue
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
imageInputQueue.drainTo(remainingImages);
remainingImages.forEach(image -> {
try {
imageOutputQueue.put(image);
} catch (InterruptedException e) {
log.error(e.getMessage());
}
// empty the queue
try {
while (true) {
final UnprocessedImage image = imageInputQueue.remove();
imageOutputQueue.put(image);
}
});
} catch (NoSuchElementException e) {
log.debug("No images left in queue, stopping.");
}
}
}

View File

@ -6,6 +6,7 @@ import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.pdmodel.PDDocument;
@ -24,6 +25,7 @@ import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@ -37,48 +39,40 @@ import net.sourceforge.tess4j.TessAPI1;
*/
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageProcessingThread extends Thread {
BlockingQueue<UnprocessedImage> imageInputQueue;
BlockingQueue<OcrImage> imageOutputQueue;
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
Statistics stats;
OcrServiceSettings settings;
PDDocument document;
final BlockingQueue<UnprocessedImage> imageInputQueue;
final BlockingQueue<OcrImage> imageOutputQueue;
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
final Statistics stats;
final OcrServiceSettings settings;
final PDDocument document;
@Setter
boolean allImagesExtracted;
@SneakyThrows
@Override
public void run() {
// Interrupting signals that the image extraction has finished
while (true) {
try {
final UnprocessedImage image = imageInputQueue.take();
OcrImage extractedOcrImage = this.process(image);
try {
imageOutputQueue.put(extractedOcrImage);
} catch (InterruptedException e) {
imageOutputQueue.put(extractedOcrImage);
break;
}
} catch (InterruptedException e) {
break;
}
while (!allImagesExtracted) {
final UnprocessedImage image = imageInputQueue.take();
var ocrImage = this.process(image);
imageOutputQueue.put(ocrImage);
}
// empty the queue
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
imageInputQueue.drainTo(remainingImages);
remainingImages.forEach(image -> {
OcrImage ocrImage = this.process(image);
try {
try {
while (true) {
final UnprocessedImage image = imageInputQueue.remove();
OcrImage ocrImage = this.process(image);
imageOutputQueue.put(ocrImage);
} catch (InterruptedException e) {
log.error(e.getMessage());
}
});
} catch (NoSuchElementException e) {
log.debug("No images left in processing queue, stopping.");
}
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
}
@ -150,9 +144,7 @@ public class ImageProcessingThread extends Thread {
}
static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);
@ -173,7 +165,7 @@ public class ImageProcessingThread extends Thread {
orientationDegreeConfidenceBuffer,
scriptureNameBuffer,
scriptureConfidenceBuffer);
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) {
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > settings.getMinRotationConfidence()) {
orientationDegree = orientationDegreeResultBuffer.get();
}
@ -198,7 +190,7 @@ public class ImageProcessingThread extends Thread {
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
String datapath = System.getenv("TESSDATA_PREFIX");
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
TessAPI1.TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
return handle;
}

View File

@ -19,6 +19,7 @@ public class OcrServiceSettings {
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
float minRotationConfidence = 2; // Orientation detection confidence must exceed this value for a detected page rotation to be applied
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean removeWatermark; // If true, watermarks will be removed
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment