RED-7669: optimize OCR-module performance

* attempt at thread safety
This commit is contained in:
Kilian Schuettler 2023-11-15 15:07:39 +01:00
parent c556687499
commit 57e194fcd0
4 changed files with 32 additions and 47 deletions

View File

@ -1,20 +1,11 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.IntBuffer;
import java.util.concurrent.Semaphore;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.pdftron.sdf.Obj;
import lombok.AccessLevel;
import lombok.Getter;
@ -23,9 +14,7 @@ import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
@Slf4j
@ -56,25 +45,12 @@ public class ExtractedOcrImage implements OcrImage {
this.originalHeight = bufferedImage.getHeight();
this.originalWidth = bufferedImage.getWidth();
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
this.pix = ImageProcessingUtils.process(bufferedImage, imageDPI, targetDpi);
this.height = pix.h;
this.width = pix.w;
}
@SneakyThrows
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
ImageProcessingUtils.setAlphaChannelToWhite(image);
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
return ImageProcessingUtils.despecklePix(scaledUp);
}
}
@Override
public AffineTransform getImageCTM() {

View File

@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class OcrServiceSettings {
int ocrThreadCount = 4; // Number of OCR threads
int ocrThreadCount = 16; // Number of OCR threads
int imageExtractThreadCount = 2; // Number of image extraction threads
int gsProcessCount = 2; // Number of Ghostscript processes
int gsProcessCount = 5; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed

View File

@ -5,7 +5,6 @@ import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.image.BufferedImage;
import java.io.IOException;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@ -16,27 +15,38 @@ import net.sourceforge.lept4j.util.LeptUtils;
@UtilityClass
public class ImageProcessingUtils {
public static Pix despecklePix(Pix pix) {
public Pix process(BufferedImage image, float imageDpi, int targetDpi) {
assert pix.d == 8;
Pix despeckled;
if (pix.w < 100 || pix.h < 100) {
// too small to properly despeckle, just binarize instead.
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
} else {
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
if (despeckled == null) {
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
}
setAlphaChannelToWhite(image);
return processWithLeptonica(image, imageDpi, targetDpi);
}
// LeptUtils and Leptonica1 do not seem to be thread-safe, so we must ensure synchronization for the image processing.
// There might be a way to get this working multi-threaded, but it does not seem to be a significant runtime factor, so I didn't bother investing the time to dive deeper.
/**
 * Runs the Leptonica-backed steps of the image pipeline in order: grayscale conversion,
 * scaling towards the target DPI, and despeckling.
 *
 * <p>The method is {@code static synchronized}, so concurrent callers are serialized on the
 * monitor of this class.
 *
 * <p>NOTE(review): the {@code binarize} method removed in this commit wrapped the same three
 * steps in {@code synchronized (OCRThread.class)} with a comment about coordinating mallocs
 * with tesseract. This method locks on a different monitor, so it does not exclude code that
 * synchronizes on {@code OCRThread.class} — confirm that coordination is no longer required.
 *
 * @param image     the image to convert; handed to {@code convertToGrayScale}
 * @param imageDpi  DPI of the input image; handed to {@code scaleToTargetDpi}
 * @param targetDpi desired DPI of the result; handed to {@code scaleToTargetDpi}
 * @return the {@link Pix} produced by {@code despecklePix}
 */
synchronized private static Pix processWithLeptonica(BufferedImage image, float imageDpi, int targetDpi) {
// Each step consumes the Pix produced by the previous one.
Pix grayScale = convertToGrayScale(image);
Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale);
return despecklePix(scaledUp);
}
private static Pix despecklePix(Pix scaledUp) {
assert scaledUp.d == 8;
Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3);
if (despeckled == null) { // sometimes despeckle fails, and I wasn't able to figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with too small images, not sure though...
despeckled = Leptonica1.pixThresholdToBinary(scaledUp, 180);
}
if (pix != despeckled) {
LeptUtils.disposePix(pix);
if (scaledUp != despeckled) {
LeptUtils.disposePix(scaledUp);
}
return despeckled;
}
public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
float targetFactor = targetDpi / imageDpi;
@ -57,7 +67,7 @@ public class ImageProcessingUtils {
@SneakyThrows
public static Pix convertToGrayScale(BufferedImage image) {
private static Pix convertToGrayScale(BufferedImage image) {
Pix pix = LeptUtils.convertImageToPix(image);
if (pix.d == 8) {
@ -74,7 +84,7 @@ public class ImageProcessingUtils {
}
public static void setAlphaChannelToWhite(BufferedImage image) {
private static void setAlphaChannelToWhite(BufferedImage image) {
if (image.getTransparency() == Transparency.TRANSLUCENT) {
// NOTE: For BITMASK images, the color model is likely IndexColorModel,

View File

@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows;
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
@SpringBootTest()
public class OcrServiceIntegrationTest extends AbstractTest {
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcr() {
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
String text = testOCR("files/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
}
@ -172,7 +172,6 @@ public class OcrServiceIntegrationTest extends AbstractTest {
}
@SneakyThrows
private void testOCRForFile(File file) {