RED-7669: optimize OCR-module performance
* attempt at thread safety
This commit is contained in:
parent
c556687499
commit
57e194fcd0
@ -1,20 +1,11 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.AlphaComposite;
|
||||
import java.awt.Color;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
@ -23,9 +14,7 @@ import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
@Slf4j
|
||||
@ -56,25 +45,12 @@ public class ExtractedOcrImage implements OcrImage {
|
||||
this.originalHeight = bufferedImage.getHeight();
|
||||
this.originalWidth = bufferedImage.getWidth();
|
||||
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
|
||||
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
|
||||
this.pix = ImageProcessingUtils.process(bufferedImage, imageDPI, targetDpi);
|
||||
this.height = pix.h;
|
||||
this.width = pix.w;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Turns an extracted image into the {@link Pix} that is stored in {@code this.pix}.
 * <p>
 * The image is first handed to {@code ImageProcessingUtils.setAlphaChannelToWhite}. Then, while holding the
 * monitor of {@code OCRThread.class}, it is passed through {@code convertToGrayScale},
 * {@code scaleToTargetDpi} and {@code despecklePix} of {@code ImageProcessingUtils}, in that order.
 * Checked exceptions are rethrown undeclared because of {@code @SneakyThrows}.
 *
 * @param image     the image to process
 * @param imageDpi  resolution of {@code image}, forwarded to {@code scaleToTargetDpi}
 * @param targetDpi resolution to scale to, forwarded to {@code scaleToTargetDpi}
 * @return the result of {@code ImageProcessingUtils.despecklePix}
 */
@SneakyThrows
|
||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
|
||||
|
||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||
|
||||
synchronized (OCRThread.class) { // Must synchronize the mallocs here with the mallocs of the Tesseract detection script.
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
|
||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
|
||||
@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class OcrServiceSettings {
|
||||
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int ocrThreadCount = 16; // Number of OCR threads
|
||||
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||
int gsProcessCount = 2; // Number of Ghostscript processes
|
||||
int gsProcessCount = 5; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
|
||||
@ -5,7 +5,6 @@ import java.awt.Color;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -16,27 +15,38 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@UtilityClass
|
||||
public class ImageProcessingUtils {
|
||||
|
||||
public static Pix despecklePix(Pix pix) {
|
||||
public Pix process(BufferedImage image, float imageDpi, int targetDpi) {
|
||||
|
||||
assert pix.d == 8;
|
||||
Pix despeckled;
|
||||
if (pix.w < 100 || pix.h < 100) {
|
||||
// too small to properly despeckle, just binarize instead.
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
} else {
|
||||
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||
if (despeckled == null) {
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
}
|
||||
setAlphaChannelToWhite(image);
|
||||
return processWithLeptonica(image, imageDpi, targetDpi);
|
||||
}
|
||||
|
||||
|
||||
// LeptUtils and Leptonica1 do not seem to be thread-safe, so we must ensure synchronization for the image processing.
|
||||
// There might be a way to get this working multi-threaded, but it does not seem to be a significant runtime factor, so I didn't bother investing the time to dive deeper.
|
||||
/**
 * Runs the image through {@code convertToGrayScale}, {@code scaleToTargetDpi} and {@code despecklePix},
 * in that order, and returns the resulting {@link Pix}.
 * <p>
 * The method is {@code static synchronized}, so concurrent calls are serialized on the monitor of the
 * enclosing class.
 *
 * @param image     the image to process
 * @param imageDpi  resolution of {@code image}, forwarded to {@code scaleToTargetDpi}
 * @param targetDpi resolution to scale to, forwarded to {@code scaleToTargetDpi}
 * @return the result of {@code despecklePix}
 */
synchronized private static Pix processWithLeptonica(BufferedImage image, float imageDpi, int targetDpi) {
|
||||
|
||||
Pix grayScale = convertToGrayScale(image);
|
||||
Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return despecklePix(scaledUp);
|
||||
}
|
||||
|
||||
|
||||
private static Pix despecklePix(Pix scaledUp) {
|
||||
|
||||
assert scaledUp.d == 8;
|
||||
Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3);
|
||||
if (despeckled == null) { // sometimes despeckle fails, and I wasn't able to figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with too small images, not sure though...
|
||||
despeckled = Leptonica1.pixThresholdToBinary(scaledUp, 180);
|
||||
}
|
||||
if (pix != despeckled) {
|
||||
LeptUtils.disposePix(pix);
|
||||
if (scaledUp != despeckled) {
|
||||
LeptUtils.disposePix(scaledUp);
|
||||
}
|
||||
return despeckled;
|
||||
}
|
||||
|
||||
|
||||
public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
|
||||
private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
|
||||
|
||||
float targetFactor = targetDpi / imageDpi;
|
||||
|
||||
@ -57,7 +67,7 @@ public class ImageProcessingUtils {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Pix convertToGrayScale(BufferedImage image) {
|
||||
private static Pix convertToGrayScale(BufferedImage image) {
|
||||
|
||||
Pix pix = LeptUtils.convertImageToPix(image);
|
||||
if (pix.d == 8) {
|
||||
@ -74,7 +84,7 @@ public class ImageProcessingUtils {
|
||||
}
|
||||
|
||||
|
||||
public static void setAlphaChannelToWhite(BufferedImage image) {
|
||||
private static void setAlphaChannelToWhite(BufferedImage image) {
|
||||
|
||||
if (image.getTransparency() == Transparency.TRANSLUCENT) {
|
||||
// NOTE: For BITMASK images, the color model is likely IndexColorModel,
|
||||
|
||||
@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
|
||||
//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
|
||||
@SpringBootTest()
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcr() {
|
||||
|
||||
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
||||
String text = testOCR("files/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
|
||||
}
|
||||
|
||||
|
||||
@ -172,7 +172,6 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void testOCRForFile(File file) {
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user