diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 74f792d..884df82 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -1,20 +1,11 @@ package com.knecon.fforesight.service.ocr.processor.model; -import java.awt.AlphaComposite; -import java.awt.Color; -import java.awt.Graphics2D; -import java.awt.Transparency; import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.concurrent.Semaphore; import org.apache.pdfbox.util.Matrix; -import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; -import com.pdftron.sdf.Obj; import lombok.AccessLevel; import lombok.Getter; @@ -23,9 +14,7 @@ import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; -import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; -import net.sourceforge.lept4j.util.LeptUtils; import net.sourceforge.tess4j.ITessAPI; @Slf4j @@ -56,25 +45,12 @@ public class ExtractedOcrImage implements OcrImage { this.originalHeight = bufferedImage.getHeight(); this.originalWidth = bufferedImage.getWidth(); float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(bufferedImage, imageDPI, targetDpi); + this.pix = ImageProcessingUtils.process(bufferedImage, imageDPI, targetDpi); this.height = pix.h; this.width = pix.w; } - @SneakyThrows - private Pix binarize(BufferedImage 
image, float imageDpi, int targetDpi) { - - ImageProcessingUtils.setAlphaChannelToWhite(image); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script. - Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); - } - } - - @Override public AffineTransform getImageCTM() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 0592808..16a122a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - int ocrThreadCount = 4; // Number of OCR threads + int ocrThreadCount = 16; // Number of OCR threads int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 2; // Number of Ghostscript processes + int gsProcessCount = 5; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index ffa9f74..c2dab97 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -5,7 +5,6 @@ import java.awt.Color; import java.awt.Graphics2D; import java.awt.Transparency; import java.awt.image.BufferedImage; -import java.io.IOException; import lombok.SneakyThrows; import lombok.experimental.UtilityClass; @@ -16,27 +15,38 @@ import net.sourceforge.lept4j.util.LeptUtils; @UtilityClass public class ImageProcessingUtils { - public static Pix despecklePix(Pix pix) { + public Pix process(BufferedImage image, float imageDpi, int targetDpi) { - assert pix.d == 8; - Pix despeckled; - if (pix.w < 100 || pix.h < 100) { - // too small to properly despeckle, just binarize instead. - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } else { - despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... - if (despeckled == null) { - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } + setAlphaChannelToWhite(image); + return processWithLeptonica(image, imageDpi, targetDpi); + } + + + // LeptUtils and Leptonica1 do not seem to be thread safe, so we must ensure synchronization for the image processing. + // There might be a way to get this working multi-threaded, but it does not seem to be a significant runtime factor, so I didn't bother investing the time to dive deeper. 
+ synchronized private static Pix processWithLeptonica(BufferedImage image, float imageDpi, int targetDpi) { + + Pix grayScale = convertToGrayScale(image); + Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return despecklePix(scaledUp); + } + + + private static Pix despecklePix(Pix scaledUp) { + + assert scaledUp.d == 8; + Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3); + if (despeckled == null) { // sometimes despeckle fails, and I wasn't able to figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with too small images, not sure though... + despeckled = Leptonica1.pixThresholdToBinary(scaledUp, 180); } - if (pix != despeckled) { - LeptUtils.disposePix(pix); + if (scaledUp != despeckled) { + LeptUtils.disposePix(scaledUp); } return despeckled; } - public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { + private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { float targetFactor = targetDpi / imageDpi; @@ -57,7 +67,7 @@ public class ImageProcessingUtils { @SneakyThrows - public static Pix convertToGrayScale(BufferedImage image) { + private static Pix convertToGrayScale(BufferedImage image) { Pix pix = LeptUtils.convertImageToPix(image); if (pix.d == 8) { @@ -74,7 +84,7 @@ public class ImageProcessingUtils { } - public static void setAlphaChannelToWhite(BufferedImage image) { + private static void setAlphaChannelToWhite(BufferedImage image) { if (image.getTransparency() == Transparency.TRANSLUCENT) { // NOTE: For BITMASK images, the color model is likely IndexColorModel, diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 06465ad..f76b416 100644 --- 
a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/2009-1048395_50pages_tables.pdf"); + String text = testOCR("files/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf"); } @@ -172,7 +172,6 @@ public class OcrServiceIntegrationTest extends AbstractTest { } - @SneakyThrows private void testOCRForFile(File file) {