From 57e194fcd080cab2caceb93e22cf686aeb3c221a Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 15:07:39 +0100 Subject: [PATCH 01/16] RED-7669: optimize OCR-module performance * attempt at thread safety --- .../processor/model/ExtractedOcrImage.java | 26 +---------- .../settings/OcrServiceSettings.java | 4 +- .../processor/utils/ImageProcessingUtils.java | 44 ++++++++++++------- .../v1/server/OcrServiceIntegrationTest.java | 5 +-- 4 files changed, 32 insertions(+), 47 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 74f792d..884df82 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -1,20 +1,11 @@ package com.knecon.fforesight.service.ocr.processor.model; -import java.awt.AlphaComposite; -import java.awt.Color; -import java.awt.Graphics2D; -import java.awt.Transparency; import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.concurrent.Semaphore; import org.apache.pdfbox.util.Matrix; -import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; -import com.pdftron.sdf.Obj; import lombok.AccessLevel; import lombok.Getter; @@ -23,9 +14,7 @@ import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; -import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; -import net.sourceforge.lept4j.util.LeptUtils; import net.sourceforge.tess4j.ITessAPI; @Slf4j @@ -56,25 +45,12 @@ public class ExtractedOcrImage implements OcrImage { this.originalHeight = bufferedImage.getHeight(); this.originalWidth = bufferedImage.getWidth(); float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(bufferedImage, imageDPI, targetDpi); + this.pix = ImageProcessingUtils.process(bufferedImage, imageDPI, targetDpi); this.height = pix.h; this.width = pix.w; } - @SneakyThrows - private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { - - ImageProcessingUtils.setAlphaChannelToWhite(image); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script. - Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); - } - } - - @Override public AffineTransform getImageCTM() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 0592808..16a122a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - int ocrThreadCount = 4; // Number of OCR threads + int ocrThreadCount = 16; // Number of OCR threads int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 2; // Number of Ghostscript processes + int gsProcessCount = 5; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index ffa9f74..c2dab97 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -5,7 +5,6 @@ import java.awt.Color; import java.awt.Graphics2D; import java.awt.Transparency; import java.awt.image.BufferedImage; -import java.io.IOException; import lombok.SneakyThrows; import lombok.experimental.UtilityClass; @@ -16,27 +15,38 @@ import net.sourceforge.lept4j.util.LeptUtils; @UtilityClass public class ImageProcessingUtils { - public static Pix despecklePix(Pix pix) { + public Pix process(BufferedImage image, float imageDpi, int targetDpi) { - assert pix.d == 8; - Pix despeckled; - if (pix.w < 100 || pix.h < 100) { - // too small to properly despeckle, just binarize instead. - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } else { - despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... - if (despeckled == null) { - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } + setAlphaChannelToWhite(image); + return processWithLeptonica(image, imageDpi, targetDpi); + } + + + // LeptUtils and Leptonica1 does not seem to be thread safe, so we must ensure synchronization for the image processing. + // There might be a way to get this working multi-threaded, but it does not seem to be a significant runtime factor, so i didn't bother investing the time to dive deeper. + synchronized private static Pix processWithLeptonica(BufferedImage image, float imageDpi, int targetDpi) { + + Pix grayScale = convertToGrayScale(image); + Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return despecklePix(scaledUp); + } + + + private static Pix despecklePix(Pix scaledUp) { + + assert scaledUp.d == 8; + Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3); + if (despeckled == null) { // sometimes despeckle fails, and I wasn't able to figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with too small images, not sure though... + despeckled = Leptonica1.pixThresholdToBinary(scaledUp, 180); } - if (pix != despeckled) { - LeptUtils.disposePix(pix); + if (scaledUp != despeckled) { + LeptUtils.disposePix(scaledUp); } return despeckled; } - public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { + private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { float targetFactor = targetDpi / imageDpi; @@ -57,7 +67,7 @@ public class ImageProcessingUtils { @SneakyThrows - public static Pix convertToGrayScale(BufferedImage image) { + private static Pix convertToGrayScale(BufferedImage image) { Pix pix = LeptUtils.convertImageToPix(image); if (pix.d == 8) { @@ -74,7 +84,7 @@ public class ImageProcessingUtils { } - public static void setAlphaChannelToWhite(BufferedImage image) { + private static void setAlphaChannelToWhite(BufferedImage image) { if (image.getTransparency() == Transparency.TRANSLUCENT) { // NOTE: For BITMASK images, the color model is likely IndexColorModel, diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 06465ad..f76b416 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/2009-1048395_50pages_tables.pdf"); + String text = testOCR("files/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf"); } @@ -172,7 +172,6 @@ public class OcrServiceIntegrationTest extends AbstractTest { } - @SneakyThrows private void testOCRForFile(File file) { From 77355b53671c63940faf26aa11064e48d019242b Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 15:29:18 +0100 Subject: [PATCH 02/16] RED-7669: optimize OCR-module performance * second attempt at thread safety --- .../processor/model/ExtractedOcrImage.java | 12 ++++++++- .../processor/utils/ImageProcessingUtils.java | 25 +++---------------- .../v1/server/OcrServiceIntegrationTest.java | 1 + 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 884df82..6455f97 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -45,12 +45,22 @@ public class ExtractedOcrImage implements OcrImage { this.originalHeight = bufferedImage.getHeight(); this.originalWidth = bufferedImage.getWidth(); float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = ImageProcessingUtils.process(bufferedImage, imageDPI, targetDpi); + this.pix = binarize(bufferedImage, imageDPI, targetDpi); this.height = pix.h; this.width = pix.w; } + @SneakyThrows + synchronized private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { + + ImageProcessingUtils.setAlphaChannelToWhite(image); + Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); + Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return ImageProcessingUtils.despecklePix(scaledUp); + } + + @Override public AffineTransform getImageCTM() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index c2dab97..27621b3 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -15,24 +15,7 @@ import net.sourceforge.lept4j.util.LeptUtils; @UtilityClass public class ImageProcessingUtils { - public Pix process(BufferedImage image, float imageDpi, int targetDpi) { - - setAlphaChannelToWhite(image); - return processWithLeptonica(image, imageDpi, targetDpi); - } - - - // LeptUtils and Leptonica1 does not seem to be thread safe, so we must ensure synchronization for the image processing. - // There might be a way to get this working multi-threaded, but it does not seem to be a significant runtime factor, so i didn't bother investing the time to dive deeper. - synchronized private static Pix processWithLeptonica(BufferedImage image, float imageDpi, int targetDpi) { - - Pix grayScale = convertToGrayScale(image); - Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return despecklePix(scaledUp); - } - - - private static Pix despecklePix(Pix scaledUp) { + public static Pix despecklePix(Pix scaledUp) { assert scaledUp.d == 8; Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3); @@ -46,7 +29,7 @@ public class ImageProcessingUtils { } - private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { + public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { float targetFactor = targetDpi / imageDpi; @@ -67,7 +50,7 @@ public class ImageProcessingUtils { @SneakyThrows - private static Pix convertToGrayScale(BufferedImage image) { + public static Pix convertToGrayScale(BufferedImage image) { Pix pix = LeptUtils.convertImageToPix(image); if (pix.d == 8) { @@ -84,7 +67,7 @@ public class ImageProcessingUtils { } - private static void setAlphaChannelToWhite(BufferedImage image) { + public static void setAlphaChannelToWhite(BufferedImage image) { if (image.getTransparency() == Transparency.TRANSLUCENT) { // NOTE: For BITMASK images, the color model is likely IndexColorModel, diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index f76b416..098c7de 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -172,6 +172,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { } + @SneakyThrows private void testOCRForFile(File file) { From 3d09f46844757310f856691ece7e533766676f20 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 15:52:55 +0100 Subject: [PATCH 03/16] RED-7669: optimize OCR-module performance * don't despeckle small images --- .../processor/model/ExtractedOcrImage.java | 85 ++++++++++++++++++- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 6455f97..472287e 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -54,10 +54,87 @@ public class ExtractedOcrImage implements OcrImage { @SneakyThrows synchronized private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { - ImageProcessingUtils.setAlphaChannelToWhite(image); - Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); + setAlphaChannelToWhite(image); + Pix grayScale = convertToGrayScale(image); + Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return despecklePix(scaledUp); + } + + + private static Pix despecklePix(Pix pix) { + + assert pix.d == 8; + Pix despeckled; + if (pix.w < 100 || pix.h < 100) { + // too small to properly despeckle, just binarize instead. + despeckled = Leptonica1.pixThresholdToBinary(pix, 180); + } else { + despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... + if (despeckled == null) { + despeckled = Leptonica1.pixThresholdToBinary(pix, 180); + } + } + if (pix != despeckled) { + LeptUtils.disposePix(pix); + } + return despeckled; + } + + + private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { + + float targetFactor = targetDpi / imageDpi; + + if (targetFactor > 3) { + Pix scaledUp; + scaledUp = Leptonica1.pixScaleGray4xLI(grayScale); + LeptUtils.disposePix(grayScale); + return scaledUp; + } else if (targetFactor > 1.9) { + Pix scaledUp; + scaledUp = Leptonica1.pixScaleGray2xLI(grayScale); + LeptUtils.disposePix(grayScale); + return scaledUp; + } else { + return grayScale; + } + } + + + private static Pix convertToGrayScale(BufferedImage image) throws IOException { + + Pix pix = LeptUtils.convertImageToPix(image); + if (pix.d == 8) { + return pix; + } else if (pix.d == 32) { + Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); + LeptUtils.disposePix(pix); + return grayScale; + } else { + Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); + LeptUtils.disposePix(pix); + return grayScale; + } + } + + + private static void setAlphaChannelToWhite(BufferedImage image) { + + if (image.getTransparency() == Transparency.TRANSLUCENT) { + // NOTE: For BITMASK images, the color model is likely IndexColorModel, + // and this model will contain the "real" color of the transparent parts + // which is likely a better fit than unconditionally setting it to white. + + // Fill background with white + Graphics2D graphics = image.createGraphics(); + try { + graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind" + graphics.setPaint(Color.WHITE); + graphics.fillRect(0, 0, image.getWidth(), image.getHeight()); + } finally { + graphics.dispose(); + } + } } From 4c225c22191a6f993b7a46d6730e5d85f3d3fe68 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 15:55:26 +0100 Subject: [PATCH 04/16] RED-7669: optimize OCR-module performance * cleanup Code --- .../processor/model/ExtractedOcrImage.java | 85 +------------------ .../processor/utils/ImageProcessingUtils.java | 20 +++-- 2 files changed, 17 insertions(+), 88 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 472287e..6455f97 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -54,87 +54,10 @@ public class ExtractedOcrImage implements OcrImage { @SneakyThrows synchronized private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { - setAlphaChannelToWhite(image); - Pix grayScale = convertToGrayScale(image); - Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return despecklePix(scaledUp); - } - - - private static Pix despecklePix(Pix pix) { - - assert pix.d == 8; - Pix despeckled; - if (pix.w < 100 || pix.h < 100) { - // too small to properly despeckle, just binarize instead. - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } else { - despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... - if (despeckled == null) { - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } - } - if (pix != despeckled) { - LeptUtils.disposePix(pix); - } - return despeckled; - } - - - private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { - - float targetFactor = targetDpi / imageDpi; - - if (targetFactor > 3) { - Pix scaledUp; - scaledUp = Leptonica1.pixScaleGray4xLI(grayScale); - LeptUtils.disposePix(grayScale); - return scaledUp; - } else if (targetFactor > 1.9) { - Pix scaledUp; - scaledUp = Leptonica1.pixScaleGray2xLI(grayScale); - LeptUtils.disposePix(grayScale); - return scaledUp; - } else { - return grayScale; - } - } - - - private static Pix convertToGrayScale(BufferedImage image) throws IOException { - - Pix pix = LeptUtils.convertImageToPix(image); - if (pix.d == 8) { - return pix; - } else if (pix.d == 32) { - Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); - LeptUtils.disposePix(pix); - return grayScale; - } else { - Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); - LeptUtils.disposePix(pix); - return grayScale; - } - } - - - private static void setAlphaChannelToWhite(BufferedImage image) { - - if (image.getTransparency() == Transparency.TRANSLUCENT) { - // NOTE: For BITMASK images, the color model is likely IndexColorModel, - // and this model will contain the "real" color of the transparent parts - // which is likely a better fit than unconditionally setting it to white. - - // Fill background with white - Graphics2D graphics = image.createGraphics(); - try { - graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind" - graphics.setPaint(Color.WHITE); - graphics.fillRect(0, 0, image.getWidth(), image.getHeight()); - } finally { - graphics.dispose(); - } - } + ImageProcessingUtils.setAlphaChannelToWhite(image); + Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); + Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return ImageProcessingUtils.despecklePix(scaledUp); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index 27621b3..1727113 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -15,15 +15,21 @@ import net.sourceforge.lept4j.util.LeptUtils; @UtilityClass public class ImageProcessingUtils { - public static Pix despecklePix(Pix scaledUp) { + public static Pix despecklePix(Pix pix) { - assert scaledUp.d == 8; - Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3); - if (despeckled == null) { // sometimes despeckle fails, and I wasn't able to figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with too small images, not sure though... - despeckled = Leptonica1.pixThresholdToBinary(scaledUp, 180); + assert pix.d == 8; + Pix despeckled; + if (pix.w < 100 || pix.h < 100) { + // too small to properly despeckle, just binarize instead. + despeckled = Leptonica1.pixThresholdToBinary(pix, 180); + } else { + despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... + if (despeckled == null) { + despeckled = Leptonica1.pixThresholdToBinary(pix, 180); + } } - if (scaledUp != despeckled) { - LeptUtils.disposePix(scaledUp); + if (pix != despeckled) { + LeptUtils.disposePix(pix); } return despeckled; } From 2632d2023da917e5a2a3686daf5ef196153ab774 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 16:00:00 +0100 Subject: [PATCH 05/16] RED-7669: optimize OCR-module performance * reset test and settings --- .../service/ocr/processor/settings/OcrServiceSettings.java | 4 ++-- .../service/ocr/v1/server/OcrServiceIntegrationTest.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 16a122a..0592808 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - int ocrThreadCount = 16; // Number of OCR threads + int ocrThreadCount = 4; // Number of OCR threads int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 5; // Number of Ghostscript processes + int gsProcessCount = 2; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 098c7de..06465ad 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/10.SYN524464 FS (A16148C) - Absorção cutânea.pdf"); + String text = testOCR("files/2009-1048395_50pages_tables.pdf"); } From 19747cbca5b7122a70257eee7f1c3bb593a732ab Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 17:17:21 +0100 Subject: [PATCH 06/16] RED-7669: optimize OCR-module performance * moar sigsegv --- .../processor/service/threads/OCRThread.java | 18 +++++------- .../utils/NativeMemoryAllocationUtils.java | 28 +++++++++++++++++++ 2 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index ad567ef..fdb9d27 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; +import static com.knecon.fforesight.service.ocr.processor.utils.NativeMemoryAllocationUtils.getDetectionScriptBuffers; import static net.sourceforge.tess4j.ITessAPI.TRUE; import java.io.File; @@ -15,6 +16,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.model.OcrResult; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; +import com.knecon.fforesight.service.ocr.processor.utils.NativeMemoryAllocationUtils; import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2; import com.sun.jna.ptr.PointerByReference; @@ -128,21 +130,15 @@ public class OCRThread extends Thread { TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix()); TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi()); - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization. - orientationDegreeResultBuffer = IntBuffer.allocate(1); - orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); - scriptureNameBuffer = new PointerByReference(); - scriptureConfidenceBuffer = FloatBuffer.allocate(1); - } int orient_deg = 0; int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, - orientationDegreeResultBuffer, - orientationDegreeConfidenceBuffer, - scriptureNameBuffer, - scriptureConfidenceBuffer); + buffers.orient_degB(), + buffers.orient_confB(), + buffers.script_nameB(), + buffers.script_confB()); if (result == TRUE) { - orient_deg = orientationDegreeResultBuffer.get(); + orient_deg = buffers.orient_degB().get(); } synchronized (OCRThread.class) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java new file mode 100644 index 0000000..8da78d7 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java @@ -0,0 +1,28 @@ +package com.knecon.fforesight.service.ocr.processor.utils; + +import java.nio.FloatBuffer; +import java.nio.IntBuffer; + +import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; +import com.sun.jna.ptr.PointerByReference; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class NativeMemoryAllocationUtils { + + synchronized public static DetectionScriptBuffers getDetectionScriptBuffers() { + + IntBuffer orient_degB = IntBuffer.allocate(1); + FloatBuffer orient_confB = FloatBuffer.allocate(1); + PointerByReference script_nameB = new PointerByReference(); + FloatBuffer script_confB = FloatBuffer.allocate(1); + DetectionScriptBuffers buffers = new DetectionScriptBuffers(orient_degB, orient_confB, script_nameB, script_confB); + return buffers; + } + + + public record DetectionScriptBuffers(IntBuffer orient_degB, FloatBuffer orient_confB, PointerByReference script_nameB, FloatBuffer script_confB) { + + } +} From 12217f2459a74b6f4d30c35bc240b3cca1767b0a Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 17:17:33 +0100 Subject: [PATCH 07/16] RED-7669: optimize OCR-module performance * moar sigsegv --- .../ocr/processor/utils/NativeMemoryAllocationUtils.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java index 8da78d7..4ec153d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java @@ -17,8 +17,7 @@ public class NativeMemoryAllocationUtils { FloatBuffer orient_confB = FloatBuffer.allocate(1); PointerByReference script_nameB = new PointerByReference(); FloatBuffer script_confB = FloatBuffer.allocate(1); - DetectionScriptBuffers buffers = new DetectionScriptBuffers(orient_degB, orient_confB, script_nameB, script_confB); - return buffers; + return new DetectionScriptBuffers(orient_degB, orient_confB, script_nameB, script_confB); } From 574f7ac25e13736bb8a88f50f82c1d53f671061f Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 15 Nov 2023 17:19:11 +0100 Subject: [PATCH 08/16] RED-7669: optimize OCR-module performance * moar sigsegv --- .../ocr/processor/service/threads/OCRThread.java | 14 +++++--------- .../utils/NativeMemoryAllocationUtils.java | 3 +-- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index fdb9d27..1fd3753 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -1,11 +1,8 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; -import static com.knecon.fforesight.service.ocr.processor.utils.NativeMemoryAllocationUtils.getDetectionScriptBuffers; import static net.sourceforge.tess4j.ITessAPI.TRUE; import java.io.File; -import java.nio.FloatBuffer; -import java.nio.IntBuffer; import java.nio.file.Path; import java.util.List; import java.util.NoSuchElementException; @@ -18,7 +15,6 @@ import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; import com.knecon.fforesight.service.ocr.processor.utils.NativeMemoryAllocationUtils; import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2; -import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; @@ -133,12 +129,12 @@ public class OCRThread extends Thread { int orient_deg = 0; int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, - buffers.orient_degB(), - buffers.orient_confB(), - buffers.script_nameB(), - buffers.script_confB()); + buffers.orientationDegreeResultBuffer(), + buffers.orientationDegreeConfidenceBuffer(), + buffers.scriptureNameBuffer(), + buffers.scriptureConfidenceBuffer()); if (result == TRUE) { - orient_deg = buffers.orient_degB().get(); + orient_deg = buffers.orientationDegreeResultBuffer().get(); } synchronized (OCRThread.class) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java index 4ec153d..a75d004 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.ocr.processor.utils; import java.nio.FloatBuffer; import java.nio.IntBuffer; -import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.sun.jna.ptr.PointerByReference; import lombok.experimental.UtilityClass; @@ -21,7 +20,7 @@ public class NativeMemoryAllocationUtils { } - public record DetectionScriptBuffers(IntBuffer orient_degB, FloatBuffer orient_confB, PointerByReference script_nameB, FloatBuffer script_confB) { + public record DetectionScriptBuffers(IntBuffer orientationDegreeResultBuffer, FloatBuffer orientationDegreeConfidenceBuffer, PointerByReference scriptureNameBuffer, FloatBuffer scriptureConfidenceBuffer) { } } From 6f996649060e15c8d90d324d3f9f2e7483a0a2ed Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 17 Nov 2023 14:04:43 +0100 Subject: [PATCH 09/16] RED-7669: optimize OCR-module performance * try and synchronize all malloc calls --- .../processor/model/ExtractedOcrImage.java | 16 +++++++++--- .../processor/service/threads/OCRThread.java | 20 +++++++++----- .../utils/NativeMemoryAllocationUtils.java | 26 ------------------- 3 files changed, 26 insertions(+), 36 deletions(-) delete mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 6455f97..819ccbe 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -2,10 +2,15 @@ package com.knecon.fforesight.service.ocr.processor.model; import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; +import java.io.IOException; +import java.nio.IntBuffer; +import java.util.concurrent.Semaphore; import org.apache.pdfbox.util.Matrix; +import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; +import com.pdftron.sdf.Obj; import lombok.AccessLevel; import lombok.Getter; @@ -52,12 +57,15 @@ public class ExtractedOcrImage implements OcrImage { @SneakyThrows - synchronized private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { + private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { ImageProcessingUtils.setAlphaChannelToWhite(image); - Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); + + synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script. + Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); + Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return ImageProcessingUtils.despecklePix(scaledUp); + } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index 1fd3753..ad567ef 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -3,6 +3,8 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import static net.sourceforge.tess4j.ITessAPI.TRUE; import java.io.File; +import java.nio.FloatBuffer; +import java.nio.IntBuffer; import java.nio.file.Path; import java.util.List; import java.util.NoSuchElementException; @@ -13,8 +15,8 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.model.OcrResult; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; -import com.knecon.fforesight.service.ocr.processor.utils.NativeMemoryAllocationUtils; import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2; +import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; @@ -126,15 +128,21 @@ public class OCRThread extends Thread { TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix()); TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi()); + synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization. + orientationDegreeResultBuffer = IntBuffer.allocate(1); + orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); + scriptureNameBuffer = new PointerByReference(); + scriptureConfidenceBuffer = FloatBuffer.allocate(1); + } int orient_deg = 0; int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, - buffers.orientationDegreeResultBuffer(), - buffers.orientationDegreeConfidenceBuffer(), - buffers.scriptureNameBuffer(), - buffers.scriptureConfidenceBuffer()); + orientationDegreeResultBuffer, + orientationDegreeConfidenceBuffer, + scriptureNameBuffer, + scriptureConfidenceBuffer); if (result == TRUE) { - orient_deg = buffers.orientationDegreeResultBuffer().get(); + orient_deg = orientationDegreeResultBuffer.get(); } synchronized (OCRThread.class) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java deleted file mode 100644 index a75d004..0000000 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/NativeMemoryAllocationUtils.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.knecon.fforesight.service.ocr.processor.utils; - -import java.nio.FloatBuffer; -import java.nio.IntBuffer; - -import com.sun.jna.ptr.PointerByReference; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class NativeMemoryAllocationUtils { - - synchronized public static DetectionScriptBuffers getDetectionScriptBuffers() { - - IntBuffer orient_degB = IntBuffer.allocate(1); - FloatBuffer orient_confB = FloatBuffer.allocate(1); - PointerByReference script_nameB = new PointerByReference(); - FloatBuffer script_confB = FloatBuffer.allocate(1); - return new DetectionScriptBuffers(orient_degB, orient_confB, script_nameB, script_confB); - } - - - public record DetectionScriptBuffers(IntBuffer orientationDegreeResultBuffer, FloatBuffer orientationDegreeConfidenceBuffer, PointerByReference scriptureNameBuffer, FloatBuffer scriptureConfidenceBuffer) { - - } -} From bb5b4a2fd8ec96c6986518d9dcf58ae1f53bb024 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Tue, 21 Nov 2023 14:37:18 +0100 Subject: [PATCH 10/16] RED-7669: optimize OCR-module performance * binarize images after reading --- .../ocr/processor/model/ExtractedImage.java | 27 +++++++ .../processor/model/ExtractedOcrImage.java | 15 ++++ .../service/ocr/processor/model/OcrImage.java | 16 +++++ .../ocr/processor/model/QuadPoint.java | 6 ++ .../processor/service/ImageStreamEngine.java | 26 +++---- .../ocr/processor/service/OCRService.java | 2 +- .../threads/ImageExtractionThread.java | 39 ++++++----- .../processor/service/threads/OCRThread.java | 70 +++++++++++-------- .../settings/OcrServiceSettings.java | 6 +- .../ocr/processor/utils/Tesseract2.java | 61 +++++++++------- .../v1/server/OcrServiceIntegrationTest.java | 8 +-- 11 files changed, 182 insertions(+), 94 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java new file mode 100644 index 0000000..57ce77f --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java @@ -0,0 +1,27 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import java.awt.image.BufferedImage; + +import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; +import org.apache.pdfbox.util.Matrix; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Getter +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ExtractedImage { + + int pageNumber; + QuadPoint position; + int height; + int width; + BufferedImage image; + Matrix ctm; + int numberOnPage; + PDColorSpace colorSpace; + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 819ccbe..cedcba9 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -11,6 +11,8 @@ import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; import com.pdftron.sdf.Obj; +import com.sun.jna.StringArray; +import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; import lombok.Getter; @@ -56,6 +58,19 @@ public class ExtractedOcrImage implements OcrImage { } + public ExtractedOcrImage(ExtractedImage image, int targetDpi) { + this.pageNumber = image.getPageNumber(); + this.numberOnPage = image.getNumberOnPage(); + this.ctm = image.getCtm(); + this.originalHeight = image.getImage().getHeight(); + this.originalWidth = image.getImage().getWidth(); + float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72)); + this.pix = binarize(image.getImage(), imageDPI, targetDpi); + this.height = pix.h; + this.width = pix.w; + } + + @SneakyThrows private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java index 3afb0a8..37a1806 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java @@ -2,10 +2,12 @@ package com.knecon.fforesight.service.ocr.processor.model; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; +import java.awt.image.BufferedImage; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator; +import lombok.SneakyThrows; import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.util.LeptUtils; @@ -62,6 +64,20 @@ public interface OcrImage { } + @SneakyThrows + default BufferedImage getBufferedImage() { + + return LeptUtils.convertPixToImage(getPix()); + } + + + @SneakyThrows + default BufferedImage getRotatedBufferedImage() { + + return LeptUtils.convertPixToImage(getRotatedPix()); + } + + /** * Retrieves the rotation degree of the OCR image. * diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java index 53fc7b6..c40aa1d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java @@ -97,4 +97,10 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) { d().getY()); } + + public double size() { + + return a().distance(b()) * a().distance(d()); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index 9b6d2d5..a022ac4 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -24,6 +24,7 @@ import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.util.Matrix; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; @@ -34,7 +35,7 @@ import lombok.SneakyThrows; public class ImageStreamEngine extends PDFStreamEngine { private ExtractedOcrImage currentImageOnPage; - private List imagesOnCurrentPage; + private List imagesOnCurrentPage; private OcrServiceSettings settings; private int pageNum; @@ -69,21 +70,14 @@ public class ImageStreamEngine extends PDFStreamEngine { } Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix(); - if (imageXObject.getColorSpace() instanceof PDDeviceRGB) { - BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } else if (imageXObject.getColorSpace() instanceof PDDeviceGray) { - BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } else { - BufferedImage pdfImage = imageXObject.getImage(); - BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); - Graphics g = image.getGraphics(); - g.drawImage(pdfImage, 0, 0, null); - g.dispose(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } - this.imagesOnCurrentPage.add(this.currentImageOnPage); + this.imagesOnCurrentPage.add(new ExtractedImage(pageNum, + imageXObject.getHeight(), + imageXObject.getWidth(), + imageXObject.getImage(), + imageCTM, + imagesOnCurrentPage.size(), + imageXObject.getColorSpace())); + //imagesOnPages.add(this.currentImageOnPage); } else if (xobject instanceof PDFormXObject) { PDFormXObject form = (PDFormXObject) xobject; diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index 3ed6193..c488982 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -107,7 +107,7 @@ public class OCRService { int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages()); stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads); - BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads); + BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads); OcrImageFactory ocrImageFactory = new OcrImageFactory(document, documentFile, diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index 3b29836..9551bbb 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -5,10 +5,10 @@ import java.util.List; import java.util.concurrent.BlockingQueue; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine; @@ -26,6 +26,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageExtractionThread extends Thread { + static double FULL_PAGE_IMAGE_THRESHOLD = 0.98; static double IMAGE_ALIGNMENT_THRESHOLD = 1; int id; @@ -40,6 +41,7 @@ public class ImageExtractionThread extends Thread { BlockingQueue imageOutputQueue; List stitchedPageNumbers; + @SneakyThrows @Override public void run() { @@ -48,20 +50,21 @@ public class ImageExtractionThread extends Thread { for (Integer pageIndex : pageIndices) { try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low. timestamp = System.currentTimeMillis(); - List extractedOcrImages = getExtractedOcrImages(pageIndex, document); + List extractedImages = getExtractedOcrImages(pageIndex, document); stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp); - if (extractedOcrImages.isEmpty()) { + if (extractedImages.isEmpty()) { logger.logPageSkipped(pageIndex); } - if (checkForStitchedImages(extractedOcrImages)) { + if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) { stitchedPageNumbers.add(pageIndex); logger.addImagesToProcess(pageIndex, 0); continue; } - for (ExtractedOcrImage image : extractedOcrImages) { - imageOutputQueue.put(image); + for (ExtractedImage image : extractedImages) { + ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi()); + imageOutputQueue.put(ocrImage); logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); } } @@ -69,7 +72,7 @@ public class ImageExtractionThread extends Thread { } - private List getExtractedOcrImages(Integer pageIndex, PDDocument document) { + private List getExtractedOcrImages(Integer pageIndex, PDDocument document) { PDPage page = document.getPage(pageIndex - 1); ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings); @@ -79,22 +82,22 @@ public class ImageExtractionThread extends Thread { @SneakyThrows - private boolean checkForStitchedImages(List imagesOnCurrentPage) { + private boolean checkForStitchedImages(List imagesOnCurrentPage, PDPage page) { - if (imagesOnCurrentPage.size() <= 1) { + if (imagesOnCurrentPage.isEmpty()) { return false; } - //checking for intersections or direct alignment of images - ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()]; - int index = 0; - for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) { - imageOnPagesArray[index] = imageOnPage; - index++; + for (ExtractedImage imageOnPage : imagesOnCurrentPage) { + if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) { + return true; + } } - for (int j = 0; j < imageOnPagesArray.length; j++) { - for (int i = j + 1; i < imageOnPagesArray.length; i++) { - if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) { + + //checking for intersections or direct alignment of images + for (int j = 0; j < imagesOnCurrentPage.size(); j++) { + for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) { + if (imagesOnCurrentPage.get(j).getImageCoordinatesInInitialUserSpace().aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) { // TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely return true; } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index ad567ef..d0b11b5 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -1,6 +1,10 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import static net.sourceforge.tess4j.ITessAPI.TRUE; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable; import java.io.File; import java.nio.FloatBuffer; @@ -16,6 +20,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2; +import com.sun.jna.StringArray; import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; @@ -42,8 +47,8 @@ public class OCRThread extends Thread { OcrProgressLogger logger; Statistics stats; OcrServiceSettings settings; - Tesseract2 instance; ITessAPI.TessBaseAPI detectionScriptHandle; + ITessAPI.TessBaseAPI tesseractHandle; public OCRThread(int id, @@ -61,8 +66,8 @@ public class OCRThread extends Thread { this.logger = logger; this.stats = stats; this.settings = settings; - this.instance = createInstance(settings); this.detectionScriptHandle = initDetectionScriptHandle(); + this.tesseractHandle = initTesseractHandle(settings); } @@ -88,9 +93,9 @@ public class OCRThread extends Thread { } } catch (NoSuchElementException e) { log.debug("Processed all Images, finishing."); + TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); + TessAPI1.TessBaseAPIDelete(this.tesseractHandle); } - - TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); } @@ -107,10 +112,8 @@ public class OCRThread extends Thread { Pix rotatedPix = image.getRotatedPix(); executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName); - synchronized (OCRThread.class) { - image.destroyPix(); - LeptUtils.disposePix(rotatedPix); - } + image.destroyPix(); + LeptUtils.disposePix(rotatedPix); results.add(OcrResult.create(image, tesseractOutputFileName)); logger.logImageFinished(image, psm); @@ -145,21 +148,37 @@ public class OCRThread extends Thread { orient_deg = orientationDegreeResultBuffer.get(); } - synchronized (OCRThread.class) { - TessAPI1.TessBaseAPIClear(detectionScriptHandle); - } + TessAPI1.TessBaseAPIClear(detectionScriptHandle); return orient_deg; } - synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { + private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { - ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); - String datapath = System.getenv("TESSDATA_PREFIX"); - TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); + synchronized (OCRThread.class) { - return handle; + ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); + String datapath = System.getenv("TESSDATA_PREFIX"); +// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); + TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); + + return handle; + } + } + + + synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) { + + synchronized (OCRThread.class) { + + ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); + String datapath = System.getenv("TESSDATA_PREFIX"); +// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); + TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0); + + return handle; + } } @@ -173,19 +192,14 @@ public class OCRThread extends Thread { Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3); } - instance.setVariable("user_defined_dpi", String.valueOf(dpi)); - instance.setPageSegMode(psm); - instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK); - } + TessBaseAPISetPageSegMode(tesseractHandle, psm); - - private static Tesseract2 createInstance(OcrServiceSettings settings) { - - Tesseract2 instance = new Tesseract2(); - instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out - instance.setOcrEngineMode(1); // set to LSTM based Engine - instance.setLanguage(settings.getLanguages()); - return instance; + Tesseract2.createDocumentsWithResults(pix, + null, + tesseractOutputFileName, + List.of(ITesseract.RenderedFormat.HOCR), + ITessAPI.TessPageIteratorLevel.RIL_BLOCK, + tesseractHandle); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 0592808..e165845 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - int ocrThreadCount = 4; // Number of OCR threads - int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 2; // Number of Ghostscript processes + int ocrThreadCount = 16; // Number of OCR threads + int imageExtractThreadCount = 5; // Number of image extraction threads + int gsProcessCount = 5; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java index d85dc46..eda0685 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java @@ -1,45 +1,54 @@ package com.knecon.fforesight.service.ocr.processor.utils; +import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE; + import java.awt.Rectangle; import java.nio.IntBuffer; import java.util.ArrayList; import java.util.List; +import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.sun.jna.Pointer; +import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; import net.sourceforge.lept4j.Pix; +import net.sourceforge.tess4j.ITessAPI; +import net.sourceforge.tess4j.ITesseract; import net.sourceforge.tess4j.OCRResult; import net.sourceforge.tess4j.TessAPI1; -import net.sourceforge.tess4j.Tesseract1; -import net.sourceforge.tess4j.TesseractException; import net.sourceforge.tess4j.Word; @Slf4j /** * Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted. */ -public class Tesseract2 extends Tesseract1 { +@UtilityClass +public class Tesseract2 extends TessAPI1 { + private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) { - private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) { - - String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE); + String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE); TessResultRendererBeginDocument(renderer, title); - int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer); + int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer); TessResultRendererEndDocument(renderer); // if (result == ITessAPI.FALSE) { // throw new TesseractException("Error during processing page."); // } - return TessBaseAPIMeanTextConf(getHandle()); + return TessBaseAPIMeanTextConf(handle); } - public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List formats, int pageIteratorLevel) throws TesseractException { + public OCRResult createDocumentsWithResults(Pix bi, + String filename, + String outputbase, + List formats, + int pageIteratorLevel, + ITessAPI.TessBaseAPI handle) { - List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel); + List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle); if (!results.isEmpty()) { return results.get(0); } else { @@ -48,24 +57,26 @@ public class Tesseract2 extends Tesseract1 { } - public List createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List formats, int pageIteratorLevel) { + public List createDocumentsWithResults(Pix[] pixs, + String[] filenames, + String[] outputbases, + List formats, + int pageIteratorLevel, + ITessAPI.TessBaseAPI handle) { if (pixs.length != filenames.length || pixs.length != outputbases.length) { throw new RuntimeException("The three arrays must match in length."); } - init(); - setVariables(); - List results = new ArrayList(); try { for (int i = 0; i < pixs.length; i++) { try { - TessResultRenderer renderer = createRenderers(outputbases[i], formats); - int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer); + ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats); + int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer); TessDeleteResultRenderer(renderer); - List words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList(); + List words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList(); results.add(new OCRResult(meanTextConfidence, words)); } catch (Exception e) { // skip the problematic image file @@ -73,20 +84,22 @@ public class Tesseract2 extends Tesseract1 { } } } finally { - dispose(); + synchronized (OCRThread.class) { + TessAPI1.TessBaseAPIClear(handle); + } } return results; } - private List getRecognizedWords(int pageIteratorLevel) { + private List getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) { List words = new ArrayList<>(); try { - TessResultIterator ri = TessBaseAPIGetIterator(getHandle()); - TessPageIterator pi = TessResultIteratorGetPageIterator(ri); + ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle); + ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri); TessPageIteratorBegin(pi); do { @@ -119,11 +132,11 @@ public class Tesseract2 extends Tesseract1 { } - private TessResultRenderer createRenderers(String outputbase, List formats) { + private ITessAPI.TessResultRenderer createRenderers(String outputbase, List formats) { - TessResultRenderer renderer = null; + ITessAPI.TessResultRenderer renderer = null; - for (RenderedFormat format : formats) { + for (ITesseract.RenderedFormat format : formats) { switch (format) { case HOCR: diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 06465ad..a4707f8 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/2009-1048395_50pages_tables.pdf"); + String text = testOCR("files/VV-352892.pdf"); } @@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/"; List foundFiles = Files.walk(Path.of(dir)) - .sorted(Comparator.comparingLong(this::getFileSize)) +// .sorted(Comparator.comparingLong(this::getFileSize)) .map(Path::toFile) .filter(file -> file.getName().endsWith(".pdf")) .peek(System.out::println) @@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcrForSpecificFile() { - testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf")); + testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf")); From efd3a1d952fe5ab0d8cf9559dbd46d2854852abb Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 22 Nov 2023 16:40:13 +0100 Subject: [PATCH 11/16] RED-7669: optimize OCR-module performance * move all non thread safe stuff to separate thread in the middle --- .../ocr/processor/model/ExtractedImage.java | 21 +++ .../processor/model/ExtractedOcrImage.java | 70 ++------ .../service/ocr/processor/model/OcrImage.java | 35 ---- .../processor/service/GhostScriptService.java | 2 +- .../processor/service/ImageStreamEngine.java | 5 +- .../ocr/processor/service/OCRService.java | 4 +- .../processor/service/OcrImageFactory.java | 24 ++- .../ocr/processor/service/Statistics.java | 12 +- .../threads/ImageExtractionThread.java | 19 +- .../threads/ImageProcessingThread.java | 166 ++++++++++++++++++ .../processor/service/threads/OCRThread.java | 98 ++--------- .../settings/OcrServiceSettings.java | 6 +- .../processor/utils/ImageProcessingUtils.java | 29 ++- .../ocr/processor/utils/Tesseract2.java | 61 +++---- .../v1/server/OcrServiceIntegrationTest.java | 6 +- 15 files changed, 316 insertions(+), 242 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java index 57ce77f..96e96c9 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java @@ -1,14 +1,20 @@ package com.knecon.fforesight.service.ocr.processor.model; +import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; import org.apache.pdfbox.util.Matrix; +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; + import lombok.AccessLevel; import lombok.Getter; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; @Getter @RequiredArgsConstructor @@ -24,4 +30,19 @@ public class ExtractedImage { int numberOnPage; PDColorSpace colorSpace; + + @SneakyThrows + public Pix asPix() { + + BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this); + ImageProcessingUtils.setAlphaChannelToWhite(image); + return LeptUtils.convertImageToPix(image); + } + + + public QuadPoint getImageCoordinatesInInitialUserSpace() { + + return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform()); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index cedcba9..c6abfad 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -1,18 +1,15 @@ package com.knecon.fforesight.service.ocr.processor.model; +import java.awt.Graphics; import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.concurrent.Semaphore; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; -import com.pdftron.sdf.Obj; -import com.sun.jna.StringArray; -import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; import lombok.Getter; @@ -27,63 +24,20 @@ import net.sourceforge.tess4j.ITessAPI; @Slf4j @Getter @RequiredArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ExtractedOcrImage implements OcrImage { - final int pageNumber; - final Pix pix; - final int originalHeight; - final int originalWidth; - final int height; - final int width; - final Matrix ctm; - final int numberOnPage; - - @Setter + int pageNumber; + int numberOnPage; + int originalHeight; + int originalWidth; + Matrix ctm; + Pix pix; + int height; + int width; int rotationDegrees; - @SneakyThrows - public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) { - - this.pageNumber = pageNumber; - this.numberOnPage = numberOnPage; - this.ctm = ctm; - this.originalHeight = bufferedImage.getHeight(); - this.originalWidth = bufferedImage.getWidth(); - float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(bufferedImage, imageDPI, targetDpi); - this.height = pix.h; - this.width = pix.w; - } - - - public ExtractedOcrImage(ExtractedImage image, int targetDpi) { - this.pageNumber = image.getPageNumber(); - this.numberOnPage = image.getNumberOnPage(); - this.ctm = image.getCtm(); - this.originalHeight = image.getImage().getHeight(); - this.originalWidth = image.getImage().getWidth(); - float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(image.getImage(), imageDPI, targetDpi); - this.height = pix.h; - this.width = pix.w; - } - - - @SneakyThrows - private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { - - ImageProcessingUtils.setAlphaChannelToWhite(image); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script. - Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); - } - } - - @Override public AffineTransform getImageCTM() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java index 37a1806..86cfd6a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java @@ -71,13 +71,6 @@ public interface OcrImage { } - @SneakyThrows - default BufferedImage getRotatedBufferedImage() { - - return LeptUtils.convertPixToImage(getRotatedPix()); - } - - /** * Retrieves the rotation degree of the OCR image. * @@ -94,16 +87,6 @@ public interface OcrImage { int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance - /** - * Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image. - * Currently only quadrant rotations are supported. - * Rotated partial images work, due to the CTM present in the pdf working with any rotation. - * - * @param rotationDegree The rotation degree of the OCR image. - */ - void setRotationDegrees(int rotationDegree); - - /** * Retrieves the buffered image associated with the OCR image. * @@ -112,24 +95,6 @@ public interface OcrImage { Pix getPix(); - /** - * Retrieves the rotated image of the OCR image. - * - * @return The rotated BufferedImage object of the OCR image. - */ - default Pix getRotatedPix() { - - synchronized (OCRThread.class) { - return switch (360 - getRotationDegrees()) { - case 90 -> Leptonica1.pixRotateOrth(getPix(), 1); - case 180 -> Leptonica1.pixRotateOrth(getPix(), 2); - case 270 -> Leptonica1.pixRotateOrth(getPix(), 3); - default -> getPix(); - }; - } - } - - default int getDpi() { return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth()); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java index 18d3568..1a4b54e 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java @@ -49,7 +49,7 @@ public class GhostScriptService { List> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers, numOfProcesses, - 2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads + settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) { long timestamp = System.currentTimeMillis(); List renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>()); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index a022ac4..662ae5b 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.awt.Graphics; +import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import java.io.IOException; import java.util.LinkedList; @@ -26,6 +27,7 @@ import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import lombok.Getter; @@ -34,7 +36,6 @@ import lombok.SneakyThrows; @Getter public class ImageStreamEngine extends PDFStreamEngine { - private ExtractedOcrImage currentImageOnPage; private List imagesOnCurrentPage; private OcrServiceSettings settings; private int pageNum; @@ -71,6 +72,7 @@ public class ImageStreamEngine extends PDFStreamEngine { Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix(); this.imagesOnCurrentPage.add(new ExtractedImage(pageNum, + QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())), imageXObject.getHeight(), imageXObject.getWidth(), imageXObject.getImage(), @@ -78,7 +80,6 @@ public class ImageStreamEngine extends PDFStreamEngine { imagesOnCurrentPage.size(), imageXObject.getColorSpace())); - //imagesOnPages.add(this.currentImageOnPage); } else if (xobject instanceof PDFormXObject) { PDFormXObject form = (PDFormXObject) xobject; showForm(form); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index c488982..54b8306 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -107,7 +107,7 @@ public class OCRService { int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages()); stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads); - BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads); + BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads)); OcrImageFactory ocrImageFactory = new OcrImageFactory(document, documentFile, @@ -128,7 +128,7 @@ public class OCRService { .toList(); log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size()); ocrImageFactory.join(); - log.info("Extracted all images, interrupting ocr threads"); + log.info("Processed all images, interrupting ocr threads"); ocrThreads.forEach(Thread::interrupt); for (OCRThread ocrThread : ocrThreads) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java index 3ff4683..e762d6e 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java @@ -6,13 +6,16 @@ import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread; +import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils; @@ -29,6 +32,8 @@ public class OcrImageFactory { File documentFile; Path tmpImageDir; GhostScriptService ghostScriptService; + BlockingQueue imageProcessingQueue; + ImageProcessingThread imageProcessingThread; BlockingQueue imageOutputQueue; List imageExtractionThreads; List stitchedPageNumbers; @@ -50,6 +55,7 @@ public class OcrImageFactory { this.tmpImageDir = tmpImageDir; this.ghostScriptService = ghostScriptService; this.imageOutputQueue = imageOutputQueue; + this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity()); this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>()); this.stats = stats; @@ -57,8 +63,10 @@ public class OcrImageFactory { List> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads); for (int i = 0; i < balancedPageNumbers.size(); i++) { - imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers)); + imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers)); } + this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings); + log.info("Started {} image extraction threads, with ({}) pages each", imageExtractionThreads.size(), imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", "))); @@ -70,6 +78,8 @@ public class OcrImageFactory { for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) { imageExtractionThread.start(); } + imageProcessingThread.start(); + } @@ -79,11 +89,15 @@ public class OcrImageFactory { for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) { imageExtractionThread.join(); } - if (stitchedPageNumbers.isEmpty()) { - return; - } - ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats); + if (!stitchedPageNumbers.isEmpty()) { + ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats); + } + imageProcessingThread.interrupt(); + log.info("All images extracted, interrupting processing thread."); + + imageProcessingThread.join(); + } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java index 73fe284..97d44e3 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java @@ -15,6 +15,7 @@ public class Statistics { List tesseractDuration; AtomicLong pdf2ImgDuration; AtomicLong writingTextDuration; + AtomicLong imageProcessingDuration; public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) { @@ -23,6 +24,7 @@ public class Statistics { this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L))); this.pdf2ImgDuration = new AtomicLong(0); this.writingTextDuration = new AtomicLong(0); + this.imageProcessingDuration = new AtomicLong(0); } @@ -32,6 +34,12 @@ public class Statistics { } + public void increaseImageProcessing(long duration) { + + imageProcessingDuration.addAndGet(duration); + } + + public void increaseTesseractDuration(int threadId, long duration) { tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration); @@ -53,13 +61,15 @@ public class Statistics { @Override public String toString() { - return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s", + return String.format( + "imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s", ((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), + (float) imageProcessingDuration.get() / 1000, (float) pdf2ImgDuration.get() / 1000, (float) writingTextDuration.get() / 1000); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index 9551bbb..89161f6 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -9,8 +9,6 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; -import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; -import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; @@ -26,7 +24,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageExtractionThread extends Thread { - static double FULL_PAGE_IMAGE_THRESHOLD = 0.98; + static double FULL_PAGE_IMAGE_THRESHOLD = 0.99; static double IMAGE_ALIGNMENT_THRESHOLD = 1; int id; @@ -38,7 +36,7 @@ public class ImageExtractionThread extends Thread { OcrServiceSettings settings; // output is written to these lists - BlockingQueue imageOutputQueue; + BlockingQueue imageProcessingQueue; List stitchedPageNumbers; @@ -50,21 +48,20 @@ public class ImageExtractionThread extends Thread { for (Integer pageIndex : pageIndices) { try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low. timestamp = System.currentTimeMillis(); - List extractedImages = getExtractedOcrImages(pageIndex, document); + List extractedImages = getExtractedImages(pageIndex, document); stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp); if (extractedImages.isEmpty()) { logger.logPageSkipped(pageIndex); } - if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) { + if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) { stitchedPageNumbers.add(pageIndex); logger.addImagesToProcess(pageIndex, 0); continue; } for (ExtractedImage image : extractedImages) { - ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi()); - imageOutputQueue.put(ocrImage); + imageProcessingQueue.put(image); logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); } } @@ -72,7 +69,7 @@ public class ImageExtractionThread extends Thread { } - private List getExtractedOcrImages(Integer pageIndex, PDDocument document) { + private List getExtractedImages(Integer pageIndex, PDDocument document) { PDPage page = document.getPage(pageIndex - 1); ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings); @@ -82,14 +79,14 @@ public class ImageExtractionThread extends Thread { @SneakyThrows - private boolean checkForStitchedImages(List imagesOnCurrentPage, PDPage page) { + private boolean checkForFullPageOrStitchedImages(List imagesOnCurrentPage, PDPage page) { if (imagesOnCurrentPage.isEmpty()) { return false; } for (ExtractedImage imageOnPage : imagesOnCurrentPage) { - if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) { + if (imageOnPage.getWidth() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.getHeight() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) { return true; } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java new file mode 100644 index 0000000..e42185f --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -0,0 +1,166 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import static net.sourceforge.tess4j.ITessAPI.TRUE; + +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; + +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.service.Statistics; +import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; +import com.sun.jna.ptr.PointerByReference; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; +import net.sourceforge.tess4j.ITessAPI; +import net.sourceforge.tess4j.TessAPI1; + +/* + * This thread does all the image processing. There should only be one, since Leptonica is not thread safe. + */ +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ImageProcessingThread extends Thread { + + BlockingQueue imageInputQueue; + BlockingQueue imageOutputQueue; + ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); + Statistics stats; + OcrServiceSettings settings; + + + @SneakyThrows + @Override + public void run() { + + // Interrupting signals that the image extraction has finished + while (true) { + try { + final ExtractedImage image = imageInputQueue.take(); + OcrImage extractedOcrImage = this.process(image); + try { + imageOutputQueue.put(extractedOcrImage); + } catch (InterruptedException e) { + imageOutputQueue.put(extractedOcrImage); + break; + } + + } catch (InterruptedException e) { + break; + } + } + log.info("Leaving initial uninterrupted loop!"); + // empty the queue + List remainingImages = new ArrayList<>(imageInputQueue.size()); + imageInputQueue.drainTo(remainingImages); + remainingImages.forEach(image -> { + OcrImage ocrImage = this.process(image); + try { + imageOutputQueue.put(ocrImage); + } catch (InterruptedException e) { + log.error(e.getMessage()); + } + }); + + TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); + } + + + private OcrImage process(ExtractedImage extractedImage) { + + long timestamp = System.currentTimeMillis(); + float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72)); + + Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi()); + + int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); + Pix rotatedPix = switch (360 - orientDegree) { + case 90 -> Leptonica1.pixRotateOrth(pix, 1); + case 180 -> Leptonica1.pixRotateOrth(pix, 2); + case 270 -> Leptonica1.pixRotateOrth(pix, 3); + default -> pix; + }; + OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(), + extractedImage.getNumberOnPage(), + extractedImage.getHeight(), + extractedImage.getWidth(), + extractedImage.getCtm(), + rotatedPix, + pix.h, + pix.w, + orientDegree); + + if (pix != rotatedPix) { + LeptUtils.disposePix(pix); + } + + stats.increaseImageProcessing(System.currentTimeMillis() - timestamp); + + return extractedOcrImage; + } + + + static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) { + + TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix); + TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi); + + IntBuffer orientationDegreeResultBuffer; + FloatBuffer orientationDegreeConfidenceBuffer; + PointerByReference scriptureNameBuffer; + FloatBuffer scriptureConfidenceBuffer; + + orientationDegreeResultBuffer = IntBuffer.allocate(1); + orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); + scriptureNameBuffer = new PointerByReference(); + scriptureConfidenceBuffer = FloatBuffer.allocate(1); + + int orientationDegree = 0; + int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, + orientationDegreeResultBuffer, + orientationDegreeConfidenceBuffer, + scriptureNameBuffer, + scriptureConfidenceBuffer); + if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) { + orientationDegree = orientationDegreeResultBuffer.get(); + } + + TessAPI1.TessBaseAPIClear(detectionScriptHandle); + + return orientationDegree; + } + + + @SneakyThrows + private Pix binarize(Pix pix, float imageDpi, int targetDpi) { + + Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix); + Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return ImageProcessingUtils.despecklePix(scaledUp); + + } + + + private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { + + ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); + String datapath = System.getenv("TESSDATA_PREFIX"); + TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); + + return handle; + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index d0b11b5..9c1a0a7 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -47,8 +47,7 @@ public class OCRThread extends Thread { OcrProgressLogger logger; Statistics stats; OcrServiceSettings settings; - ITessAPI.TessBaseAPI detectionScriptHandle; - ITessAPI.TessBaseAPI tesseractHandle; + Tesseract2 instance; public OCRThread(int id, @@ -66,8 +65,7 @@ public class OCRThread extends Thread { this.logger = logger; this.stats = stats; this.settings = settings; - this.detectionScriptHandle = initDetectionScriptHandle(); - this.tesseractHandle = initTesseractHandle(settings); + this.instance = createInstance(settings); } @@ -92,10 +90,9 @@ public class OCRThread extends Thread { this.process(image); } } catch (NoSuchElementException e) { - log.debug("Processed all Images, finishing."); - TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); - TessAPI1.TessBaseAPIDelete(this.tesseractHandle); + log.debug("Executed tesseract on all Images, finishing."); } + } @@ -107,13 +104,8 @@ public class OCRThread extends Thread { int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride(); - int orientDegree = detectOrientation(image); - image.setRotationDegrees(orientDegree); - Pix rotatedPix = image.getRotatedPix(); - executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName); - + executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName); image.destroyPix(); - LeptUtils.disposePix(rotatedPix); results.add(OcrResult.create(image, tesseractOutputFileName)); logger.logImageFinished(image, psm); @@ -121,67 +113,6 @@ public class OCRThread extends Thread { } - public int detectOrientation(OcrImage image) { - - IntBuffer orientationDegreeResultBuffer; - FloatBuffer orientationDegreeConfidenceBuffer; - PointerByReference scriptureNameBuffer; - FloatBuffer scriptureConfidenceBuffer; - - TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix()); - TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi()); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization. - orientationDegreeResultBuffer = IntBuffer.allocate(1); - orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); - scriptureNameBuffer = new PointerByReference(); - scriptureConfidenceBuffer = FloatBuffer.allocate(1); - } - - int orient_deg = 0; - int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, - orientationDegreeResultBuffer, - orientationDegreeConfidenceBuffer, - scriptureNameBuffer, - scriptureConfidenceBuffer); - if (result == TRUE) { - orient_deg = orientationDegreeResultBuffer.get(); - } - - TessAPI1.TessBaseAPIClear(detectionScriptHandle); - - return orient_deg; - } - - - private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { - - synchronized (OCRThread.class) { - - ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); - String datapath = System.getenv("TESSDATA_PREFIX"); -// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); - TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); - - return handle; - } - } - - - synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) { - - synchronized (OCRThread.class) { - - ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); - String datapath = System.getenv("TESSDATA_PREFIX"); -// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); - TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0); - - return handle; - } - } - - @SneakyThrows public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) { @@ -192,14 +123,19 @@ public class OCRThread extends Thread { Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3); } - TessBaseAPISetPageSegMode(tesseractHandle, psm); + instance.setVariable("user_defined_dpi", String.valueOf(dpi)); + instance.setPageSegMode(psm); + instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK); + } - Tesseract2.createDocumentsWithResults(pix, - null, - tesseractOutputFileName, - List.of(ITesseract.RenderedFormat.HOCR), - ITessAPI.TessPageIteratorLevel.RIL_BLOCK, - tesseractHandle); + + private static Tesseract2 createInstance(OcrServiceSettings settings) { + + Tesseract2 instance = new Tesseract2(); + instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out + instance.setOcrEngineMode(1); // set to LSTM based Engine + instance.setLanguage(settings.getLanguages()); + return instance; } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index e165845..3185982 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -13,13 +13,13 @@ import lombok.experimental.FieldDefaults; public class OcrServiceSettings { int ocrThreadCount = 16; // Number of OCR threads - int imageExtractThreadCount = 5; // Number of image extraction threads - int gsProcessCount = 5; // Number of Ghostscript processes + int imageExtractThreadCount = 2; // Number of image extraction threads + int gsProcessCount = 2; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed int minImageWidth = 20; // Minimum width for images to be processed - boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes + boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes boolean removeWatermark; // If true, watermarks will be removed String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR"); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index 1727113..d41752d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -2,10 +2,16 @@ package com.knecon.fforesight.service.ocr.processor.utils; import java.awt.AlphaComposite; import java.awt.Color; +import java.awt.Graphics; import java.awt.Graphics2D; import java.awt.Transparency; import java.awt.image.BufferedImage; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; + +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; + import lombok.SneakyThrows; import lombok.experimental.UtilityClass; import net.sourceforge.lept4j.Leptonica1; @@ -15,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils; @UtilityClass public class ImageProcessingUtils { + public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) { + + BufferedImage image; + if (extractedImage.getColorSpace() instanceof PDDeviceRGB || extractedImage.getColorSpace() instanceof PDDeviceGray) { + image = extractedImage.getImage(); + } else { + BufferedImage pdfImage = extractedImage.getImage(); + image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); + Graphics g = image.getGraphics(); + g.drawImage(pdfImage, 0, 0, null); + g.dispose(); + } + return image; + } + + public static Pix despecklePix(Pix pix) { assert pix.d == 8; @@ -23,7 +45,9 @@ public class ImageProcessingUtils { // too small to properly despeckle, just binarize instead. despeckled = Leptonica1.pixThresholdToBinary(pix, 180); } else { - despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... + despeckled = LeptUtils.despeckle(pix, + LeptUtils.SEL_STR3, + 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... if (despeckled == null) { despeckled = Leptonica1.pixThresholdToBinary(pix, 180); } @@ -56,9 +80,8 @@ public class ImageProcessingUtils { @SneakyThrows - public static Pix convertToGrayScale(BufferedImage image) { + public static Pix convertToGrayScale(Pix pix) { - Pix pix = LeptUtils.convertImageToPix(image); if (pix.d == 8) { return pix; } else if (pix.d == 32) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java index eda0685..d85dc46 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java @@ -1,54 +1,45 @@ package com.knecon.fforesight.service.ocr.processor.utils; -import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE; - import java.awt.Rectangle; import java.nio.IntBuffer; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.sun.jna.Pointer; -import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; import net.sourceforge.lept4j.Pix; -import net.sourceforge.tess4j.ITessAPI; -import net.sourceforge.tess4j.ITesseract; import net.sourceforge.tess4j.OCRResult; import net.sourceforge.tess4j.TessAPI1; +import net.sourceforge.tess4j.Tesseract1; +import net.sourceforge.tess4j.TesseractException; import net.sourceforge.tess4j.Word; @Slf4j /** * Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted. */ -@UtilityClass -public class Tesseract2 extends TessAPI1 { +public class Tesseract2 extends Tesseract1 { - private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) { - String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE); + private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) { + + String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE); TessResultRendererBeginDocument(renderer, title); - int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer); + int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer); TessResultRendererEndDocument(renderer); // if (result == ITessAPI.FALSE) { // throw new TesseractException("Error during processing page."); // } - return TessBaseAPIMeanTextConf(handle); + return TessBaseAPIMeanTextConf(getHandle()); } - public OCRResult createDocumentsWithResults(Pix bi, - String filename, - String outputbase, - List formats, - int pageIteratorLevel, - ITessAPI.TessBaseAPI handle) { + public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List formats, int pageIteratorLevel) throws TesseractException { - List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle); + List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel); if (!results.isEmpty()) { return results.get(0); } else { @@ -57,26 +48,24 @@ public class Tesseract2 extends TessAPI1 { } - public List createDocumentsWithResults(Pix[] pixs, - String[] filenames, - String[] outputbases, - List formats, - int pageIteratorLevel, - ITessAPI.TessBaseAPI handle) { + public List createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List formats, int pageIteratorLevel) { if (pixs.length != filenames.length || pixs.length != outputbases.length) { throw new RuntimeException("The three arrays must match in length."); } + init(); + setVariables(); + List results = new ArrayList(); try { for (int i = 0; i < pixs.length; i++) { try { - ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats); - int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer); + TessResultRenderer renderer = createRenderers(outputbases[i], formats); + int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer); TessDeleteResultRenderer(renderer); - List words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList(); + List words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList(); results.add(new OCRResult(meanTextConfidence, words)); } catch (Exception e) { // skip the problematic image file @@ -84,22 +73,20 @@ public class Tesseract2 extends TessAPI1 { } } } finally { - synchronized (OCRThread.class) { - TessAPI1.TessBaseAPIClear(handle); - } + dispose(); } return results; } - private List getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) { + private List getRecognizedWords(int pageIteratorLevel) { List words = new ArrayList<>(); try { - ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle); - ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri); + TessResultIterator ri = TessBaseAPIGetIterator(getHandle()); + TessPageIterator pi = TessResultIteratorGetPageIterator(ri); TessPageIteratorBegin(pi); do { @@ -132,11 +119,11 @@ public class Tesseract2 extends TessAPI1 { } - private ITessAPI.TessResultRenderer createRenderers(String outputbase, List formats) { + private TessResultRenderer createRenderers(String outputbase, List formats) { - ITessAPI.TessResultRenderer renderer = null; + TessResultRenderer renderer = null; - for (ITesseract.RenderedFormat format : formats) { + for (RenderedFormat format : formats) { switch (format) { case HOCR: diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index a4707f8..5b5204a 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/VV-352892.pdf"); + String text = testOCR("files/2009-1048395_50pages_tables.pdf"); } @@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/"; List foundFiles = Files.walk(Path.of(dir)) -// .sorted(Comparator.comparingLong(this::getFileSize)) + .sorted(Comparator.comparingLong(this::getFileSize)) .map(Path::toFile) .filter(file -> file.getName().endsWith(".pdf")) .peek(System.out::println) @@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcrForSpecificFile() { - testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf")); + testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf")); From 955ff6281d79f3c8783bfec82eafa8785a7fe541 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 23 Nov 2023 14:56:00 +0100 Subject: [PATCH 12/16] RED-7669: optimize OCR-module performance * move all critical stuff to its own singleton thread * make gs process queue any image once the file has been written --- .../ocr/processor/model/PageInformation.java | 12 ++ .../model/RenderedPageImageFile.java | 11 +- .../processor/model/RenderedPageOcrImage.java | 37 ++---- .../ocr/processor/model/UnprocessedImage.java | 9 ++ .../processor/service/GhostScriptService.java | 69 ++++++---- .../processor/service/ImageStreamEngine.java | 5 - .../processor/service/OcrImageFactory.java | 13 +- .../service/threads/BlockingQueueFiller.java | 61 +++++++++ .../threads/GhostScriptOutputHandler.java | 122 ++++++++++++++++++ .../threads/ImageExtractionThread.java | 5 +- .../threads/ImageProcessingThread.java | 83 ++++++++++-- .../service/threads/ProcessIOLogger.java | 55 -------- .../settings/OcrServiceSettings.java | 6 +- .../processor/utils/ImageProcessingUtils.java | 4 +- .../ocr/processor/utils/Pdf2ImgTest.java | 28 +--- .../v1/server/OcrServiceIntegrationTest.java | 2 +- 16 files changed, 353 insertions(+), 169 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java delete mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java new file mode 100644 index 0000000..4935eda --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java @@ -0,0 +1,12 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import org.apache.pdfbox.pdmodel.PDPage; + +public record PageInformation(int height, int width, int number, int rotationDegrees) { + + public static PageInformation fromPDPage(int pageNum, PDPage page) { + + return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation()); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java index 2b773fe..4bb78fb 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java @@ -1,5 +1,14 @@ package com.knecon.fforesight.service.ocr.processor.model; -public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) { +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; + +public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage { + + @Override + public Pix asPix() { + + return Leptonica1.pixRead(absoluteFilePath); + } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java index 42abff4..1141eb5 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java @@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDPage; import lombok.AccessLevel; import lombok.Getter; +import lombok.RequiredArgsConstructor; import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; @@ -16,29 +17,17 @@ import net.sourceforge.lept4j.Pix; import net.sourceforge.tess4j.ITessAPI; @Getter -@FieldDefaults(level = AccessLevel.PRIVATE) +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class RenderedPageOcrImage implements OcrImage { - final String absoluteImagePath; - final int height; - final int width; - final PageInformation pageInformation; - final Pix pix; - @Setter + int height; + int width; + PageInformation pageInformation; + Pix pix; int rotationDegrees; - @SneakyThrows - public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) { - - this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)); - this.absoluteImagePath = renderedPageImageFile.absoluteFilePath(); - this.pix = Leptonica1.pixRead(absoluteImagePath); - this.height = getPix().h; - this.width = getPix().w; - } - - @Override public int getOptimalPageSegmentationMode() { @@ -107,7 +96,7 @@ public class RenderedPageOcrImage implements OcrImage { // PDFBox always returns page height and width based on rotation double pageWidth; - if (pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) { + if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) { pageWidth = pageInformation.height(); } else { pageWidth = pageInformation.width(); @@ -116,14 +105,4 @@ public class RenderedPageOcrImage implements OcrImage { return pageWidth / width; } - - private record PageInformation(int height, int width, int number, int rotationDegrees) { - - public static PageInformation fromPDPage(int pageNum, PDPage page) { - - return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation()); - } - - } - } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java new file mode 100644 index 0000000..6facc56 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java @@ -0,0 +1,9 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import net.sourceforge.lept4j.Pix; + +public interface UnprocessedImage { + + Pix asPix(); + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java index 1a4b54e..a767f91 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java @@ -4,18 +4,26 @@ import java.io.InputStream; import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.LinkedTransferQueue; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; +import com.azure.core.implementation.GeoObjectHelper; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.PageInformation; import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage; -import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; +import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller; +import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils; @@ -24,6 +32,7 @@ import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; +import net.sourceforge.lept4j.Pix; @Slf4j @Service @@ -42,17 +51,19 @@ public class GhostScriptService { String documentAbsolutePath, Path tmpImageDir, PDDocument document, - BlockingQueue imageOutputQueue, + BlockingQueue imageProcessingQueue, Statistics stats) { + BlockingQueue imageFileCollectorQueue = new LinkedBlockingDeque<>(); + Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue); + asyncTransferThread.start(); int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size()); List> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers, numOfProcesses, - settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads + 256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) { long timestamp = System.currentTimeMillis(); - List renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>()); List processInfos = processInfoBatches.get(batchIdx); log.info("Batch {}: Running {} gs processes with ({}) pages each", @@ -63,9 +74,9 @@ public class GhostScriptService { int finalBatchIdx = batchIdx; List processes = processInfos.stream() .parallel() - .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles)) - .peek(s -> log.debug(String.join(" ", s))) - .map(this::executeProcess) + .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath)) + .peek(s -> log.debug(String.join(" ", s.cmdArgs()))) + .map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue)) .toList(); List processExitCodes = new LinkedList<>(); @@ -73,14 +84,9 @@ public class GhostScriptService { processExitCodes.add(process.waitFor()); } stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp); - log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx); - for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) { - OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document); - imageOutputQueue.put(image); - } - } + asyncTransferThread.interrupt(); } @@ -107,20 +113,28 @@ public class GhostScriptService { @SneakyThrows - private String[] buildCmdArgs(Integer processIdx, - Integer batchIdx, - List stitchedImagePageIndices, - Path outputDir, - String documentAbsolutePath, - List fullPageImages) { + private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx, + Integer batchIdx, + List stitchedImagePageIndices, + Path outputDir, + String documentAbsolutePath) { String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString(); + Map fullPageImages = new HashMap<>(); for (int i = 0; i < stitchedImagePageIndices.size(); i++) { Integer pageNumber = stitchedImagePageIndices.get(i); - fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1))); + fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1))); } + String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat); + + return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages); + } + + + private String[] buildCmdArgs(List stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) { + StringBuilder sPageList = new StringBuilder(); int i = 1; for (Integer integer : stitchedImagePageIndices) { @@ -131,18 +145,19 @@ public class GhostScriptService { i++; } - return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"}; + String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"}; + return cmdArgs; } @SneakyThrows - private Process executeProcess(String[] cmdArgs) { + private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue imageFileCollectorQueue) { - Process p = Runtime.getRuntime().exec(cmdArgs); + Process p = Runtime.getRuntime().exec(processInfo.cmdArgs()); InputStream stdOut = p.getInputStream(); - ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT); + GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue); InputStream stdError = p.getErrorStream(); - ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR); + GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError); stdOutLogger.start(); stdErrorLogger.start(); @@ -150,6 +165,10 @@ public class GhostScriptService { } + private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map renderedPageImageFiles) { + + } + private record ProcessInfo(Integer processIdx, List stitchedPageNumbers) { } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index 662ae5b..0c64115 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -1,8 +1,6 @@ package com.knecon.fforesight.service.ocr.processor.service; -import java.awt.Graphics; import java.awt.geom.Rectangle2D; -import java.awt.image.BufferedImage; import java.io.IOException; import java.util.LinkedList; import java.util.List; @@ -19,14 +17,11 @@ import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; -import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java index e762d6e..2e913e2 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java @@ -14,6 +14,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread; import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; @@ -32,7 +33,7 @@ public class OcrImageFactory { File documentFile; Path tmpImageDir; GhostScriptService ghostScriptService; - BlockingQueue imageProcessingQueue; + BlockingQueue imageProcessingQueue; ImageProcessingThread imageProcessingThread; BlockingQueue imageOutputQueue; List imageExtractionThreads; @@ -45,7 +46,7 @@ public class OcrImageFactory { Path tmpImageDir, int numberOfThreads, GhostScriptService ghostScriptService, - BlockingQueue imageOutputQueue, + BlockingQueue imageOcrQueue, OcrProgressLogger logger, OcrServiceSettings settings, Statistics stats) { @@ -54,8 +55,8 @@ public class OcrImageFactory { this.documentFile = documentFile; this.tmpImageDir = tmpImageDir; this.ghostScriptService = ghostScriptService; - this.imageOutputQueue = imageOutputQueue; - this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity()); + this.imageOutputQueue = imageOcrQueue; + this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity()); this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>()); this.stats = stats; @@ -65,7 +66,7 @@ public class OcrImageFactory { for (int i = 0; i < balancedPageNumbers.size(); i++) { imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers)); } - this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings); + this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document); log.info("Started {} image extraction threads, with ({}) pages each", imageExtractionThreads.size(), @@ -91,7 +92,7 @@ public class OcrImageFactory { } if (!stitchedPageNumbers.isEmpty()) { - ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats); + ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats); } imageProcessingThread.interrupt(); log.info("All images extracted, interrupting processing thread."); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java new file mode 100644 index 0000000..554e190 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java @@ -0,0 +1,61 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; + +import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +import net.sourceforge.tess4j.TessAPI1; + + +/* +This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously + */ +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class BlockingQueueFiller extends Thread { + + BlockingQueue imageInputQueue; + BlockingQueue imageOutputQueue; + + @SneakyThrows + @Override + public void run() { + + // Interrupting signals that the image extraction has finished + while (true) { + try { + final UnprocessedImage image = imageInputQueue.take(); + try { + imageOutputQueue.put(image); + } catch (InterruptedException e) { + imageOutputQueue.put(image); + break; + } + + } catch (InterruptedException e) { + break; + } + } + // empty the queue + List remainingImages = new ArrayList<>(imageInputQueue.size()); + imageInputQueue.drainTo(remainingImages); + remainingImages.forEach(image -> { + try { + imageOutputQueue.put(image); + } catch (InterruptedException e) { + log.error(e.getMessage()); + } + }); + + } +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java new file mode 100644 index 0000000..0dd0c60 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java @@ -0,0 +1,122 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import java.io.BufferedReader; +import java.io.File; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.BlockingQueue; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class GhostScriptOutputHandler extends Thread { + + static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)"); + + // If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock. + // Since both need to read simultaneously we need to implement the readers as separate threads. + + final InputStream is; + final String processName; + final Type type; + + final Map pagesToProcess; + final BlockingQueue renderedPageImageFileOutput; + + int currentPageNumber; + + + public static GhostScriptOutputHandler errorHandler(InputStream is) { + + return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null); + } + + + public static GhostScriptOutputHandler stdOut(InputStream is, + Map pagesToProcess, + BlockingQueue renderedPageImageFileOutput) { + + return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput); + } + + + @SneakyThrows + public void run() { + + try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) { + + String line; + while (true) { + line = br.readLine(); + + if (line == null) { + break; + } + + if (type.equals(Type.ERROR)) { + log.error(processName + "_" + type.name() + ">" + line); + } else { + log.debug(processName + "_" + type.name() + ">" + line); + addProcessedImageToQueue(line); + } + } + } + is.close(); + if (type.equals(Type.STD_OUT)) { + queueFinishedPage(currentPageNumber); + } + + } + + + private void addProcessedImageToQueue(String line) { + + /* + Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in. + */ + Matcher pageNumberMatcher = pageFinishedPattern.matcher(line); + if (pageNumberMatcher.find()) { + int pageNumber = Integer.parseInt(pageNumberMatcher.group(1)); + + if (currentPageNumber == 0) { + currentPageNumber = pageNumber; + return; + } + + queueFinishedPage(currentPageNumber); + currentPageNumber = pageNumber; + } + } + + + private void queueFinishedPage(int pageNumber) { + + var imageFile = this.pagesToProcess.get(pageNumber); + if (imageFile == null) { + throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet())); + } + assert new File(imageFile.absoluteFilePath()).isFile(); + renderedPageImageFileOutput.add(imageFile); + } + + + public enum Type { + ERROR, + STD_OUT + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index 89161f6..c81067a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -9,6 +9,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; @@ -36,7 +37,7 @@ public class ImageExtractionThread extends Thread { OcrServiceSettings settings; // output is written to these lists - BlockingQueue imageProcessingQueue; + BlockingQueue imageProcessingQueue; List stitchedPageNumbers; @@ -61,7 +62,7 @@ public class ImageExtractionThread extends Thread { } for (ExtractedImage image : extractedImages) { - imageProcessingQueue.put(image); + imageProcessingQueue.put((UnprocessedImage) image); logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java index e42185f..492f571 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -2,15 +2,22 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import static net.sourceforge.tess4j.ITessAPI.TRUE; +import java.lang.annotation.Documented; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.BlockingQueue; +import org.apache.pdfbox.pdmodel.PDDocument; + import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.PageInformation; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.Statistics; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; @@ -35,11 +42,12 @@ import net.sourceforge.tess4j.TessAPI1; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageProcessingThread extends Thread { - BlockingQueue imageInputQueue; + BlockingQueue imageInputQueue; BlockingQueue imageOutputQueue; ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); Statistics stats; OcrServiceSettings settings; + PDDocument document; @SneakyThrows @@ -49,7 +57,7 @@ public class ImageProcessingThread extends Thread { // Interrupting signals that the image extraction has finished while (true) { try { - final ExtractedImage image = imageInputQueue.take(); + final UnprocessedImage image = imageInputQueue.take(); OcrImage extractedOcrImage = this.process(image); try { imageOutputQueue.put(extractedOcrImage); @@ -62,9 +70,8 @@ public class ImageProcessingThread extends Thread { break; } } - log.info("Leaving initial uninterrupted loop!"); // empty the queue - List remainingImages = new ArrayList<>(imageInputQueue.size()); + List remainingImages = new ArrayList<>(imageInputQueue.size()); imageInputQueue.drainTo(remainingImages); remainingImages.forEach(image -> { OcrImage ocrImage = this.process(image); @@ -79,21 +86,61 @@ public class ImageProcessingThread extends Thread { } - private OcrImage process(ExtractedImage extractedImage) { + private OcrImage process(UnprocessedImage unprocessedImage) { long timestamp = System.currentTimeMillis(); + + OcrImage ocrImage; + if (unprocessedImage instanceof ExtractedImage extractedImage) { + ocrImage = processExtractedImage(extractedImage); + } else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) { + ocrImage = processRenderedPageImageFile(renderedPageImageFile); + } else { + throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass())); + } + + stats.increaseImageProcessing(System.currentTimeMillis() - timestamp); + + return ocrImage; + } + + + private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) { + + Pix grayScale = ImageProcessingUtils.convertToGrayScale(renderedPageImageFile.asPix()); + Pix despeckled = ImageProcessingUtils.despecklePix(grayScale); + + int orientDegree = detectOrientation(despeckled, settings.getDpi(), detectionScriptHandle); + Pix rotatedPix = switch (360 - orientDegree) { + case 90 -> Leptonica1.pixRotateOrth(despeckled, 1); + case 180 -> Leptonica1.pixRotateOrth(despeckled, 2); + case 270 -> Leptonica1.pixRotateOrth(despeckled, 3); + default -> despeckled; + }; + + OcrImage ocrImage = new RenderedPageOcrImage(despeckled.h, + despeckled.w, + PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)), + rotatedPix, + orientDegree); + + if (despeckled != rotatedPix) { + LeptUtils.disposePix(despeckled); + } + return ocrImage; + } + + + private OcrImage processExtractedImage(ExtractedImage extractedImage) { + float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72)); Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi()); int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); - Pix rotatedPix = switch (360 - orientDegree) { - case 90 -> Leptonica1.pixRotateOrth(pix, 1); - case 180 -> Leptonica1.pixRotateOrth(pix, 2); - case 270 -> Leptonica1.pixRotateOrth(pix, 3); - default -> pix; - }; - OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(), + Pix rotatedPix = getRotatedPix(orientDegree, pix); + + OcrImage ocrImage = new ExtractedOcrImage(extractedImage.getPageNumber(), extractedImage.getNumberOnPage(), extractedImage.getHeight(), extractedImage.getWidth(), @@ -106,10 +153,18 @@ public class ImageProcessingThread extends Thread { if (pix != rotatedPix) { LeptUtils.disposePix(pix); } + return ocrImage; + } - stats.increaseImageProcessing(System.currentTimeMillis() - timestamp); - return extractedOcrImage; + private static Pix getRotatedPix(int orientDegree, Pix pix) { + + return switch (360 - orientDegree) { + case 90 -> Leptonica1.pixRotateOrth(pix, 1); + case 180 -> Leptonica1.pixRotateOrth(pix, 2); + case 270 -> Leptonica1.pixRotateOrth(pix, 3); + default -> pix; + }; } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java deleted file mode 100644 index b068dd0..0000000 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.knecon.fforesight.service.ocr.processor.service.threads; - -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.InputStreamReader; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@AllArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class ProcessIOLogger extends Thread { - - // If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock. - // Since both need to read simultaneously we need to implement the readers as separate threads. - - InputStream is; - String processName; - Type type; - - - @SneakyThrows - public void run() { - - try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) { - - String line; - while (true) { - line = br.readLine(); - - if (line == null) { - break; - } - - if (type.equals(Type.ERROR)) { - log.error(processName + "_" + type.name() + ">" + line); - } else { - log.debug(processName + "_" + type.name() + ">" + line); - } - } - } - is.close(); - } - - - public enum Type { - ERROR, - STD_OUT - } - -} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 3185982..d8e3665 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -12,14 +12,14 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - int ocrThreadCount = 16; // Number of OCR threads + int ocrThreadCount = 4; // Number of OCR threads int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 2; // Number of Ghostscript processes + int gsProcessCount = 1; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed int minImageWidth = 20; // Minimum width for images to be processed - boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes + boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes boolean removeWatermark; // If true, watermarks will be removed String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR"); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index d41752d..118afba 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -88,10 +88,12 @@ public class ImageProcessingUtils { Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); LeptUtils.disposePix(pix); return grayScale; - } else { + } else if (pix.d == 1) { Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); LeptUtils.disposePix(pix); return grayScale; + } else { + throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d)); } } diff --git a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java index cc5f064..4388da9 100644 --- a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java +++ b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java @@ -1,10 +1,7 @@ package com.knecon.fforesight.service.ocr.processor.utils; import java.awt.image.BufferedImage; -import java.io.BufferedReader; import java.io.File; -import java.io.InputStream; -import java.io.InputStreamReader; import java.util.LinkedList; import java.util.List; import java.util.stream.IntStream; @@ -19,7 +16,7 @@ import org.springframework.core.io.ClassPathResource; import org.springframework.util.FileSystemUtils; import com.knecon.fforesight.service.ocr.processor.service.OsUtils; -import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger; +import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler; import lombok.SneakyThrows; @@ -50,29 +47,6 @@ public class Pdf2ImgTest { } - @Test - @SneakyThrows - public void testGhostScript() { - - String outputDir = "/tmp/ghostscript_out/"; - new File(outputDir).mkdirs(); - ClassPathResource resource = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf"); - - String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", resource.getFile().toString(), "-c", "quit"}; - Process p = Runtime.getRuntime().exec(cmdArgs); - ProcessIOLogger logger = new ProcessIOLogger(p.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT); - logger.start(); - ProcessIOLogger errorLogger = new ProcessIOLogger(p.getErrorStream(), "GS", ProcessIOLogger.Type.STD_OUT); - errorLogger.start(); - int exitcode = p.waitFor(); - logger.join(); - errorLogger.join(); - System.out.println("Ghostscript finished with exit code " + exitcode); - FileSystemUtils.deleteRecursively(new File(outputDir)); - - } - - @Test @SneakyThrows public void testGhostScriptParallel() { diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 5b5204a..62146b0 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/2009-1048395_50pages_tables.pdf"); + String text = testOCR("files/StitchedImagesMultiPage.pdf"); } From 880bebcafc3bd16928176e16b42fd8a3bdb317ae Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 23 Nov 2023 15:21:07 +0100 Subject: [PATCH 13/16] RED-7669: optimize OCR-module performance * move all critical stuff to its own singleton thread * make gs process queue any image once the file has been written --- .../ocr/processor/model/ExtractedImage.java | 16 +------ .../threads/ImageExtractionThread.java | 4 +- .../threads/ImageProcessingThread.java | 46 ++++++------------- .../processor/utils/ImageProcessingUtils.java | 17 +++++-- .../utils/ImageProcessingUtilsTest.java | 34 ++++++++++++++ 5 files changed, 67 insertions(+), 50 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java index 96e96c9..c8d47f2 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java @@ -16,20 +16,8 @@ import lombok.experimental.FieldDefaults; import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.util.LeptUtils; -@Getter -@RequiredArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class ExtractedImage { - - int pageNumber; - QuadPoint position; - int height; - int width; - BufferedImage image; - Matrix ctm; - int numberOnPage; - PDColorSpace colorSpace; - +public record ExtractedImage( + int pageNumber, QuadPoint position, int height, int width, BufferedImage image, Matrix ctm, int numberOnPage, PDColorSpace colorSpace) implements UnprocessedImage { @SneakyThrows public Pix asPix() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index c81067a..d0340ca 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -63,7 +63,7 @@ public class ImageExtractionThread extends Thread { for (ExtractedImage image : extractedImages) { imageProcessingQueue.put((UnprocessedImage) image); - logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); + logger.addImagesToProcess(image.pageNumber(), image.numberOnPage()); } } } @@ -87,7 +87,7 @@ public class ImageExtractionThread extends Thread { } for (ExtractedImage imageOnPage : imagesOnCurrentPage) { - if (imageOnPage.getWidth() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.getHeight() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) { + if (imageOnPage.width() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.height() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) { return true; } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java index 492f571..5dfe1eb 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -2,7 +2,6 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import static net.sourceforge.tess4j.ITessAPI.TRUE; -import java.lang.annotation.Documented; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.util.ArrayList; @@ -28,7 +27,6 @@ import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; -import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.util.LeptUtils; import net.sourceforge.tess4j.ITessAPI; @@ -107,44 +105,39 @@ public class ImageProcessingThread extends Thread { private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) { - Pix grayScale = ImageProcessingUtils.convertToGrayScale(renderedPageImageFile.asPix()); - Pix despeckled = ImageProcessingUtils.despecklePix(grayScale); + Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi()); - int orientDegree = detectOrientation(despeckled, settings.getDpi(), detectionScriptHandle); - Pix rotatedPix = switch (360 - orientDegree) { - case 90 -> Leptonica1.pixRotateOrth(despeckled, 1); - case 180 -> Leptonica1.pixRotateOrth(despeckled, 2); - case 270 -> Leptonica1.pixRotateOrth(despeckled, 3); - default -> despeckled; - }; + int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); + Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix); - OcrImage ocrImage = new RenderedPageOcrImage(despeckled.h, - despeckled.w, + OcrImage ocrImage = new RenderedPageOcrImage(pix.h, + pix.w, PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)), rotatedPix, orientDegree); - if (despeckled != rotatedPix) { - LeptUtils.disposePix(despeckled); + if (pix != rotatedPix) { + LeptUtils.disposePix(pix); } + return ocrImage; } private OcrImage processExtractedImage(ExtractedImage extractedImage) { - float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72)); + float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72)); Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi()); int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); - Pix rotatedPix = getRotatedPix(orientDegree, pix); + Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix); - OcrImage ocrImage = new ExtractedOcrImage(extractedImage.getPageNumber(), - extractedImage.getNumberOnPage(), - extractedImage.getHeight(), - extractedImage.getWidth(), - extractedImage.getCtm(), + OcrImage ocrImage = new ExtractedOcrImage(extractedImage.pageNumber(), + extractedImage.numberOnPage(), + extractedImage.height(), + extractedImage.width(), + extractedImage.ctm(), rotatedPix, pix.h, pix.w, @@ -157,15 +150,6 @@ public class ImageProcessingThread extends Thread { } - private static Pix getRotatedPix(int orientDegree, Pix pix) { - - return switch (360 - orientDegree) { - case 90 -> Leptonica1.pixRotateOrth(pix, 1); - case 180 -> Leptonica1.pixRotateOrth(pix, 2); - case 270 -> Leptonica1.pixRotateOrth(pix, 3); - default -> pix; - }; - } static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index 118afba..cd8d7f8 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -24,10 +24,10 @@ public class ImageProcessingUtils { public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) { BufferedImage image; - if (extractedImage.getColorSpace() instanceof PDDeviceRGB || extractedImage.getColorSpace() instanceof PDDeviceGray) { - image = extractedImage.getImage(); + if (extractedImage.colorSpace() instanceof PDDeviceRGB || extractedImage.colorSpace() instanceof PDDeviceGray) { + image = extractedImage.image(); } else { - BufferedImage pdfImage = extractedImage.getImage(); + BufferedImage pdfImage = extractedImage.image(); image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); Graphics g = image.getGraphics(); g.drawImage(pdfImage, 0, 0, null); @@ -98,6 +98,17 @@ public class ImageProcessingUtils { } + public Pix deRotatePix(int orientDegree, Pix pix) { + + return switch (360 - orientDegree) { + case 90 -> Leptonica1.pixRotateOrth(pix, 1); + case 180 -> Leptonica1.pixRotateOrth(pix, 2); + case 270 -> Leptonica1.pixRotateOrth(pix, 3); + default -> pix; + }; + } + + public static void setAlphaChannelToWhite(BufferedImage image) { if (image.getTransparency() == Transparency.TRANSLUCENT) { diff --git a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java new file mode 100644 index 0000000..746d84b --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java @@ -0,0 +1,34 @@ +package com.knecon.fforesight.service.ocr.processor.utils; + +import static net.sourceforge.lept4j.ILeptonica.IFF_PNG; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; + +class ImageProcessingUtilsTest { + + @BeforeEach + public void loadLeptonica() { + + System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB")); + } + + + @Test + public void testRotation() { + + Pix pix = Leptonica1.pixRead("/home/kschuettler/Downloads/painHarold.webp"); + Pix pix2 = ImageProcessingUtils.deRotatePix(0, pix); + Leptonica1.pixWrite("/tmp/0.png", pix2, IFF_PNG); + Pix pix3 = ImageProcessingUtils.deRotatePix(90, pix); + Leptonica1.pixWrite("/tmp/90.png", pix3, IFF_PNG); + Pix pix4 = ImageProcessingUtils.deRotatePix(180, pix); + Leptonica1.pixWrite("/tmp/180.png", pix4, IFF_PNG); + Pix pix5 = ImageProcessingUtils.deRotatePix(270, pix); + Leptonica1.pixWrite("/tmp/270.png", pix5, IFF_PNG); + } + +} \ No newline at end of file From c7ccbae6fffd5b8cdc80ca5bc9a68dabc16d8951 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 23 Nov 2023 15:21:23 +0100 Subject: [PATCH 14/16] RED-7669: optimize OCR-module performance * move all critical stuff to its own singleton thread * make gs process queue any image once the file has been written --- .../service/ocr/processor/utils/ImageProcessingUtilsTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java index 746d84b..62703cd 100644 --- a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java +++ b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java @@ -3,11 +3,13 @@ package com.knecon.fforesight.service.ocr.processor.utils; import static net.sourceforge.lept4j.ILeptonica.IFF_PNG; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; +@Disabled class ImageProcessingUtilsTest { @BeforeEach From d3190844a31673f8654d86bde56168d5d2184ca6 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 23 Nov 2023 15:49:07 +0100 Subject: [PATCH 15/16] RED-7669: optimize OCR-module performance * move all critical stuff to its own singleton thread * make gs process queue any image once the file has been written --- .../service/ocr/v1/server/OcrServiceIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 62146b0..e20ee6e 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { From 1926707ae106ae651ad4c1deb2afa34b3e27754f Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 23 Nov 2023 16:00:53 +0100 Subject: [PATCH 16/16] RED-7669: optimize OCR-module performance * move all critical stuff to its own singleton thread * make gs process queue any image once the file has been written --- .../service/ocr/v1/server/OcrServiceIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index e20ee6e..06465ad 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/StitchedImagesMultiPage.pdf"); + String text = testOCR("files/2009-1048395_50pages_tables.pdf"); }