From 759bae64998b6fbd252a37de352e8e24e36dd52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Mon, 20 Nov 2023 09:55:48 +0100 Subject: [PATCH] RED-7669: optimize OCR-module performance --- .../processor/model/ExtractedOcrImage.java | 73 +++----------- .../service/ocr/processor/model/OcrImage.java | 15 +-- .../processor/service/ImageStreamEngine.java | 9 +- .../processor/service/threads/OCRThread.java | 39 ++++++-- .../processor/utils/ImageProcessingUtils.java | 96 +++++++++++++++++++ .../ocr/processor/utils/Tesseract2.java | 3 + .../v1/server/OcrServiceIntegrationTest.java | 62 ++++++++++++ 7 files changed, 217 insertions(+), 80 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 72633a1..74f792d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -8,9 +8,14 @@ import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; import java.io.IOException; import java.nio.IntBuffer; +import java.util.concurrent.Semaphore; import org.apache.pdfbox.util.Matrix; +import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; +import com.pdftron.sdf.Obj; + import lombok.AccessLevel; import lombok.Getter; import lombok.RequiredArgsConstructor; @@ -41,8 +46,9 @@ public class ExtractedOcrImage implements OcrImage { @Setter int rotationDegrees; + @SneakyThrows - 
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi, boolean isGray) { + public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) { this.pageNumber = pageNumber; this.numberOnPage = numberOnPage; @@ -50,72 +56,21 @@ public class ExtractedOcrImage implements OcrImage { this.originalHeight = bufferedImage.getHeight(); this.originalWidth = bufferedImage.getWidth(); float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(bufferedImage, imageDPI, targetDpi, isGray); + this.pix = binarize(bufferedImage, imageDPI, targetDpi); this.height = pix.h; this.width = pix.w; } @SneakyThrows - private Pix binarize(BufferedImage image, float imageDpi, int targetDpi, boolean isGray) { + private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { - setAlphaChannelToWhite(image); - Pix grayScale = convertToGrayScale(image, isGray); - Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale); - Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3); - LeptUtils.disposePix(scaledUp); - return despeckled; - } + ImageProcessingUtils.setAlphaChannelToWhite(image); - - private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { - - Pix scaledUp; - float targetFactor = targetDpi / imageDpi; - - if (targetFactor > 3) { - scaledUp = Leptonica1.pixScaleGray4xLI(grayScale); - LeptUtils.disposePix(grayScale); - } else if (targetFactor > 1.9) { - scaledUp = Leptonica1.pixScaleGray2xLI(grayScale); - LeptUtils.disposePix(grayScale); - } else { - scaledUp = grayScale; - } - return scaledUp; - } - - - private static Pix convertToGrayScale(BufferedImage image, boolean isGray) throws IOException { - - Pix pix = LeptUtils.convertImageToPix(image); - Pix grayScale; - if (isGray) { - grayScale = pix; - } else { - grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); - 
LeptUtils.disposePix(pix); - } - return grayScale; - } - - - private static void setAlphaChannelToWhite(BufferedImage image) { - - if (image.getTransparency() == Transparency.TRANSLUCENT) { - // NOTE: For BITMASK images, the color model is likely IndexColorModel, - // and this model will contain the "real" color of the transparent parts - // which is likely a better fit than unconditionally setting it to white. - - // Fill background with white - Graphics2D graphics = image.createGraphics(); - try { - graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind" - graphics.setPaint(Color.WHITE); - graphics.fillRect(0, 0, image.getWidth(), image.getHeight()); - } finally { - graphics.dispose(); - } + synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in the tesseract orientation/script detection (see OCRThread.detectOrientation). + Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); + Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return ImageProcessingUtils.despecklePix(scaledUp); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java index be30547..3afb0a8 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.ocr.processor.model; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; +import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator; import net.sourceforge.lept4j.Leptonica1; @@ -102,12 +103,14 @@ public interface OcrImage { */ default Pix getRotatedPix() 
{ - return switch (360 - getRotationDegrees()) { - case 90 -> Leptonica1.pixRotateOrth(getPix(), 1); - case 180 -> Leptonica1.pixRotateOrth(getPix(), 2); - case 270 -> Leptonica1.pixRotateOrth(getPix(), 3); - default -> getPix(); - }; + synchronized (OCRThread.class) { + return switch (360 - getRotationDegrees()) { + case 90 -> Leptonica1.pixRotateOrth(getPix(), 1); + case 180 -> Leptonica1.pixRotateOrth(getPix(), 2); + case 270 -> Leptonica1.pixRotateOrth(getPix(), 3); + default -> getPix(); + }; + } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index cb43753..9b6d2d5 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -71,18 +71,17 @@ public class ImageStreamEngine extends PDFStreamEngine { Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix(); if (imageXObject.getColorSpace() instanceof PDDeviceRGB) { BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), false); + this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); } else if (imageXObject.getColorSpace() instanceof PDDeviceGray) { BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true); + this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); } else { BufferedImage pdfImage = 
imageXObject.getImage(); - BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), - BufferedImage.TYPE_BYTE_GRAY); + BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); Graphics g = image.getGraphics(); g.drawImage(pdfImage, 0, 0, null); g.dispose(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true); + this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); } this.imagesOnCurrentPage.add(this.currentImageOnPage); //imagesOnPages.add(this.currentImageOnPage); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index 4ab322a..ad567ef 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -89,6 +89,8 @@ public class OCRThread extends Thread { } catch (NoSuchElementException e) { log.debug("Processed all Images, finishing."); } + + TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); } @@ -104,8 +106,11 @@ public class OCRThread extends Thread { image.setRotationDegrees(orientDegree); Pix rotatedPix = image.getRotatedPix(); executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName); - image.destroyPix(); - LeptUtils.disposePix(rotatedPix); + + synchronized (OCRThread.class) { + image.destroyPix(); + LeptUtils.disposePix(rotatedPix); + } results.add(OcrResult.create(image, tesseractOutputFileName)); logger.logImageFinished(image, psm); @@ -115,26 +120,40 @@ public class OCRThread extends Thread 
{ public int detectOrientation(OcrImage image) { + IntBuffer orientationDegreeResultBuffer; + FloatBuffer orientationDegreeConfidenceBuffer; + PointerByReference scriptureNameBuffer; + FloatBuffer scriptureConfidenceBuffer; + TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix()); TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi()); - IntBuffer orient_degB = IntBuffer.allocate(1); - FloatBuffer orient_confB = FloatBuffer.allocate(1); - PointerByReference script_nameB = new PointerByReference(); - FloatBuffer script_confB = FloatBuffer.allocate(1); + synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization. + orientationDegreeResultBuffer = IntBuffer.allocate(1); + orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); + scriptureNameBuffer = new PointerByReference(); + scriptureConfidenceBuffer = FloatBuffer.allocate(1); + } int orient_deg = 0; - int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, orient_degB, orient_confB, script_nameB, script_confB); + int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, + orientationDegreeResultBuffer, + orientationDegreeConfidenceBuffer, + scriptureNameBuffer, + scriptureConfidenceBuffer); if (result == TRUE) { - orient_deg = orient_degB.get(); + orient_deg = orientationDegreeResultBuffer.get(); + } + + synchronized (OCRThread.class) { + TessAPI1.TessBaseAPIClear(detectionScriptHandle); } - TessAPI1.TessBaseAPIClear(detectionScriptHandle); return orient_deg; } - private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { + synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); String datapath = System.getenv("TESSDATA_PREFIX"); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java new file mode 100644 index 0000000..ffa9f74 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -0,0 +1,96 @@ +package com.knecon.fforesight.service.ocr.processor.utils; + +import java.awt.AlphaComposite; +import java.awt.Color; +import java.awt.Graphics2D; +import java.awt.Transparency; +import java.awt.image.BufferedImage; +import java.io.IOException; + +import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; + +@UtilityClass +public class ImageProcessingUtils { + + public static Pix despecklePix(Pix pix) { + + assert pix.d == 8; + Pix despeckled; + if (pix.w < 100 || pix.h < 100) { + // too small to properly despeckle, just binarize instead. + despeckled = Leptonica1.pixThresholdToBinary(pix, 180); + } else { + despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... 
+ if (despeckled == null) { + despeckled = Leptonica1.pixThresholdToBinary(pix, 180); + } + } + if (pix != despeckled) { + LeptUtils.disposePix(pix); + } + return despeckled; + } + + + public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { + + float targetFactor = targetDpi / imageDpi; + + if (targetFactor > 3) { + Pix scaledUp; + scaledUp = Leptonica1.pixScaleGray4xLI(grayScale); + LeptUtils.disposePix(grayScale); + return scaledUp; + } else if (targetFactor > 1.9) { + Pix scaledUp; + scaledUp = Leptonica1.pixScaleGray2xLI(grayScale); + LeptUtils.disposePix(grayScale); + return scaledUp; + } else { + return grayScale; + } + } + + + @SneakyThrows + public static Pix convertToGrayScale(BufferedImage image) { + + Pix pix = LeptUtils.convertImageToPix(image); + if (pix.d == 8) { + return pix; + } else if (pix.d == 32) { + Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); + LeptUtils.disposePix(pix); + return grayScale; + } else { + Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); + LeptUtils.disposePix(pix); + return grayScale; + } + } + + + public static void setAlphaChannelToWhite(BufferedImage image) { + + if (image.getTransparency() == Transparency.TRANSLUCENT) { + // NOTE: For BITMASK images, the color model is likely IndexColorModel, + // and this model will contain the "real" color of the transparent parts + // which is likely a better fit than unconditionally setting it to white. 
+ + // Fill background with white + Graphics2D graphics = image.createGraphics(); + try { + graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind" + graphics.setPaint(Color.WHITE); + graphics.fillRect(0, 0, image.getWidth(), image.getHeight()); + } finally { + graphics.dispose(); + } + } + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java index 6332d86..d85dc46 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java @@ -16,6 +16,9 @@ import net.sourceforge.tess4j.TesseractException; import net.sourceforge.tess4j.Word; @Slf4j +/** + * Overridden version of Tesseract1 that exists only so Tesseract can be used with Pix objects instead of BufferedImages. All functions are copied from Tesseract1, with the BufferedImage -> Pix conversion removed. 
+ */ public class Tesseract2 extends Tesseract1 { diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 3456651..06465ad 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -4,10 +4,15 @@ import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTex import static com.knecon.fforesight.service.ocr.processor.service.OsUtils.getTemporaryDirectory; import static org.assertj.core.api.Assertions.assertThat; +import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.nio.file.Files; import java.nio.file.Path; +import java.util.Comparator; +import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -127,4 +132,77 @@ public class OcrServiceIntegrationTest extends AbstractTest { } } + + /** + * Manual bulk run: executes OCR on every PDF found below a developer-local directory, smallest file first. Disabled because the directory only exists on the author's machine. + */ + @Test + @Disabled("Needs PDFs from a developer-local directory; run manually") + @SneakyThrows + public void testOcrForAllDMFiles() { + + String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/"; + List<File> foundFiles; + try (var paths = Files.walk(Path.of(dir))) { // Files.walk keeps directory handles open until the stream is closed + foundFiles = paths.sorted(Comparator.comparingLong(this::getFileSize)) + .map(Path::toFile) + .filter(file -> file.getName().endsWith(".pdf")) + .peek(System.out::println) + .toList(); + } + int fileCount = foundFiles.size(); + AtomicInteger processedCount = new AtomicInteger(); + System.out.printf("Found %s files, starting OCR for each.%n%n", fileCount); + foundFiles.stream().peek(file -> System.out.printf("%s/%s: %s%n", processedCount.incrementAndGet(), fileCount, file)).forEach(this::testOCRForFile); + } + + 
+ /** + * Returns the size of the given path in bytes via Files.size; the checked IOException is rethrown unchecked by @SneakyThrows so the method can be used as a sort key. + */ + @SneakyThrows + public long getFileSize(Path path) { + + return Files.size(path); + } + + + /** + * Manual run of OCR on a single developer-local PDF. Disabled because the file only exists on the author's machine. + */ + @Test + @Disabled("Needs a PDF from a developer-local directory; run manually") + @SneakyThrows + public void testOcrForSpecificFile() { + + testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf")); +// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf")); +// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf")); +// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf")); +// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf")); +// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/ITEM 23_A15149W - Dermal absorption of formulated product.pdf")); +// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 16_Toxicidade Cutanea Aguda.pdf")); + } + + + + /** + * Stores the given file as the ORIGIN object of the test dossier, runs OCR on it and writes the result to a file of the same name in the temporary directory. + */ + @SneakyThrows + private void testOCRForFile(File file) { + + var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN); + try (var fileStream = new FileInputStream(file)) { + storageService.storeObject(TenantContext.getTenantId(), originId, fileStream); + } + + Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(file.getAbsolutePath()).getFileName()); + try (var out = new FileOutputStream(tmpFileName.toFile())) { + ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out); + System.out.println("File:" + tmpFileName); + } + System.out.println("\n\n"); + } + }