From bb5b4a2fd8ec96c6986518d9dcf58ae1f53bb024 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Tue, 21 Nov 2023 14:37:18 +0100 Subject: [PATCH] RED-7669: optimize OCR-module performance * binarize images after reading --- .../ocr/processor/model/ExtractedImage.java | 27 +++++++ .../processor/model/ExtractedOcrImage.java | 15 ++++ .../service/ocr/processor/model/OcrImage.java | 16 +++++ .../ocr/processor/model/QuadPoint.java | 6 ++ .../processor/service/ImageStreamEngine.java | 26 +++---- .../ocr/processor/service/OCRService.java | 2 +- .../threads/ImageExtractionThread.java | 39 ++++++----- .../processor/service/threads/OCRThread.java | 70 +++++++++++-------- .../settings/OcrServiceSettings.java | 6 +- .../ocr/processor/utils/Tesseract2.java | 61 +++++++++------- .../v1/server/OcrServiceIntegrationTest.java | 8 +-- 11 files changed, 182 insertions(+), 94 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java new file mode 100644 index 0000000..57ce77f --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java @@ -0,0 +1,27 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import java.awt.image.BufferedImage; + +import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; +import org.apache.pdfbox.util.Matrix; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Getter +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ExtractedImage { + + int pageNumber; + QuadPoint position; + int height; + int width; + BufferedImage image; + Matrix ctm; + int numberOnPage; + PDColorSpace colorSpace; + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 819ccbe..cedcba9 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -11,6 +11,8 @@ import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; import com.pdftron.sdf.Obj; +import com.sun.jna.StringArray; +import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; import lombok.Getter; @@ -56,6 +58,19 @@ public class ExtractedOcrImage implements OcrImage { } + public ExtractedOcrImage(ExtractedImage image, int targetDpi) { + this.pageNumber = image.getPageNumber(); + this.numberOnPage = image.getNumberOnPage(); + this.ctm = image.getCtm(); + this.originalHeight = image.getImage().getHeight(); + this.originalWidth = image.getImage().getWidth(); + float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72)); + this.pix = binarize(image.getImage(), imageDPI, targetDpi); + this.height = pix.h; + this.width = pix.w; + } + + @SneakyThrows private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java index 3afb0a8..37a1806 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java @@ -2,10 +2,12 @@ package com.knecon.fforesight.service.ocr.processor.model; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; +import java.awt.image.BufferedImage; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator; +import lombok.SneakyThrows; import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.util.LeptUtils; @@ -62,6 +64,20 @@ public interface OcrImage { } + @SneakyThrows + default BufferedImage getBufferedImage() { + + return LeptUtils.convertPixToImage(getPix()); + } + + + @SneakyThrows + default BufferedImage getRotatedBufferedImage() { + + return LeptUtils.convertPixToImage(getRotatedPix()); + } + + /** * Retrieves the rotation degree of the OCR image. * diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java index 53fc7b6..c40aa1d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java @@ -97,4 +97,10 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) { d().getY()); } + + public double size() { + + return a().distance(b()) * a().distance(d()); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index 9b6d2d5..a022ac4 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -24,6 +24,7 @@ import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.util.Matrix; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; @@ -34,7 +35,7 @@ import lombok.SneakyThrows; public class ImageStreamEngine extends PDFStreamEngine { private ExtractedOcrImage currentImageOnPage; - private List imagesOnCurrentPage; + private List imagesOnCurrentPage; private OcrServiceSettings settings; private int pageNum; @@ -69,21 +70,14 @@ public class ImageStreamEngine extends PDFStreamEngine { } Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix(); - if (imageXObject.getColorSpace() instanceof PDDeviceRGB) { - BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } else if (imageXObject.getColorSpace() instanceof PDDeviceGray) { - BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } else { - BufferedImage pdfImage = imageXObject.getImage(); - BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); - Graphics g = image.getGraphics(); - g.drawImage(pdfImage, 0, 0, null); - g.dispose(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } - this.imagesOnCurrentPage.add(this.currentImageOnPage); + this.imagesOnCurrentPage.add(new ExtractedImage(pageNum, + imageXObject.getHeight(), + imageXObject.getWidth(), + imageXObject.getImage(), + imageCTM, + imagesOnCurrentPage.size(), + imageXObject.getColorSpace())); + //imagesOnPages.add(this.currentImageOnPage); } else if (xobject instanceof PDFormXObject) { PDFormXObject form = (PDFormXObject) xobject; diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index 3ed6193..c488982 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -107,7 +107,7 @@ public class OCRService { int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages()); stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads); - BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads); + BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads); OcrImageFactory ocrImageFactory = new OcrImageFactory(document, documentFile, diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index 3b29836..9551bbb 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -5,10 +5,10 @@ import java.util.List; import java.util.concurrent.BlockingQueue; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine; @@ -26,6 +26,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageExtractionThread extends Thread { + static double FULL_PAGE_IMAGE_THRESHOLD = 0.98; static double IMAGE_ALIGNMENT_THRESHOLD = 1; int id; @@ -40,6 +41,7 @@ public class ImageExtractionThread extends Thread { BlockingQueue imageOutputQueue; List stitchedPageNumbers; + @SneakyThrows @Override public void run() { @@ -48,20 +50,21 @@ public class ImageExtractionThread extends Thread { for (Integer pageIndex : pageIndices) { try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low. timestamp = System.currentTimeMillis(); - List extractedOcrImages = getExtractedOcrImages(pageIndex, document); + List extractedImages = getExtractedOcrImages(pageIndex, document); stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp); - if (extractedOcrImages.isEmpty()) { + if (extractedImages.isEmpty()) { logger.logPageSkipped(pageIndex); } - if (checkForStitchedImages(extractedOcrImages)) { + if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) { stitchedPageNumbers.add(pageIndex); logger.addImagesToProcess(pageIndex, 0); continue; } - for (ExtractedOcrImage image : extractedOcrImages) { - imageOutputQueue.put(image); + for (ExtractedImage image : extractedImages) { + ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi()); + imageOutputQueue.put(ocrImage); logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); } } @@ -69,7 +72,7 @@ public class ImageExtractionThread extends Thread { } - private List getExtractedOcrImages(Integer pageIndex, PDDocument document) { + private List getExtractedOcrImages(Integer pageIndex, PDDocument document) { PDPage page = document.getPage(pageIndex - 1); ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings); @@ -79,22 +82,22 @@ public class ImageExtractionThread extends Thread { @SneakyThrows - private boolean checkForStitchedImages(List imagesOnCurrentPage) { + private boolean checkForStitchedImages(List imagesOnCurrentPage, PDPage page) { - if (imagesOnCurrentPage.size() <= 1) { + if (imagesOnCurrentPage.isEmpty()) { return false; } - //checking for intersections or direct alignment of images - ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()]; - int index = 0; - for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) { - imageOnPagesArray[index] = imageOnPage; - index++; + for (ExtractedImage imageOnPage : imagesOnCurrentPage) { + if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) { + return true; + } } - for (int j = 0; j < imageOnPagesArray.length; j++) { - for (int i = j + 1; i < imageOnPagesArray.length; i++) { - if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) { + + //checking for intersections or direct alignment of images + for (int j = 0; j < imagesOnCurrentPage.size(); j++) { + for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) { + if (imagesOnCurrentPage.get(j).getImageCoordinatesInInitialUserSpace().aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) { // TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely return true; } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index ad567ef..d0b11b5 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -1,6 +1,10 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import static net.sourceforge.tess4j.ITessAPI.TRUE; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable; import java.io.File; import java.nio.FloatBuffer; @@ -16,6 +20,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2; +import com.sun.jna.StringArray; import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; @@ -42,8 +47,8 @@ public class OCRThread extends Thread { OcrProgressLogger logger; Statistics stats; OcrServiceSettings settings; - Tesseract2 instance; ITessAPI.TessBaseAPI detectionScriptHandle; + ITessAPI.TessBaseAPI tesseractHandle; public OCRThread(int id, @@ -61,8 +66,8 @@ public class OCRThread extends Thread { this.logger = logger; this.stats = stats; this.settings = settings; - this.instance = createInstance(settings); this.detectionScriptHandle = initDetectionScriptHandle(); + this.tesseractHandle = initTesseractHandle(settings); } @@ -88,9 +93,9 @@ public class OCRThread extends Thread { } } catch (NoSuchElementException e) { log.debug("Processed all Images, finishing."); + TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); + TessAPI1.TessBaseAPIDelete(this.tesseractHandle); } - - TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); } @@ -107,10 +112,8 @@ public class OCRThread extends Thread { Pix rotatedPix = image.getRotatedPix(); executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName); - synchronized (OCRThread.class) { - image.destroyPix(); - LeptUtils.disposePix(rotatedPix); - } + image.destroyPix(); + LeptUtils.disposePix(rotatedPix); results.add(OcrResult.create(image, tesseractOutputFileName)); logger.logImageFinished(image, psm); @@ -145,21 +148,37 @@ public class OCRThread extends Thread { orient_deg = orientationDegreeResultBuffer.get(); } - synchronized (OCRThread.class) { - TessAPI1.TessBaseAPIClear(detectionScriptHandle); - } + TessAPI1.TessBaseAPIClear(detectionScriptHandle); return orient_deg; } - synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { + private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { - ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); - String datapath = System.getenv("TESSDATA_PREFIX"); - TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); + synchronized (OCRThread.class) { - return handle; + ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); + String datapath = System.getenv("TESSDATA_PREFIX"); +// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); + TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); + + return handle; + } + } + + + synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) { + + synchronized (OCRThread.class) { + + ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); + String datapath = System.getenv("TESSDATA_PREFIX"); +// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); + TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0); + + return handle; + } } @@ -173,19 +192,14 @@ public class OCRThread extends Thread { Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3); } - instance.setVariable("user_defined_dpi", String.valueOf(dpi)); - instance.setPageSegMode(psm); - instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK); - } + TessBaseAPISetPageSegMode(tesseractHandle, psm); - - private static Tesseract2 createInstance(OcrServiceSettings settings) { - - Tesseract2 instance = new Tesseract2(); - instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out - instance.setOcrEngineMode(1); // set to LSTM based Engine - instance.setLanguage(settings.getLanguages()); - return instance; + Tesseract2.createDocumentsWithResults(pix, + null, + tesseractOutputFileName, + List.of(ITesseract.RenderedFormat.HOCR), + ITessAPI.TessPageIteratorLevel.RIL_BLOCK, + tesseractHandle); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 0592808..e165845 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - int ocrThreadCount = 4; // Number of OCR threads - int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 2; // Number of Ghostscript processes + int ocrThreadCount = 16; // Number of OCR threads + int imageExtractThreadCount = 5; // Number of image extraction threads + int gsProcessCount = 5; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java index d85dc46..eda0685 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java @@ -1,45 +1,54 @@ package com.knecon.fforesight.service.ocr.processor.utils; +import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE; + import java.awt.Rectangle; import java.nio.IntBuffer; import java.util.ArrayList; import java.util.List; +import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.sun.jna.Pointer; +import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; import net.sourceforge.lept4j.Pix; +import net.sourceforge.tess4j.ITessAPI; +import net.sourceforge.tess4j.ITesseract; import net.sourceforge.tess4j.OCRResult; import net.sourceforge.tess4j.TessAPI1; -import net.sourceforge.tess4j.Tesseract1; -import net.sourceforge.tess4j.TesseractException; import net.sourceforge.tess4j.Word; @Slf4j /** * Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted. */ -public class Tesseract2 extends Tesseract1 { +@UtilityClass +public class Tesseract2 extends TessAPI1 { + private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) { - private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) { - - String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE); + String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE); TessResultRendererBeginDocument(renderer, title); - int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer); + int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer); TessResultRendererEndDocument(renderer); // if (result == ITessAPI.FALSE) { // throw new TesseractException("Error during processing page."); // } - return TessBaseAPIMeanTextConf(getHandle()); + return TessBaseAPIMeanTextConf(handle); } - public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List formats, int pageIteratorLevel) throws TesseractException { + public OCRResult createDocumentsWithResults(Pix bi, + String filename, + String outputbase, + List formats, + int pageIteratorLevel, + ITessAPI.TessBaseAPI handle) { - List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel); + List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle); if (!results.isEmpty()) { return results.get(0); } else { @@ -48,24 +57,26 @@ public class Tesseract2 extends Tesseract1 { } - public List createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List formats, int pageIteratorLevel) { + public List createDocumentsWithResults(Pix[] pixs, + String[] filenames, + String[] outputbases, + List formats, + int pageIteratorLevel, + ITessAPI.TessBaseAPI handle) { if (pixs.length != filenames.length || pixs.length != outputbases.length) { throw new RuntimeException("The three arrays must match in length."); } - init(); - setVariables(); - List results = new ArrayList(); try { for (int i = 0; i < pixs.length; i++) { try { - TessResultRenderer renderer = createRenderers(outputbases[i], formats); - int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer); + ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats); + int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer); TessDeleteResultRenderer(renderer); - List words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList(); + List words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList(); results.add(new OCRResult(meanTextConfidence, words)); } catch (Exception e) { // skip the problematic image file @@ -73,20 +84,22 @@ public class Tesseract2 extends Tesseract1 { } } } finally { - dispose(); + synchronized (OCRThread.class) { + TessAPI1.TessBaseAPIClear(handle); + } } return results; } - private List getRecognizedWords(int pageIteratorLevel) { + private List getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) { List words = new ArrayList<>(); try { - TessResultIterator ri = TessBaseAPIGetIterator(getHandle()); - TessPageIterator pi = TessResultIteratorGetPageIterator(ri); + ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle); + ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri); TessPageIteratorBegin(pi); do { @@ -119,11 +132,11 @@ public class Tesseract2 extends Tesseract1 { } - private TessResultRenderer createRenderers(String outputbase, List formats) { + private ITessAPI.TessResultRenderer createRenderers(String outputbase, List formats) { - TessResultRenderer renderer = null; + ITessAPI.TessResultRenderer renderer = null; - for (RenderedFormat format : formats) { + for (ITesseract.RenderedFormat format : formats) { switch (format) { case HOCR: diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 06465ad..a4707f8 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. +//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help. @SpringBootTest() public class OcrServiceIntegrationTest extends AbstractTest { @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/2009-1048395_50pages_tables.pdf"); + String text = testOCR("files/VV-352892.pdf"); } @@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/"; List foundFiles = Files.walk(Path.of(dir)) - .sorted(Comparator.comparingLong(this::getFileSize)) +// .sorted(Comparator.comparingLong(this::getFileSize)) .map(Path::toFile) .filter(file -> file.getName().endsWith(".pdf")) .peek(System.out::println) @@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcrForSpecificFile() { - testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf")); + testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));