From efd3a1d952fe5ab0d8cf9559dbd46d2854852abb Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 22 Nov 2023 16:40:13 +0100 Subject: [PATCH] RED-7669: optimize OCR-module performance * move all non thread safe stuff to separate thread in the middle --- .../ocr/processor/model/ExtractedImage.java | 21 +++ .../processor/model/ExtractedOcrImage.java | 70 ++------ .../service/ocr/processor/model/OcrImage.java | 35 ---- .../processor/service/GhostScriptService.java | 2 +- .../processor/service/ImageStreamEngine.java | 5 +- .../ocr/processor/service/OCRService.java | 4 +- .../processor/service/OcrImageFactory.java | 24 ++- .../ocr/processor/service/Statistics.java | 12 +- .../threads/ImageExtractionThread.java | 19 +- .../threads/ImageProcessingThread.java | 166 ++++++++++++++++++ .../processor/service/threads/OCRThread.java | 98 ++--------- .../settings/OcrServiceSettings.java | 6 +- .../processor/utils/ImageProcessingUtils.java | 29 ++- .../ocr/processor/utils/Tesseract2.java | 61 +++---- .../v1/server/OcrServiceIntegrationTest.java | 6 +- 15 files changed, 316 insertions(+), 242 deletions(-) create mode 100644 ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java index 57ce77f..96e96c9 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java @@ -1,14 +1,20 @@ package com.knecon.fforesight.service.ocr.processor.model; +import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import 
org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; import org.apache.pdfbox.util.Matrix; +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; + import lombok.AccessLevel; import lombok.Getter; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; @Getter @RequiredArgsConstructor @@ -24,4 +30,19 @@ public class ExtractedImage { int numberOnPage; PDColorSpace colorSpace; + + @SneakyThrows + public Pix asPix() { + + BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this); + ImageProcessingUtils.setAlphaChannelToWhite(image); + return LeptUtils.convertImageToPix(image); + } + + + public QuadPoint getImageCoordinatesInInitialUserSpace() { + + return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform()); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index cedcba9..c6abfad 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -1,18 +1,15 @@ package com.knecon.fforesight.service.ocr.processor.model; +import java.awt.Graphics; import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.concurrent.Semaphore; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.util.Matrix; import 
com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; -import com.pdftron.sdf.Obj; -import com.sun.jna.StringArray; -import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; import lombok.Getter; @@ -27,63 +24,20 @@ import net.sourceforge.tess4j.ITessAPI; @Slf4j @Getter @RequiredArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ExtractedOcrImage implements OcrImage { - final int pageNumber; - final Pix pix; - final int originalHeight; - final int originalWidth; - final int height; - final int width; - final Matrix ctm; - final int numberOnPage; - - @Setter + int pageNumber; + int numberOnPage; + int originalHeight; + int originalWidth; + Matrix ctm; + Pix pix; + int height; + int width; int rotationDegrees; - @SneakyThrows - public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) { - - this.pageNumber = pageNumber; - this.numberOnPage = numberOnPage; - this.ctm = ctm; - this.originalHeight = bufferedImage.getHeight(); - this.originalWidth = bufferedImage.getWidth(); - float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(bufferedImage, imageDPI, targetDpi); - this.height = pix.h; - this.width = pix.w; - } - - - public ExtractedOcrImage(ExtractedImage image, int targetDpi) { - this.pageNumber = image.getPageNumber(); - this.numberOnPage = image.getNumberOnPage(); - this.ctm = image.getCtm(); - this.originalHeight = image.getImage().getHeight(); - this.originalWidth = image.getImage().getWidth(); - float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(image.getImage(), imageDPI, targetDpi); - this.height = pix.h; - this.width = pix.w; - } - - - @SneakyThrows - private Pix binarize(BufferedImage image, float 
imageDpi, int targetDpi) { - - ImageProcessingUtils.setAlphaChannelToWhite(image); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script. - Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); - } - } - - @Override public AffineTransform getImageCTM() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java index 37a1806..86cfd6a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java @@ -71,13 +71,6 @@ public interface OcrImage { } - @SneakyThrows - default BufferedImage getRotatedBufferedImage() { - - return LeptUtils.convertPixToImage(getRotatedPix()); - } - - /** * Retrieves the rotation degree of the OCR image. * @@ -94,16 +87,6 @@ public interface OcrImage { int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance - /** - * Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image. - * Currently only quadrant rotations are supported. - * Rotated partial images work, due to the CTM present in the pdf working with any rotation. - * - * @param rotationDegree The rotation degree of the OCR image. - */ - void setRotationDegrees(int rotationDegree); - - /** * Retrieves the buffered image associated with the OCR image. * @@ -112,24 +95,6 @@ public interface OcrImage { Pix getPix(); - /** - * Retrieves the rotated image of the OCR image. 
- * - * @return The rotated BufferedImage object of the OCR image. - */ - default Pix getRotatedPix() { - - synchronized (OCRThread.class) { - return switch (360 - getRotationDegrees()) { - case 90 -> Leptonica1.pixRotateOrth(getPix(), 1); - case 180 -> Leptonica1.pixRotateOrth(getPix(), 2); - case 270 -> Leptonica1.pixRotateOrth(getPix(), 3); - default -> getPix(); - }; - } - } - - default int getDpi() { return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth()); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java index 18d3568..1a4b54e 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java @@ -49,7 +49,7 @@ public class GhostScriptService { List> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers, numOfProcesses, - 2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads + settings.getOcrThreadCount()); // use the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) { long timestamp = System.currentTimeMillis(); List renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>()); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index a022ac4..662ae5b
100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.awt.Graphics; +import java.awt.geom.Rectangle2D; import java.awt.image.BufferedImage; import java.io.IOException; import java.util.LinkedList; @@ -26,6 +27,7 @@ import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import lombok.Getter; @@ -34,7 +36,6 @@ import lombok.SneakyThrows; @Getter public class ImageStreamEngine extends PDFStreamEngine { - private ExtractedOcrImage currentImageOnPage; private List imagesOnCurrentPage; private OcrServiceSettings settings; private int pageNum; @@ -71,6 +72,7 @@ public class ImageStreamEngine extends PDFStreamEngine { Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix(); this.imagesOnCurrentPage.add(new ExtractedImage(pageNum, + QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())), imageXObject.getHeight(), imageXObject.getWidth(), imageXObject.getImage(), @@ -78,7 +80,6 @@ public class ImageStreamEngine extends PDFStreamEngine { imagesOnCurrentPage.size(), imageXObject.getColorSpace())); - //imagesOnPages.add(this.currentImageOnPage); } else if (xobject instanceof PDFormXObject) { PDFormXObject form = (PDFormXObject) xobject; showForm(form); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index c488982..54b8306 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -107,7 +107,7 @@ public class OCRService { int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages()); stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads); - BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads); + BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads)); OcrImageFactory ocrImageFactory = new OcrImageFactory(document, documentFile, @@ -128,7 +128,7 @@ public class OCRService { .toList(); log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size()); ocrImageFactory.join(); - log.info("Extracted all images, interrupting ocr threads"); + log.info("Processed all images, interrupting ocr threads"); ocrThreads.forEach(Thread::interrupt); for (OCRThread ocrThread : ocrThreads) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java index 3ff4683..e762d6e 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java @@ -6,13 +6,16 @@ import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.concurrent.ArrayBlockingQueue; import 
java.util.concurrent.BlockingQueue; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread; +import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils; @@ -29,6 +32,8 @@ public class OcrImageFactory { File documentFile; Path tmpImageDir; GhostScriptService ghostScriptService; + BlockingQueue imageProcessingQueue; + ImageProcessingThread imageProcessingThread; BlockingQueue imageOutputQueue; List imageExtractionThreads; List stitchedPageNumbers; @@ -50,6 +55,7 @@ public class OcrImageFactory { this.tmpImageDir = tmpImageDir; this.ghostScriptService = ghostScriptService; this.imageOutputQueue = imageOutputQueue; + this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity()); this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>()); this.stats = stats; @@ -57,8 +63,10 @@ public class OcrImageFactory { List> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads); for (int i = 0; i < balancedPageNumbers.size(); i++) { - imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers)); + imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers)); } + this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings); + log.info("Started {} image extraction threads, with ({}) 
pages each", imageExtractionThreads.size(), imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", "))); @@ -70,6 +78,8 @@ public class OcrImageFactory { for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) { imageExtractionThread.start(); } + imageProcessingThread.start(); + } @@ -79,11 +89,15 @@ public class OcrImageFactory { for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) { imageExtractionThread.join(); } - if (stitchedPageNumbers.isEmpty()) { - return; - } - ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats); + if (!stitchedPageNumbers.isEmpty()) { + ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats); + } + imageProcessingThread.interrupt(); + log.info("All images extracted, interrupting processing thread."); + + imageProcessingThread.join(); + } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java index 73fe284..97d44e3 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java @@ -15,6 +15,7 @@ public class Statistics { List tesseractDuration; AtomicLong pdf2ImgDuration; AtomicLong writingTextDuration; + AtomicLong imageProcessingDuration; public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) { @@ -23,6 +24,7 @@ public class Statistics { this.tesseractDuration = Collections.synchronizedList(new 
ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L))); this.pdf2ImgDuration = new AtomicLong(0); this.writingTextDuration = new AtomicLong(0); + this.imageProcessingDuration = new AtomicLong(0); } @@ -32,6 +34,12 @@ public class Statistics { } + public void increaseImageProcessing(long duration) { + + imageProcessingDuration.addAndGet(duration); + } + + public void increaseTesseractDuration(int threadId, long duration) { tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration); @@ -53,13 +61,15 @@ public class Statistics { @Override public String toString() { - return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s", + return String.format( + "imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s", ((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), + (float) imageProcessingDuration.get() / 1000, (float) pdf2ImgDuration.get() / 1000, (float) writingTextDuration.get() / 1000); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index 9551bbb..89161f6 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -9,8 +9,6 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; -import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; -import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; @@ -26,7 +24,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageExtractionThread extends Thread { - static double FULL_PAGE_IMAGE_THRESHOLD = 0.98; + static double FULL_PAGE_IMAGE_THRESHOLD = 0.99; static double IMAGE_ALIGNMENT_THRESHOLD = 1; int id; @@ -38,7 +36,7 @@ public class ImageExtractionThread extends Thread { OcrServiceSettings settings; // output is written to these lists - BlockingQueue imageOutputQueue; + BlockingQueue imageProcessingQueue; List stitchedPageNumbers; @@ -50,21 +48,20 @@ public class ImageExtractionThread extends Thread { for (Integer pageIndex : pageIndices) { try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low. 
timestamp = System.currentTimeMillis(); - List extractedImages = getExtractedOcrImages(pageIndex, document); + List extractedImages = getExtractedImages(pageIndex, document); stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp); if (extractedImages.isEmpty()) { logger.logPageSkipped(pageIndex); } - if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) { + if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) { stitchedPageNumbers.add(pageIndex); logger.addImagesToProcess(pageIndex, 0); continue; } for (ExtractedImage image : extractedImages) { - ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi()); - imageOutputQueue.put(ocrImage); + imageProcessingQueue.put(image); logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); } } @@ -72,7 +69,7 @@ public class ImageExtractionThread extends Thread { } - private List getExtractedOcrImages(Integer pageIndex, PDDocument document) { + private List getExtractedImages(Integer pageIndex, PDDocument document) { PDPage page = document.getPage(pageIndex - 1); ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings); @@ -82,14 +79,14 @@ public class ImageExtractionThread extends Thread { @SneakyThrows - private boolean checkForStitchedImages(List imagesOnCurrentPage, PDPage page) { + private boolean checkForFullPageOrStitchedImages(List imagesOnCurrentPage, PDPage page) { if (imagesOnCurrentPage.isEmpty()) { return false; } for (ExtractedImage imageOnPage : imagesOnCurrentPage) { - if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) { + if (imageOnPage.getWidth() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.getHeight() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) { return true; } } diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java new file mode 100644 index 0000000..e42185f --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -0,0 +1,166 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import static net.sourceforge.tess4j.ITessAPI.TRUE; + +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; + +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.service.Statistics; +import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; +import com.sun.jna.ptr.PointerByReference; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; +import net.sourceforge.tess4j.ITessAPI; +import net.sourceforge.tess4j.TessAPI1; + +/* + * This thread does all the image processing. There should only be one, since Leptonica is not thread safe. 
+ */ +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ImageProcessingThread extends Thread { + + BlockingQueue imageInputQueue; + BlockingQueue imageOutputQueue; + ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); + Statistics stats; + OcrServiceSettings settings; + + + @SneakyThrows + @Override + public void run() { + + // Interrupting signals that the image extraction has finished + while (true) { + try { + final ExtractedImage image = imageInputQueue.take(); + OcrImage extractedOcrImage = this.process(image); + try { + imageOutputQueue.put(extractedOcrImage); + } catch (InterruptedException e) { + imageOutputQueue.put(extractedOcrImage); + break; + } + + } catch (InterruptedException e) { + break; + } + } + log.info("Leaving initial uninterrupted loop!"); + // empty the queue + List remainingImages = new ArrayList<>(imageInputQueue.size()); + imageInputQueue.drainTo(remainingImages); + remainingImages.forEach(image -> { + OcrImage ocrImage = this.process(image); + try { + imageOutputQueue.put(ocrImage); + } catch (InterruptedException e) { + log.error(e.getMessage()); + } + }); + + TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); + } + + + private OcrImage process(ExtractedImage extractedImage) { + + long timestamp = System.currentTimeMillis(); + float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72)); + + Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi()); + + int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); + Pix rotatedPix = switch (360 - orientDegree) { + case 90 -> Leptonica1.pixRotateOrth(pix, 1); + case 180 -> Leptonica1.pixRotateOrth(pix, 2); + case 270 -> Leptonica1.pixRotateOrth(pix, 3); + default -> pix; + }; + OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(), + extractedImage.getNumberOnPage(), + 
extractedImage.getHeight(), + extractedImage.getWidth(), + extractedImage.getCtm(), + rotatedPix, + pix.h, + pix.w, + orientDegree); + + if (pix != rotatedPix) { + LeptUtils.disposePix(pix); + } + + stats.increaseImageProcessing(System.currentTimeMillis() - timestamp); + + return extractedOcrImage; + } + + + static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) { + + TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix); + TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi); + + IntBuffer orientationDegreeResultBuffer; + FloatBuffer orientationDegreeConfidenceBuffer; + PointerByReference scriptureNameBuffer; + FloatBuffer scriptureConfidenceBuffer; + + orientationDegreeResultBuffer = IntBuffer.allocate(1); + orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); + scriptureNameBuffer = new PointerByReference(); + scriptureConfidenceBuffer = FloatBuffer.allocate(1); + + int orientationDegree = 0; + int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, + orientationDegreeResultBuffer, + orientationDegreeConfidenceBuffer, + scriptureNameBuffer, + scriptureConfidenceBuffer); + if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) { + orientationDegree = orientationDegreeResultBuffer.get(); + } + + TessAPI1.TessBaseAPIClear(detectionScriptHandle); + + return orientationDegree; + } + + + @SneakyThrows + private Pix binarize(Pix pix, float imageDpi, int targetDpi) { + + Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix); + Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return ImageProcessingUtils.despecklePix(scaledUp); + + } + + + private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { + + ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); + String datapath = System.getenv("TESSDATA_PREFIX"); + TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); + + return handle; + } + +} diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index d0b11b5..9c1a0a7 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -47,8 +47,7 @@ public class OCRThread extends Thread { OcrProgressLogger logger; Statistics stats; OcrServiceSettings settings; - ITessAPI.TessBaseAPI detectionScriptHandle; - ITessAPI.TessBaseAPI tesseractHandle; + Tesseract2 instance; public OCRThread(int id, @@ -66,8 +65,7 @@ public class OCRThread extends Thread { this.logger = logger; this.stats = stats; this.settings = settings; - this.detectionScriptHandle = initDetectionScriptHandle(); - this.tesseractHandle = initTesseractHandle(settings); + this.instance = createInstance(settings); } @@ -92,10 +90,9 @@ public class OCRThread extends Thread { this.process(image); } } catch (NoSuchElementException e) { - log.debug("Processed all Images, finishing."); - TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); - TessAPI1.TessBaseAPIDelete(this.tesseractHandle); + log.debug("Executed tesseract on all Images, finishing."); } + } @@ -107,13 +104,8 @@ public class OCRThread extends Thread { int psm = settings.getPsmOverride() < 0 ? 
image.getOptimalPageSegmentationMode() : settings.getPsmOverride(); - int orientDegree = detectOrientation(image); - image.setRotationDegrees(orientDegree); - Pix rotatedPix = image.getRotatedPix(); - executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName); - + executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName); image.destroyPix(); - LeptUtils.disposePix(rotatedPix); results.add(OcrResult.create(image, tesseractOutputFileName)); logger.logImageFinished(image, psm); @@ -121,67 +113,6 @@ public class OCRThread extends Thread { } - public int detectOrientation(OcrImage image) { - - IntBuffer orientationDegreeResultBuffer; - FloatBuffer orientationDegreeConfidenceBuffer; - PointerByReference scriptureNameBuffer; - FloatBuffer scriptureConfidenceBuffer; - - TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix()); - TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi()); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization. 
- orientationDegreeResultBuffer = IntBuffer.allocate(1); - orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); - scriptureNameBuffer = new PointerByReference(); - scriptureConfidenceBuffer = FloatBuffer.allocate(1); - } - - int orient_deg = 0; - int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, - orientationDegreeResultBuffer, - orientationDegreeConfidenceBuffer, - scriptureNameBuffer, - scriptureConfidenceBuffer); - if (result == TRUE) { - orient_deg = orientationDegreeResultBuffer.get(); - } - - TessAPI1.TessBaseAPIClear(detectionScriptHandle); - - return orient_deg; - } - - - private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { - - synchronized (OCRThread.class) { - - ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); - String datapath = System.getenv("TESSDATA_PREFIX"); -// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); - TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); - - return handle; - } - } - - - synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) { - - synchronized (OCRThread.class) { - - ITessAPI.TessBaseAPI handle = TessBaseAPICreate(); - String datapath = System.getenv("TESSDATA_PREFIX"); -// TessBaseAPISetVariable(handle, "debug_file", "/dev/null"); - TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0); - - return handle; - } - } - - @SneakyThrows public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) { @@ -192,14 +123,19 @@ public class OCRThread extends Thread { Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3); } - TessBaseAPISetPageSegMode(tesseractHandle, psm); + instance.setVariable("user_defined_dpi", String.valueOf(dpi)); + instance.setPageSegMode(psm); + instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK); + } - 
Tesseract2.createDocumentsWithResults(pix, - null, - tesseractOutputFileName, - List.of(ITesseract.RenderedFormat.HOCR), - ITessAPI.TessPageIteratorLevel.RIL_BLOCK, - tesseractHandle); + + private static Tesseract2 createInstance(OcrServiceSettings settings) { + + Tesseract2 instance = new Tesseract2(); + instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out + instance.setOcrEngineMode(1); // set to LSTM based Engine + instance.setLanguage(settings.getLanguages()); + return instance; } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index e165845..3185982 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -13,13 +13,13 @@ import lombok.experimental.FieldDefaults; public class OcrServiceSettings { int ocrThreadCount = 16; // Number of OCR threads - int imageExtractThreadCount = 5; // Number of image extraction threads - int gsProcessCount = 5; // Number of Ghostscript processes + int imageExtractThreadCount = 2; // Number of image extraction threads + int gsProcessCount = 2; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed int minImageWidth = 20; // Minimum width for images to be processed - boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes + boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes boolean removeWatermark; // If true, watermarks 
will be removed String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR"); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index 1727113..d41752d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -2,10 +2,16 @@ package com.knecon.fforesight.service.ocr.processor.utils; import java.awt.AlphaComposite; import java.awt.Color; +import java.awt.Graphics; import java.awt.Graphics2D; import java.awt.Transparency; import java.awt.image.BufferedImage; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; + +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; + import lombok.SneakyThrows; import lombok.experimental.UtilityClass; import net.sourceforge.lept4j.Leptonica1; @@ -15,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils; @UtilityClass public class ImageProcessingUtils { + public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) { + + BufferedImage image; + if (extractedImage.getColorSpace() instanceof PDDeviceRGB || extractedImage.getColorSpace() instanceof PDDeviceGray) { + image = extractedImage.getImage(); + } else { + BufferedImage pdfImage = extractedImage.getImage(); + image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); + Graphics g = image.getGraphics(); + g.drawImage(pdfImage, 0, 0, null); + 
g.dispose(); + } + return image; + } + + public static Pix despecklePix(Pix pix) { assert pix.d == 8; @@ -23,7 +45,9 @@ public class ImageProcessingUtils { // too small to properly despeckle, just binarize instead. despeckled = Leptonica1.pixThresholdToBinary(pix, 180); } else { - despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... + despeckled = LeptUtils.despeckle(pix, + LeptUtils.SEL_STR3, + 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... if (despeckled == null) { despeckled = Leptonica1.pixThresholdToBinary(pix, 180); } @@ -56,9 +80,8 @@ public class ImageProcessingUtils { @SneakyThrows - public static Pix convertToGrayScale(BufferedImage image) { + public static Pix convertToGrayScale(Pix pix) { - Pix pix = LeptUtils.convertImageToPix(image); if (pix.d == 8) { return pix; } else if (pix.d == 32) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java index eda0685..d85dc46 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java @@ -1,54 +1,45 @@ package com.knecon.fforesight.service.ocr.processor.utils; -import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE; - import java.awt.Rectangle; import java.nio.IntBuffer; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import 
com.sun.jna.Pointer; -import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; import net.sourceforge.lept4j.Pix; -import net.sourceforge.tess4j.ITessAPI; -import net.sourceforge.tess4j.ITesseract; import net.sourceforge.tess4j.OCRResult; import net.sourceforge.tess4j.TessAPI1; +import net.sourceforge.tess4j.Tesseract1; +import net.sourceforge.tess4j.TesseractException; import net.sourceforge.tess4j.Word; @Slf4j /** * Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted. */ -@UtilityClass -public class Tesseract2 extends TessAPI1 { +public class Tesseract2 extends Tesseract1 { - private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) { - String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE); + private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) { + + String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE); TessResultRendererBeginDocument(renderer, title); - int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer); + int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer); TessResultRendererEndDocument(renderer); // if (result == ITessAPI.FALSE) { // throw new TesseractException("Error during processing page."); // } - return TessBaseAPIMeanTextConf(handle); + return TessBaseAPIMeanTextConf(getHandle()); } - public OCRResult createDocumentsWithResults(Pix bi, - String filename, - String outputbase, - List formats, - int pageIteratorLevel, - ITessAPI.TessBaseAPI handle) { + public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List formats, int pageIteratorLevel) throws TesseractException { - List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, 
handle); + List results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel); if (!results.isEmpty()) { return results.get(0); } else { @@ -57,26 +48,24 @@ public class Tesseract2 extends TessAPI1 { } - public List createDocumentsWithResults(Pix[] pixs, - String[] filenames, - String[] outputbases, - List formats, - int pageIteratorLevel, - ITessAPI.TessBaseAPI handle) { + public List createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List formats, int pageIteratorLevel) { if (pixs.length != filenames.length || pixs.length != outputbases.length) { throw new RuntimeException("The three arrays must match in length."); } + init(); + setVariables(); + List results = new ArrayList(); try { for (int i = 0; i < pixs.length; i++) { try { - ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats); - int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer); + TessResultRenderer renderer = createRenderers(outputbases[i], formats); + int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer); TessDeleteResultRenderer(renderer); - List words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList(); + List words = meanTextConfidence > 0 ? 
getRecognizedWords(pageIteratorLevel) : new ArrayList(); results.add(new OCRResult(meanTextConfidence, words)); } catch (Exception e) { // skip the problematic image file @@ -84,22 +73,20 @@ public class Tesseract2 extends TessAPI1 { } } } finally { - synchronized (OCRThread.class) { - TessAPI1.TessBaseAPIClear(handle); - } + dispose(); } return results; } - private List getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) { + private List getRecognizedWords(int pageIteratorLevel) { List words = new ArrayList<>(); try { - ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle); - ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri); + TessResultIterator ri = TessBaseAPIGetIterator(getHandle()); + TessPageIterator pi = TessResultIteratorGetPageIterator(ri); TessPageIteratorBegin(pi); do { @@ -132,11 +119,11 @@ public class Tesseract2 extends TessAPI1 { } - private ITessAPI.TessResultRenderer createRenderers(String outputbase, List formats) { + private TessResultRenderer createRenderers(String outputbase, List formats) { - ITessAPI.TessResultRenderer renderer = null; + TessResultRenderer renderer = null; - for (ITesseract.RenderedFormat format : formats) { + for (RenderedFormat format : formats) { switch (format) { case HOCR: diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index a4707f8..5b5204a 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = 
testOCR("files/VV-352892.pdf"); + String text = testOCR("files/2009-1048395_50pages_tables.pdf"); } @@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/"; List foundFiles = Files.walk(Path.of(dir)) -// .sorted(Comparator.comparingLong(this::getFileSize)) + .sorted(Comparator.comparingLong(this::getFileSize)) .map(Path::toFile) .filter(file -> file.getName().endsWith(".pdf")) .peek(System.out::println) @@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcrForSpecificFile() { - testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf")); + testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf")); // testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));