RED-7669: optimize OCR-module performance

* move all non-thread-safe stuff to a separate thread in the middle
This commit is contained in:
Kilian Schuettler 2023-11-22 16:40:13 +01:00
parent bb5b4a2fd8
commit efd3a1d952
15 changed files with 316 additions and 242 deletions

View File

@ -1,14 +1,20 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@Getter
@RequiredArgsConstructor
@ -24,4 +30,19 @@ public class ExtractedImage {
int numberOnPage;
PDColorSpace colorSpace;
/**
 * Converts this extracted image into a Leptonica {@link Pix}.
 * <p>
 * The image is first brought into a device color space via
 * {@link ImageProcessingUtils#convertToDeviceColorSpace(ExtractedImage)}, then its alpha channel is set to white,
 * and finally the result is handed to {@link LeptUtils#convertImageToPix}.
 *
 * @return a newly created Pix; NOTE(review): presumably the caller is responsible for disposing it - confirm.
 */
@SneakyThrows
public Pix asPix() {

    BufferedImage deviceImage = ImageProcessingUtils.convertToDeviceColorSpace(this);
    ImageProcessingUtils.setAlphaChannelToWhite(deviceImage);
    Pix converted = LeptUtils.convertImageToPix(deviceImage);
    return converted;
}
/**
 * Computes where this image ends up after applying its current transformation matrix.
 * <p>
 * The unit square (0,0)-(1,1) is converted to a {@link QuadPoint} and transformed with the affine transform
 * created from {@code ctm}.
 *
 * @return the transformed quad
 */
public QuadPoint getImageCoordinatesInInitialUserSpace() {

    Rectangle2D.Double unitSquare = new Rectangle2D.Double(0, 0, 1, 1);
    QuadPoint unitQuad = QuadPoint.fromRectangle2D(unitSquare);
    return unitQuad.getTransformed(ctm.createAffineTransform());
}
}

View File

@ -1,18 +1,15 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.Graphics;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.IntBuffer;
import java.util.concurrent.Semaphore;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.pdftron.sdf.Obj;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.Getter;
@ -27,63 +24,20 @@ import net.sourceforge.tess4j.ITessAPI;
@Slf4j
@Getter
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ExtractedOcrImage implements OcrImage {
final int pageNumber;
final Pix pix;
final int originalHeight;
final int originalWidth;
final int height;
final int width;
final Matrix ctm;
final int numberOnPage;
@Setter
int pageNumber;
int numberOnPage;
int originalHeight;
int originalWidth;
Matrix ctm;
Pix pix;
int height;
int width;
int rotationDegrees;
@SneakyThrows
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) {
this.pageNumber = pageNumber;
this.numberOnPage = numberOnPage;
this.ctm = ctm;
this.originalHeight = bufferedImage.getHeight();
this.originalWidth = bufferedImage.getWidth();
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
this.height = pix.h;
this.width = pix.w;
}
public ExtractedOcrImage(ExtractedImage image, int targetDpi) {
this.pageNumber = image.getPageNumber();
this.numberOnPage = image.getNumberOnPage();
this.ctm = image.getCtm();
this.originalHeight = image.getImage().getHeight();
this.originalWidth = image.getImage().getWidth();
float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72));
this.pix = binarize(image.getImage(), imageDPI, targetDpi);
this.height = pix.h;
this.width = pix.w;
}
@SneakyThrows
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
ImageProcessingUtils.setAlphaChannelToWhite(image);
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
return ImageProcessingUtils.despecklePix(scaledUp);
}
}
@Override
public AffineTransform getImageCTM() {

View File

@ -71,13 +71,6 @@ public interface OcrImage {
}
@SneakyThrows
default BufferedImage getRotatedBufferedImage() {
return LeptUtils.convertPixToImage(getRotatedPix());
}
/**
* Retrieves the rotation degree of the OCR image.
*
@ -94,16 +87,6 @@ public interface OcrImage {
int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance
/**
* Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image.
* Currently only quadrant rotations are supported.
* Rotated partial images work, due to the CTM present in the pdf working with any rotation.
*
* @param rotationDegree The rotation degree of the OCR image.
*/
void setRotationDegrees(int rotationDegree);
/**
* Retrieves the buffered image associated with the OCR image.
*
@ -112,24 +95,6 @@ public interface OcrImage {
Pix getPix();
/**
* Retrieves the rotated image of the OCR image.
*
* @return The rotated BufferedImage object of the OCR image.
*/
default Pix getRotatedPix() {
synchronized (OCRThread.class) {
return switch (360 - getRotationDegrees()) {
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
default -> getPix();
};
}
}
default int getDpi() {
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());

View File

@ -49,7 +49,7 @@ public class GhostScriptService {
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
numOfProcesses,
2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
settings.getOcrThreadCount()); // use the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
long timestamp = System.currentTimeMillis();
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.LinkedList;
@ -26,6 +27,7 @@ import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.Getter;
@ -34,7 +36,6 @@ import lombok.SneakyThrows;
@Getter
public class ImageStreamEngine extends PDFStreamEngine {
private ExtractedOcrImage currentImageOnPage;
private List<ExtractedImage> imagesOnCurrentPage;
private OcrServiceSettings settings;
private int pageNum;
@ -71,6 +72,7 @@ public class ImageStreamEngine extends PDFStreamEngine {
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
imageXObject.getHeight(),
imageXObject.getWidth(),
imageXObject.getImage(),
@ -78,7 +80,6 @@ public class ImageStreamEngine extends PDFStreamEngine {
imagesOnCurrentPage.size(),
imageXObject.getColorSpace()));
//imagesOnPages.add(this.currentImageOnPage);
} else if (xobject instanceof PDFormXObject) {
PDFormXObject form = (PDFormXObject) xobject;
showForm(form);

View File

@ -107,7 +107,7 @@ public class OCRService {
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads);
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
documentFile,
@ -128,7 +128,7 @@ public class OCRService {
.toList();
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
ocrImageFactory.join();
log.info("Extracted all images, interrupting ocr threads");
log.info("Processed all images, interrupting ocr threads");
ocrThreads.forEach(Thread::interrupt);
for (OCRThread ocrThread : ocrThreads) {

View File

@ -6,13 +6,16 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
@ -29,6 +32,8 @@ public class OcrImageFactory {
File documentFile;
Path tmpImageDir;
GhostScriptService ghostScriptService;
BlockingQueue<ExtractedImage> imageProcessingQueue;
ImageProcessingThread imageProcessingThread;
BlockingQueue<OcrImage> imageOutputQueue;
List<ImageExtractionThread> imageExtractionThreads;
List<Integer> stitchedPageNumbers;
@ -50,6 +55,7 @@ public class OcrImageFactory {
this.tmpImageDir = tmpImageDir;
this.ghostScriptService = ghostScriptService;
this.imageOutputQueue = imageOutputQueue;
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity());
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
this.stats = stats;
@ -57,8 +63,10 @@ public class OcrImageFactory {
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
for (int i = 0; i < balancedPageNumbers.size(); i++) {
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers));
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
}
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings);
log.info("Started {} image extraction threads, with ({}) pages each",
imageExtractionThreads.size(),
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
@ -70,6 +78,8 @@ public class OcrImageFactory {
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
imageExtractionThread.start();
}
imageProcessingThread.start();
}
@ -79,11 +89,15 @@ public class OcrImageFactory {
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
imageExtractionThread.join();
}
if (stitchedPageNumbers.isEmpty()) {
return;
}
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
if (!stitchedPageNumbers.isEmpty()) {
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
}
imageProcessingThread.interrupt();
log.info("All images extracted, interrupting processing thread.");
imageProcessingThread.join();
}
}

View File

@ -15,6 +15,7 @@ public class Statistics {
List<Long> tesseractDuration;
AtomicLong pdf2ImgDuration;
AtomicLong writingTextDuration;
AtomicLong imageProcessingDuration;
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
@ -23,6 +24,7 @@ public class Statistics {
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
this.pdf2ImgDuration = new AtomicLong(0);
this.writingTextDuration = new AtomicLong(0);
this.imageProcessingDuration = new AtomicLong(0);
}
@ -32,6 +34,12 @@ public class Statistics {
}
/**
 * Adds the given duration to the accumulated image processing time.
 *
 * @param duration the elapsed time to add; NOTE(review): presumably milliseconds, since the value is divided by 1000 when printed as seconds - confirm.
 */
public void increaseImageProcessing(long duration) {

    this.imageProcessingDuration.getAndAdd(duration);
}
public void increaseTesseractDuration(int threadId, long duration) {
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
@ -53,13 +61,15 @@ public class Statistics {
@Override
public String toString() {
return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s",
return String.format(
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
(float) imageProcessingDuration.get() / 1000,
(float) pdf2ImgDuration.get() / 1000,
(float) writingTextDuration.get() / 1000);
}

View File

@ -9,8 +9,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
@ -26,7 +24,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageExtractionThread extends Thread {
static double FULL_PAGE_IMAGE_THRESHOLD = 0.98;
static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
int id;
@ -38,7 +36,7 @@ public class ImageExtractionThread extends Thread {
OcrServiceSettings settings;
// output is written to these lists
BlockingQueue<OcrImage> imageOutputQueue;
BlockingQueue<ExtractedImage> imageProcessingQueue;
List<Integer> stitchedPageNumbers;
@ -50,21 +48,20 @@ public class ImageExtractionThread extends Thread {
for (Integer pageIndex : pageIndices) {
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
timestamp = System.currentTimeMillis();
List<ExtractedImage> extractedImages = getExtractedOcrImages(pageIndex, document);
List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
if (extractedImages.isEmpty()) {
logger.logPageSkipped(pageIndex);
}
if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
stitchedPageNumbers.add(pageIndex);
logger.addImagesToProcess(pageIndex, 0);
continue;
}
for (ExtractedImage image : extractedImages) {
ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi());
imageOutputQueue.put(ocrImage);
imageProcessingQueue.put(image);
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
}
}
@ -72,7 +69,7 @@ public class ImageExtractionThread extends Thread {
}
private List<ExtractedImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {
PDPage page = document.getPage(pageIndex - 1);
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
@ -82,14 +79,14 @@ public class ImageExtractionThread extends Thread {
@SneakyThrows
private boolean checkForStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
if (imagesOnCurrentPage.isEmpty()) {
return false;
}
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) {
if (imageOnPage.getWidth() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.getHeight() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) {
return true;
}
}

View File

@ -0,0 +1,166 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.TessAPI1;
/*
* This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
*/
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageProcessingThread extends Thread {
BlockingQueue<ExtractedImage> imageInputQueue;
BlockingQueue<OcrImage> imageOutputQueue;
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
Statistics stats;
OcrServiceSettings settings;
/**
 * Takes {@link ExtractedImage}s from the input queue, processes them and puts the resulting {@link OcrImage}s
 * on the output queue.
 * <p>
 * Interrupting this thread signals that the image extraction has finished. After the interrupt no further
 * blocking {@code take()} is performed; instead the images still waiting in the input queue are drained and
 * processed. An image that has already been processed is never dropped: puts on the output queue are retried
 * until they succeed, even when interrupted.
 * <p>
 * The orientation detection handle is released when this method exits, also if processing fails.
 */
@SneakyThrows
@Override
public void run() {

    try {
        boolean extractionFinished = false;
        while (!extractionFinished) {
            final ExtractedImage image;
            try {
                image = imageInputQueue.take();
            } catch (InterruptedException e) {
                // Interrupting signals that the image extraction has finished
                break;
            }
            // an interrupt that arrives while processing or while waiting on a full output queue
            // is reported by the put, so the finished image is still delivered before leaving the loop
            extractionFinished = putUninterruptibly(this.process(image));
        }
        log.info("Leaving initial uninterrupted loop!");

        // empty the queue
        List<ExtractedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
        imageInputQueue.drainTo(remainingImages);
        for (ExtractedImage image : remainingImages) {
            putUninterruptibly(this.process(image));
        }
    } finally {
        // release the native handle even if processing an image failed
        TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
    }
}

/**
 * Puts the image on the output queue, retrying until the put succeeds.
 * <p>
 * The interrupt status is deliberately not restored: the interrupt is only used as the "extraction finished"
 * signal, and a set interrupt flag would make every following put fail immediately.
 *
 * @param ocrImage the processed image to hand over to the consumers of the output queue
 * @return true if the thread was interrupted while putting, false otherwise
 */
private boolean putUninterruptibly(OcrImage ocrImage) {

    boolean interrupted = false;
    while (true) {
        try {
            imageOutputQueue.put(ocrImage);
            return interrupted;
        } catch (InterruptedException e) {
            interrupted = true;
        }
    }
}
/**
 * Turns an extracted image into an {@link OcrImage}.
 * <p>
 * The image is binarized at the configured target DPI, its orientation is detected, and the pix is rotated
 * by a multiple of 90 degrees according to the detected orientation. The height and width stored in the
 * result are those of the binarized pix before rotation. The time spent is added to the statistics.
 *
 * @param extractedImage the image extracted from the PDF
 * @return the OCR image holding the (possibly rotated) pix
 */
private OcrImage process(ExtractedImage extractedImage) {

    final long startMillis = System.currentTimeMillis();
    final int targetDpi = settings.getDpi();

    float scalingPerInch = extractedImage.getCtm().getScalingFactorX() / 72;
    float sourceDpi = Math.abs(extractedImage.getImage().getWidth() / scalingPerInch);

    Pix pix = binarize(extractedImage.asPix(), sourceDpi, targetDpi);
    int orientDegree = detectOrientation(pix, targetDpi, detectionScriptHandle);

    // number of quadrant turns for a rotation by (360 - orientDegree); everything else is left as is
    int quadrantTurns = switch (orientDegree) {
        case 270 -> 1;
        case 180 -> 2;
        case 90 -> 3;
        default -> 0;
    };
    Pix rotatedPix = quadrantTurns == 0 ? pix : Leptonica1.pixRotateOrth(pix, quadrantTurns);

    int unrotatedHeight = pix.h;
    int unrotatedWidth = pix.w;
    OcrImage result = new ExtractedOcrImage(extractedImage.getPageNumber(),
            extractedImage.getNumberOnPage(),
            extractedImage.getHeight(),
            extractedImage.getWidth(),
            extractedImage.getCtm(),
            rotatedPix,
            unrotatedHeight,
            unrotatedWidth,
            orientDegree);

    // the unrotated pix is only disposed when the rotation produced a different pix
    if (rotatedPix != pix) {
        LeptUtils.disposePix(pix);
    }

    stats.increaseImageProcessing(System.currentTimeMillis() - startMillis);
    return result;
}
/**
 * Runs tesseract's orientation and script detection on the given pix.
 * <p>
 * The handle is cleared again after the detection.
 *
 * @param pix                   the image to analyze
 * @param dpi                   the source resolution passed to tesseract
 * @param detectionScriptHandle the tesseract handle used for the detection
 * @return the detected orientation in degrees, or 0 if the detection did not succeed or its confidence is not above 10
 */
public static int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {

    TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
    TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);

    // output parameters filled by the detection call
    IntBuffer degreeOut = IntBuffer.allocate(1);
    FloatBuffer degreeConfidenceOut = FloatBuffer.allocate(1);
    PointerByReference scriptNameOut = new PointerByReference();
    FloatBuffer scriptConfidenceOut = FloatBuffer.allocate(1);

    int status = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, degreeOut, degreeConfidenceOut, scriptNameOut, scriptConfidenceOut);

    // the confidence is only read when the detection itself succeeded
    boolean trusted = status == TRUE && degreeConfidenceOut.get() > 10;
    int detectedDegree = trusted ? degreeOut.get() : 0;

    TessAPI1.TessBaseAPIClear(detectionScriptHandle);
    return detectedDegree;
}
/**
 * Prepares a pix for OCR by passing it through three {@link ImageProcessingUtils} steps in order:
 * grayscale conversion, scaling from the image DPI to the target DPI, and despeckling.
 *
 * @param pix       the pix to prepare
 * @param imageDpi  the DPI of the given pix
 * @param targetDpi the DPI the result should have
 * @return the pix returned by the despeckle step
 */
@SneakyThrows
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {

    Pix gray = ImageProcessingUtils.convertToGrayScale(pix);
    return ImageProcessingUtils.despecklePix(ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, gray));
}
/**
 * Creates a tesseract handle initialized with the "osd" data, used for orientation and script detection.
 * <p>
 * The data path is read from the environment variable {@code TESSDATA_PREFIX}. A failed initialization is
 * logged; the handle is returned regardless, so callers see the same behavior as before.
 *
 * @return the created handle
 */
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {

    ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
    String datapath = System.getenv("TESSDATA_PREFIX");
    // tesseract reports 0 on success; anything else means the osd data could not be loaded
    int initResult = TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
    if (initResult != 0) {
        log.error("Failed to initialize tesseract handle for orientation detection (language=osd, TESSDATA_PREFIX={}, result={})", datapath, initResult);
    }
    return handle;
}
}

View File

@ -47,8 +47,7 @@ public class OCRThread extends Thread {
OcrProgressLogger logger;
Statistics stats;
OcrServiceSettings settings;
ITessAPI.TessBaseAPI detectionScriptHandle;
ITessAPI.TessBaseAPI tesseractHandle;
Tesseract2 instance;
public OCRThread(int id,
@ -66,8 +65,7 @@ public class OCRThread extends Thread {
this.logger = logger;
this.stats = stats;
this.settings = settings;
this.detectionScriptHandle = initDetectionScriptHandle();
this.tesseractHandle = initTesseractHandle(settings);
this.instance = createInstance(settings);
}
@ -92,10 +90,9 @@ public class OCRThread extends Thread {
this.process(image);
}
} catch (NoSuchElementException e) {
log.debug("Processed all Images, finishing.");
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
TessAPI1.TessBaseAPIDelete(this.tesseractHandle);
log.debug("Executed tesseract on all Images, finishing.");
}
}
@ -107,13 +104,8 @@ public class OCRThread extends Thread {
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
int orientDegree = detectOrientation(image);
image.setRotationDegrees(orientDegree);
Pix rotatedPix = image.getRotatedPix();
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);
image.destroyPix();
LeptUtils.disposePix(rotatedPix);
results.add(OcrResult.create(image, tesseractOutputFileName));
logger.logImageFinished(image, psm);
@ -121,67 +113,6 @@ public class OCRThread extends Thread {
}
public int detectOrientation(OcrImage image) {
IntBuffer orientationDegreeResultBuffer;
FloatBuffer orientationDegreeConfidenceBuffer;
PointerByReference scriptureNameBuffer;
FloatBuffer scriptureConfidenceBuffer;
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization.
orientationDegreeResultBuffer = IntBuffer.allocate(1);
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
scriptureNameBuffer = new PointerByReference();
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
}
int orient_deg = 0;
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
orientationDegreeResultBuffer,
orientationDegreeConfidenceBuffer,
scriptureNameBuffer,
scriptureConfidenceBuffer);
if (result == TRUE) {
orient_deg = orientationDegreeResultBuffer.get();
}
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
return orient_deg;
}
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
synchronized (OCRThread.class) {
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
String datapath = System.getenv("TESSDATA_PREFIX");
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
return handle;
}
}
synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) {
synchronized (OCRThread.class) {
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
String datapath = System.getenv("TESSDATA_PREFIX");
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0);
return handle;
}
}
@SneakyThrows
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
@ -192,14 +123,19 @@ public class OCRThread extends Thread {
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
}
TessBaseAPISetPageSegMode(tesseractHandle, psm);
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
instance.setPageSegMode(psm);
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
}
Tesseract2.createDocumentsWithResults(pix,
null,
tesseractOutputFileName,
List.of(ITesseract.RenderedFormat.HOCR),
ITessAPI.TessPageIteratorLevel.RIL_BLOCK,
tesseractHandle);
private static Tesseract2 createInstance(OcrServiceSettings settings) {
Tesseract2 instance = new Tesseract2();
instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
instance.setOcrEngineMode(1); // set to LSTM based Engine
instance.setLanguage(settings.getLanguages());
return instance;
}
}

View File

@ -13,13 +13,13 @@ import lombok.experimental.FieldDefaults;
public class OcrServiceSettings {
int ocrThreadCount = 16; // Number of OCR threads
int imageExtractThreadCount = 5; // Number of image extraction threads
int gsProcessCount = 5; // Number of Ghostscript processes
int imageExtractThreadCount = 2; // Number of image extraction threads
int gsProcessCount = 2; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean removeWatermark; // If true, watermarks will be removed
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");

View File

@ -2,10 +2,16 @@ package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.Leptonica1;
@ -15,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils;
@UtilityClass
public class ImageProcessingUtils {
/**
 * Returns the image of the given {@link ExtractedImage} in a device color space.
 * <p>
 * If the image's color space is {@link PDDeviceRGB} or {@link PDDeviceGray}, the original image instance is
 * returned unchanged. Otherwise the image is drawn onto a new {@link BufferedImage} of type
 * {@link BufferedImage#TYPE_BYTE_GRAY} with the same dimensions, and that copy is returned.
 *
 * @param extractedImage the image to convert
 * @return the original image or a gray copy of it
 */
public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {

    boolean alreadyDeviceColorSpace = extractedImage.getColorSpace() instanceof PDDeviceRGB || extractedImage.getColorSpace() instanceof PDDeviceGray;
    if (alreadyDeviceColorSpace) {
        return extractedImage.getImage();
    }

    BufferedImage pdfImage = extractedImage.getImage();
    BufferedImage grayCopy = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
    Graphics painter = grayCopy.getGraphics();
    painter.drawImage(pdfImage, 0, 0, null);
    painter.dispose();
    return grayCopy;
}
public static Pix despecklePix(Pix pix) {
assert pix.d == 8;
@ -23,7 +45,9 @@ public class ImageProcessingUtils {
// too small to properly despeckle, just binarize instead.
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
} else {
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
despeckled = LeptUtils.despeckle(pix,
LeptUtils.SEL_STR3,
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
if (despeckled == null) {
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
}
@ -56,9 +80,8 @@ public class ImageProcessingUtils {
@SneakyThrows
public static Pix convertToGrayScale(BufferedImage image) {
public static Pix convertToGrayScale(Pix pix) {
Pix pix = LeptUtils.convertImageToPix(image);
if (pix.d == 8) {
return pix;
} else if (pix.d == 32) {

View File

@ -1,54 +1,45 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE;
import java.awt.Rectangle;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.sun.jna.Pointer;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.OCRResult;
import net.sourceforge.tess4j.TessAPI1;
import net.sourceforge.tess4j.Tesseract1;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word;
@Slf4j
/**
* Overridden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All functions are copied and then the BufferedImage -> Pix conversion deleted.
*/
@UtilityClass
public class Tesseract2 extends TessAPI1 {
public class Tesseract2 extends Tesseract1 {
private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) {
String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE);
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
TessResultRendererBeginDocument(renderer, title);
int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer);
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
TessResultRendererEndDocument(renderer);
// if (result == ITessAPI.FALSE) {
// throw new TesseractException("Error during processing page.");
// }
return TessBaseAPIMeanTextConf(handle);
return TessBaseAPIMeanTextConf(getHandle());
}
public OCRResult createDocumentsWithResults(Pix bi,
String filename,
String outputbase,
List<ITesseract.RenderedFormat> formats,
int pageIteratorLevel,
ITessAPI.TessBaseAPI handle) {
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle);
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
if (!results.isEmpty()) {
return results.get(0);
} else {
@ -57,26 +48,24 @@ public class Tesseract2 extends TessAPI1 {
}
public List<OCRResult> createDocumentsWithResults(Pix[] pixs,
String[] filenames,
String[] outputbases,
List<ITesseract.RenderedFormat> formats,
int pageIteratorLevel,
ITessAPI.TessBaseAPI handle) {
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
throw new RuntimeException("The three arrays must match in length.");
}
init();
setVariables();
List<OCRResult> results = new ArrayList<OCRResult>();
try {
for (int i = 0; i < pixs.length; i++) {
try {
ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats);
int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer);
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
TessDeleteResultRenderer(renderer);
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList<Word>();
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
results.add(new OCRResult(meanTextConfidence, words));
} catch (Exception e) {
// skip the problematic image file
@ -84,22 +73,20 @@ public class Tesseract2 extends TessAPI1 {
}
}
} finally {
synchronized (OCRThread.class) {
TessAPI1.TessBaseAPIClear(handle);
}
dispose();
}
return results;
}
private List<Word> getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) {
private List<Word> getRecognizedWords(int pageIteratorLevel) {
List<Word> words = new ArrayList<>();
try {
ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle);
ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
TessPageIteratorBegin(pi);
do {
@ -132,11 +119,11 @@ public class Tesseract2 extends TessAPI1 {
}
private ITessAPI.TessResultRenderer createRenderers(String outputbase, List<ITesseract.RenderedFormat> formats) {
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
ITessAPI.TessResultRenderer renderer = null;
TessResultRenderer renderer = null;
for (ITesseract.RenderedFormat format : formats) {
for (RenderedFormat format : formats) {
switch (format) {
case HOCR:

View File

@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcr() {
String text = testOCR("files/VV-352892.pdf");
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
}
@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
List<File> foundFiles = Files.walk(Path.of(dir))
// .sorted(Comparator.comparingLong(this::getFileSize))
.sorted(Comparator.comparingLong(this::getFileSize))
.map(Path::toFile)
.filter(file -> file.getName().endsWith(".pdf"))
.peek(System.out::println)
@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrForSpecificFile() {
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf"));
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));