Merge branch 'RED-7668' into 'master'

RED-7669: optimize OCR-module performance

Closes RED-7668

See merge request redactmanager/ocr-service!22
This commit is contained in:
Dominique Eifländer 2023-11-23 16:04:43 +01:00
commit a50f54676e
23 changed files with 691 additions and 333 deletions

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
public record ExtractedImage(
int pageNumber, QuadPoint position, int height, int width, BufferedImage image, Matrix ctm, int numberOnPage, PDColorSpace colorSpace) implements UnprocessedImage {
@SneakyThrows
public Pix asPix() {
BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this);
ImageProcessingUtils.setAlphaChannelToWhite(image);
return LeptUtils.convertImageToPix(image);
}
public QuadPoint getImageCoordinatesInInitialUserSpace() {
return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform());
}
}

View File

@ -1,20 +1,15 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.Graphics;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.IntBuffer;
import java.util.concurrent.Semaphore;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.pdftron.sdf.Obj;
import lombok.AccessLevel;
import lombok.Getter;
@ -23,58 +18,26 @@ import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
@Slf4j
@Getter
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ExtractedOcrImage implements OcrImage {
final int pageNumber;
final Pix pix;
final int originalHeight;
final int originalWidth;
final int height;
final int width;
final Matrix ctm;
final int numberOnPage;
@Setter
int pageNumber;
int numberOnPage;
int originalHeight;
int originalWidth;
Matrix ctm;
Pix pix;
int height;
int width;
int rotationDegrees;
@SneakyThrows
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) {
this.pageNumber = pageNumber;
this.numberOnPage = numberOnPage;
this.ctm = ctm;
this.originalHeight = bufferedImage.getHeight();
this.originalWidth = bufferedImage.getWidth();
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
this.height = pix.h;
this.width = pix.w;
}
@SneakyThrows
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
ImageProcessingUtils.setAlphaChannelToWhite(image);
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
return ImageProcessingUtils.despecklePix(scaledUp);
}
}
@Override
public AffineTransform getImageCTM() {

View File

@ -2,10 +2,12 @@ package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
import lombok.SneakyThrows;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@ -62,6 +64,13 @@ public interface OcrImage {
}
/**
 * Converts the pix returned by {@link #getPix()} into a {@link BufferedImage}.
 *
 * @return the result of {@code LeptUtils.convertPixToImage} for this image's pix
 */
@SneakyThrows
default BufferedImage getBufferedImage() {

    Pix sourcePix = getPix();
    return LeptUtils.convertPixToImage(sourcePix);
}
/**
* Retrieves the rotation degree of the OCR image.
*
@ -78,16 +87,6 @@ public interface OcrImage {
int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance
/**
* Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image.
* Currently only quadrant rotations are supported.
* Rotated partial images work, due to the CTM present in the pdf working with any rotation.
*
* @param rotationDegree The rotation degree of the OCR image.
*/
void setRotationDegrees(int rotationDegree);
/**
* Retrieves the buffered image associated with the OCR image.
*
@ -96,24 +95,6 @@ public interface OcrImage {
Pix getPix();
/**
* Retrieves the rotated image of the OCR image.
*
* @return The rotated BufferedImage object of the OCR image.
*/
default Pix getRotatedPix() {
synchronized (OCRThread.class) {
return switch (360 - getRotationDegrees()) {
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
default -> getPix();
};
}
}
default int getDpi() {
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.ocr.processor.model;
import org.apache.pdfbox.pdmodel.PDPage;
/**
 * Size, number and rotation of a single PDF page.
 *
 * @param height          crop box height, cut down to an int
 * @param width           crop box width, cut down to an int
 * @param number          page number as passed in by the caller
 * @param rotationDegrees rotation reported by the page
 */
public record PageInformation(int height, int width, int number, int rotationDegrees) {

    /**
     * Builds the page information from a PDFBox page.
     *
     * @param pageNum page number to store in the record
     * @param page    page whose crop box and rotation are read
     * @return a record holding the crop box dimensions (cast to int), the given page number and the page rotation
     */
    public static PageInformation fromPDPage(int pageNum, PDPage page) {

        // the int casts drop any fractional part of the crop box dimensions
        int cropBoxHeight = (int) page.getCropBox().getHeight();
        int cropBoxWidth = (int) page.getCropBox().getWidth();
        int rotation = page.getRotation();
        return new PageInformation(cropBoxHeight, cropBoxWidth, pageNum, rotation);
    }

}

View File

@ -97,4 +97,10 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
d().getY());
}
/**
 * Multiplies the length of the edge a-b with the length of the edge a-d.
 * For a rectangular quad this product is its area.
 *
 * @return product of the two edge lengths that start at point a
 */
public double size() {

    double edgeAb = a().distance(b());
    double edgeAd = a().distance(d());
    return edgeAb * edgeAd;
}
}

View File

@ -1,5 +1,14 @@
package com.knecon.fforesight.service.ocr.processor.model;
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) {
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * Reference to a page image stored on disk.
 *
 * @param pageNumber       number of the page the image belongs to
 * @param absoluteFilePath absolute path of the image file
 */
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage {

    /**
     * Reads the image file with Leptonica.
     *
     * @return whatever {@code Leptonica1.pixRead} returns for {@code absoluteFilePath}
     */
    @Override
    public Pix asPix() {

        final String imagePath = absoluteFilePath();
        return Leptonica1.pixRead(imagePath);
    }

}

View File

@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
@ -16,29 +17,17 @@ import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
@Getter
@FieldDefaults(level = AccessLevel.PRIVATE)
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class RenderedPageOcrImage implements OcrImage {
final String absoluteImagePath;
final int height;
final int width;
final PageInformation pageInformation;
final Pix pix;
@Setter
int height;
int width;
PageInformation pageInformation;
Pix pix;
int rotationDegrees;
@SneakyThrows
public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) {
this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1));
this.absoluteImagePath = renderedPageImageFile.absoluteFilePath();
this.pix = Leptonica1.pixRead(absoluteImagePath);
this.height = getPix().h;
this.width = getPix().w;
}
@Override
public int getOptimalPageSegmentationMode() {
@ -107,7 +96,7 @@ public class RenderedPageOcrImage implements OcrImage {
// PDFBox always returns page height and width based on rotation
double pageWidth;
if (pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) {
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
pageWidth = pageInformation.height();
} else {
pageWidth = pageInformation.width();
@ -116,14 +105,4 @@ public class RenderedPageOcrImage implements OcrImage {
return pageWidth / width;
}
private record PageInformation(int height, int width, int number, int rotationDegrees) {
public static PageInformation fromPDPage(int pageNum, PDPage page) {
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
}
}
}

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.service.ocr.processor.model;
import net.sourceforge.lept4j.Pix;
/**
 * An image that can be turned into a Leptonica {@link Pix} on demand.
 * Implemented in this package by {@code ExtractedImage} and {@code RenderedPageImageFile}.
 * NOTE(review): judging by the name, instances have not yet been prepared for OCR - confirm against the consumer.
 */
public interface UnprocessedImage {
/**
 * Produces the Leptonica representation of this image.
 *
 * @return the image as a pix
 */
Pix asPix();
}

View File

@ -4,18 +4,26 @@ import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedTransferQueue;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.azure.core.implementation.GeoObjectHelper;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
@ -24,6 +32,7 @@ import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
@Slf4j
@Service
@ -42,17 +51,19 @@ public class GhostScriptService {
String documentAbsolutePath,
Path tmpImageDir,
PDDocument document,
BlockingQueue<OcrImage> imageOutputQueue,
BlockingQueue<UnprocessedImage> imageProcessingQueue,
Statistics stats) {
BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
asyncTransferThread.start();
int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
numOfProcesses,
2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
long timestamp = System.currentTimeMillis();
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
log.info("Batch {}: Running {} gs processes with ({}) pages each",
@ -63,9 +74,9 @@ public class GhostScriptService {
int finalBatchIdx = batchIdx;
List<Process> processes = processInfos.stream()
.parallel()
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles))
.peek(s -> log.debug(String.join(" ", s)))
.map(this::executeProcess)
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath))
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
.map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue))
.toList();
List<Integer> processExitCodes = new LinkedList<>();
@ -73,14 +84,9 @@ public class GhostScriptService {
processExitCodes.add(process.waitFor());
}
stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx);
for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) {
OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document);
imageOutputQueue.put(image);
}
}
asyncTransferThread.interrupt();
}
@ -107,20 +113,28 @@ public class GhostScriptService {
@SneakyThrows
private String[] buildCmdArgs(Integer processIdx,
Integer batchIdx,
List<Integer> stitchedImagePageIndices,
Path outputDir,
String documentAbsolutePath,
List<RenderedPageImageFile> fullPageImages) {
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
Integer batchIdx,
List<Integer> stitchedImagePageIndices,
Path outputDir,
String documentAbsolutePath) {
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
Map<Integer, RenderedPageImageFile> fullPageImages = new HashMap<>();
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
Integer pageNumber = stitchedImagePageIndices.get(i);
fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
}
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
}
private String[] buildCmdArgs(List<Integer> stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) {
StringBuilder sPageList = new StringBuilder();
int i = 1;
for (Integer integer : stitchedImagePageIndices) {
@ -131,18 +145,19 @@ public class GhostScriptService {
i++;
}
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
return cmdArgs;
}
@SneakyThrows
private Process executeProcess(String[] cmdArgs) {
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue) {
Process p = Runtime.getRuntime().exec(cmdArgs);
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
InputStream stdOut = p.getInputStream();
ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT);
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue);
InputStream stdError = p.getErrorStream();
ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR);
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError);
stdOutLogger.start();
stdErrorLogger.start();
@ -150,6 +165,10 @@ public class GhostScriptService {
}
/**
 * Pairs the command line of one Ghostscript process with the image files that command writes,
 * keyed by page number.
 *
 * @param cmdArgs                command and arguments used to start the process
 * @param renderedPageImageFiles expected output file for each page number handled by the process
 */
private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, RenderedPageImageFile> renderedPageImageFiles) {
}
/**
 * Work assigned to a single Ghostscript process.
 *
 * @param processIdx          index of the process, used when building its command line
 * @param stitchedPageNumbers page numbers this process is asked to render
 */
private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {
}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Graphics;
import java.awt.image.BufferedImage;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
@ -18,13 +17,12 @@ import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.Getter;
@ -33,8 +31,7 @@ import lombok.SneakyThrows;
@Getter
public class ImageStreamEngine extends PDFStreamEngine {
private ExtractedOcrImage currentImageOnPage;
private List<ExtractedOcrImage> imagesOnCurrentPage;
private List<ExtractedImage> imagesOnCurrentPage;
private OcrServiceSettings settings;
private int pageNum;
@ -69,22 +66,15 @@ public class ImageStreamEngine extends PDFStreamEngine {
}
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
if (imageXObject.getColorSpace() instanceof PDDeviceRGB) {
BufferedImage image = imageXObject.getImage();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
} else if (imageXObject.getColorSpace() instanceof PDDeviceGray) {
BufferedImage image = imageXObject.getImage();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
} else {
BufferedImage pdfImage = imageXObject.getImage();
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
Graphics g = image.getGraphics();
g.drawImage(pdfImage, 0, 0, null);
g.dispose();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
}
this.imagesOnCurrentPage.add(this.currentImageOnPage);
//imagesOnPages.add(this.currentImageOnPage);
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
imageXObject.getHeight(),
imageXObject.getWidth(),
imageXObject.getImage(),
imageCTM,
imagesOnCurrentPage.size(),
imageXObject.getColorSpace()));
} else if (xobject instanceof PDFormXObject) {
PDFormXObject form = (PDFormXObject) xobject;
showForm(form);

View File

@ -107,7 +107,7 @@ public class OCRService {
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads);
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
documentFile,
@ -128,7 +128,7 @@ public class OCRService {
.toList();
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
ocrImageFactory.join();
log.info("Extracted all images, interrupting ocr threads");
log.info("Processed all images, interrupting ocr threads");
ocrThreads.forEach(Thread::interrupt);
for (OCRThread ocrThread : ocrThreads) {

View File

@ -6,13 +6,17 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
@ -29,6 +33,8 @@ public class OcrImageFactory {
File documentFile;
Path tmpImageDir;
GhostScriptService ghostScriptService;
BlockingQueue<UnprocessedImage> imageProcessingQueue;
ImageProcessingThread imageProcessingThread;
BlockingQueue<OcrImage> imageOutputQueue;
List<ImageExtractionThread> imageExtractionThreads;
List<Integer> stitchedPageNumbers;
@ -40,7 +46,7 @@ public class OcrImageFactory {
Path tmpImageDir,
int numberOfThreads,
GhostScriptService ghostScriptService,
BlockingQueue<OcrImage> imageOutputQueue,
BlockingQueue<OcrImage> imageOcrQueue,
OcrProgressLogger logger,
OcrServiceSettings settings,
Statistics stats) {
@ -49,7 +55,8 @@ public class OcrImageFactory {
this.documentFile = documentFile;
this.tmpImageDir = tmpImageDir;
this.ghostScriptService = ghostScriptService;
this.imageOutputQueue = imageOutputQueue;
this.imageOutputQueue = imageOcrQueue;
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity());
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
this.stats = stats;
@ -57,8 +64,10 @@ public class OcrImageFactory {
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
for (int i = 0; i < balancedPageNumbers.size(); i++) {
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers));
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
}
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document);
log.info("Started {} image extraction threads, with ({}) pages each",
imageExtractionThreads.size(),
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
@ -70,6 +79,8 @@ public class OcrImageFactory {
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
imageExtractionThread.start();
}
imageProcessingThread.start();
}
@ -79,11 +90,15 @@ public class OcrImageFactory {
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
imageExtractionThread.join();
}
if (stitchedPageNumbers.isEmpty()) {
return;
}
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
if (!stitchedPageNumbers.isEmpty()) {
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
}
imageProcessingThread.interrupt();
log.info("All images extracted, interrupting processing thread.");
imageProcessingThread.join();
}
}

View File

@ -15,6 +15,7 @@ public class Statistics {
List<Long> tesseractDuration;
AtomicLong pdf2ImgDuration;
AtomicLong writingTextDuration;
AtomicLong imageProcessingDuration;
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
@ -23,6 +24,7 @@ public class Statistics {
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
this.pdf2ImgDuration = new AtomicLong(0);
this.writingTextDuration = new AtomicLong(0);
this.imageProcessingDuration = new AtomicLong(0);
}
@ -32,6 +34,12 @@ public class Statistics {
}
/**
 * Adds the given amount to the accumulated image processing time.
 *
 * @param duration elapsed time to add; {@code toString()} divides the total by 1000 and labels it as
 *                 seconds, so the value is expected in milliseconds
 */
public void increaseImageProcessing(long duration) {

    // the returned value is not needed, only the atomic addition matters
    imageProcessingDuration.getAndAdd(duration);
}
public void increaseTesseractDuration(int threadId, long duration) {
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
@ -53,13 +61,15 @@ public class Statistics {
@Override
public String toString() {
return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s",
return String.format(
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
(float) imageProcessingDuration.get() / 1000,
(float) pdf2ImgDuration.get() / 1000,
(float) writingTextDuration.get() / 1000);
}

View File

@ -0,0 +1,61 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.TessAPI1;
/**
 * Moves the elements produced by the GhostScriptOutputHandler into the image processing queue asynchronously.
 * <p>
 * The thread forwards elements until it is interrupted. Interrupting it signals that no further input is expected;
 * it then forwards whatever is still waiting in the input queue and terminates.
 * <p>
 * No element taken from the input queue is ever dropped: a blocking put that gets interrupted is repeated until it
 * succeeds. As a consequence this thread only terminates once the output queue has accepted all remaining elements.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class BlockingQueueFiller extends Thread {

    BlockingQueue<RenderedPageImageFile> imageInputQueue;
    BlockingQueue<UnprocessedImage> imageOutputQueue;


    @Override
    public void run() {

        // Interrupting signals that the image extraction has finished
        while (true) {
            final UnprocessedImage image;
            try {
                image = imageInputQueue.take();
            } catch (InterruptedException e) {
                break;
            }
            // the image has already left the input queue, so it must be delivered even if the signal arrives now
            if (putIgnoringInterrupts(image)) {
                break;
            }
        }

        // empty the queue; polling (instead of a single drainTo) also picks up elements added while draining
        UnprocessedImage remaining;
        while ((remaining = imageInputQueue.poll()) != null) {
            putIgnoringInterrupts(remaining);
        }
        // The interrupt flag is not restored: the interrupt is this thread's own finish signal
        // and the thread ends right here.
    }


    /**
     * Puts the image into the output queue and repeats the blocking put whenever it is interrupted.
     *
     * @param image element to deliver
     * @return true if at least one interrupt arrived while waiting for the put to succeed
     */
    private boolean putIgnoringInterrupts(UnprocessedImage image) {

        boolean interrupted = false;
        while (true) {
            try {
                imageOutputQueue.put(image);
                return interrupted;
            } catch (InterruptedException ignored) {
                // an interrupt only means "finish up"; losing the image would silently skip a page
                log.debug("Interrupted while forwarding an image, retrying");
                interrupted = true;
            }
        }
    }

}

View File

@ -0,0 +1,122 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GhostScriptOutputHandler extends Thread {
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
final InputStream is;
final String processName;
final Type type;
final Map<Integer, RenderedPageImageFile> pagesToProcess;
final BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput;
int currentPageNumber;
public static GhostScriptOutputHandler errorHandler(InputStream is) {
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null);
}
public static GhostScriptOutputHandler stdOut(InputStream is,
Map<Integer, RenderedPageImageFile> pagesToProcess,
BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput) {
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput);
}
@SneakyThrows
public void run() {
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
while (true) {
line = br.readLine();
if (line == null) {
break;
}
if (type.equals(Type.ERROR)) {
log.error(processName + "_" + type.name() + ">" + line);
} else {
log.debug(processName + "_" + type.name() + ">" + line);
addProcessedImageToQueue(line);
}
}
}
is.close();
if (type.equals(Type.STD_OUT)) {
queueFinishedPage(currentPageNumber);
}
}
private void addProcessedImageToQueue(String line) {
/*
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
*/
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
if (pageNumberMatcher.find()) {
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
if (currentPageNumber == 0) {
currentPageNumber = pageNumber;
return;
}
queueFinishedPage(currentPageNumber);
currentPageNumber = pageNumber;
}
}
private void queueFinishedPage(int pageNumber) {
var imageFile = this.pagesToProcess.get(pageNumber);
if (imageFile == null) {
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
}
assert new File(imageFile.absoluteFilePath()).isFile();
renderedPageImageFileOutput.add(imageFile);
}
public enum Type {
ERROR,
STD_OUT
}
}

View File

@ -5,12 +5,11 @@ import java.util.List;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
@ -26,6 +25,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageExtractionThread extends Thread {
static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
int id;
@ -37,9 +37,10 @@ public class ImageExtractionThread extends Thread {
OcrServiceSettings settings;
// output is written to these lists
BlockingQueue<OcrImage> imageOutputQueue;
BlockingQueue<UnprocessedImage> imageProcessingQueue;
List<Integer> stitchedPageNumbers;
@SneakyThrows
@Override
public void run() {
@ -48,28 +49,28 @@ public class ImageExtractionThread extends Thread {
for (Integer pageIndex : pageIndices) {
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
timestamp = System.currentTimeMillis();
List<ExtractedOcrImage> extractedOcrImages = getExtractedOcrImages(pageIndex, document);
List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
if (extractedOcrImages.isEmpty()) {
if (extractedImages.isEmpty()) {
logger.logPageSkipped(pageIndex);
}
if (checkForStitchedImages(extractedOcrImages)) {
if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
stitchedPageNumbers.add(pageIndex);
logger.addImagesToProcess(pageIndex, 0);
continue;
}
for (ExtractedOcrImage image : extractedOcrImages) {
imageOutputQueue.put(image);
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
for (ExtractedImage image : extractedImages) {
imageProcessingQueue.put((UnprocessedImage) image);
logger.addImagesToProcess(image.pageNumber(), image.numberOnPage());
}
}
}
}
private List<ExtractedOcrImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {
PDPage page = document.getPage(pageIndex - 1);
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
@ -79,22 +80,22 @@ public class ImageExtractionThread extends Thread {
@SneakyThrows
private boolean checkForStitchedImages(List<ExtractedOcrImage> imagesOnCurrentPage) {
private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
if (imagesOnCurrentPage.size() <= 1) {
if (imagesOnCurrentPage.isEmpty()) {
return false;
}
//checking for intersections or direct alignment of images
ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()];
int index = 0;
for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) {
imageOnPagesArray[index] = imageOnPage;
index++;
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
if (imageOnPage.width() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.height() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) {
return true;
}
}
for (int j = 0; j < imageOnPagesArray.length; j++) {
for (int i = j + 1; i < imageOnPagesArray.length; i++) {
if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
//checking for intersections or direct alignment of images
for (int j = 0; j < imagesOnCurrentPage.size(); j++) {
for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) {
if (imagesOnCurrentPage.get(j).getImageCoordinatesInInitialUserSpace().aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
// TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
return true;
}

View File

@ -0,0 +1,205 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.TessAPI1;
/*
* This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
*/
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageProcessingThread extends Thread {
BlockingQueue<UnprocessedImage> imageInputQueue;
BlockingQueue<OcrImage> imageOutputQueue;
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
Statistics stats;
OcrServiceSettings settings;
PDDocument document;
@SneakyThrows
@Override
public void run() {
// Interrupting signals that the image extraction has finished
while (true) {
try {
final UnprocessedImage image = imageInputQueue.take();
OcrImage extractedOcrImage = this.process(image);
try {
imageOutputQueue.put(extractedOcrImage);
} catch (InterruptedException e) {
imageOutputQueue.put(extractedOcrImage);
break;
}
} catch (InterruptedException e) {
break;
}
}
// empty the queue
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
imageInputQueue.drainTo(remainingImages);
remainingImages.forEach(image -> {
OcrImage ocrImage = this.process(image);
try {
imageOutputQueue.put(ocrImage);
} catch (InterruptedException e) {
log.error(e.getMessage());
}
});
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
}
private OcrImage process(UnprocessedImage unprocessedImage) {
long timestamp = System.currentTimeMillis();
OcrImage ocrImage;
if (unprocessedImage instanceof ExtractedImage extractedImage) {
ocrImage = processExtractedImage(extractedImage);
} else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) {
ocrImage = processRenderedPageImageFile(renderedPageImageFile);
} else {
throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass()));
}
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
return ocrImage;
}
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
OcrImage ocrImage = new RenderedPageOcrImage(pix.h,
pix.w,
PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)),
rotatedPix,
orientDegree);
if (pix != rotatedPix) {
LeptUtils.disposePix(pix);
}
return ocrImage;
}
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
OcrImage ocrImage = new ExtractedOcrImage(extractedImage.pageNumber(),
extractedImage.numberOnPage(),
extractedImage.height(),
extractedImage.width(),
extractedImage.ctm(),
rotatedPix,
pix.h,
pix.w,
orientDegree);
if (pix != rotatedPix) {
LeptUtils.disposePix(pix);
}
return ocrImage;
}
static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);
IntBuffer orientationDegreeResultBuffer;
FloatBuffer orientationDegreeConfidenceBuffer;
PointerByReference scriptureNameBuffer;
FloatBuffer scriptureConfidenceBuffer;
orientationDegreeResultBuffer = IntBuffer.allocate(1);
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
scriptureNameBuffer = new PointerByReference();
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
int orientationDegree = 0;
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
orientationDegreeResultBuffer,
orientationDegreeConfidenceBuffer,
scriptureNameBuffer,
scriptureConfidenceBuffer);
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) {
orientationDegree = orientationDegreeResultBuffer.get();
}
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
return orientationDegree;
}
@SneakyThrows
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {
Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix);
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
return ImageProcessingUtils.despecklePix(scaledUp);
}
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
String datapath = System.getenv("TESSDATA_PREFIX");
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
return handle;
}
}

View File

@ -1,6 +1,10 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable;
import java.io.File;
import java.nio.FloatBuffer;
@ -16,6 +20,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
@ -43,7 +48,6 @@ public class OCRThread extends Thread {
Statistics stats;
OcrServiceSettings settings;
Tesseract2 instance;
ITessAPI.TessBaseAPI detectionScriptHandle;
public OCRThread(int id,
@ -62,7 +66,6 @@ public class OCRThread extends Thread {
this.stats = stats;
this.settings = settings;
this.instance = createInstance(settings);
this.detectionScriptHandle = initDetectionScriptHandle();
}
@ -87,10 +90,9 @@ public class OCRThread extends Thread {
this.process(image);
}
} catch (NoSuchElementException e) {
log.debug("Processed all Images, finishing.");
log.debug("Executed tesseract on all Images, finishing.");
}
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
}
@ -102,15 +104,8 @@ public class OCRThread extends Thread {
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
int orientDegree = detectOrientation(image);
image.setRotationDegrees(orientDegree);
Pix rotatedPix = image.getRotatedPix();
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
synchronized (OCRThread.class) {
image.destroyPix();
LeptUtils.disposePix(rotatedPix);
}
executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);
image.destroyPix();
results.add(OcrResult.create(image, tesseractOutputFileName));
logger.logImageFinished(image, psm);
@ -118,51 +113,6 @@ public class OCRThread extends Thread {
}
/**
 * Runs Tesseract's orientation and script detection on the pix of the given image.
 * The confidence reported by Tesseract is not evaluated here.
 *
 * @param image image whose pix and DPI are handed to Tesseract
 * @return the orientation in degrees reported by Tesseract, or 0 if the detection call did not return TRUE
 */
public int detectOrientation(OcrImage image) {
IntBuffer orientationDegreeResultBuffer;
FloatBuffer orientationDegreeConfidenceBuffer;
PointerByReference scriptureNameBuffer;
FloatBuffer scriptureConfidenceBuffer;
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization.
orientationDegreeResultBuffer = IntBuffer.allocate(1);
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
scriptureNameBuffer = new PointerByReference();
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
}
// Default of 0 degrees is kept when detection does not succeed.
int orient_deg = 0;
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
orientationDegreeResultBuffer,
orientationDegreeConfidenceBuffer,
scriptureNameBuffer,
scriptureConfidenceBuffer);
if (result == TRUE) {
orient_deg = orientationDegreeResultBuffer.get();
}
// Clearing the handle is done under the same class-wide lock as the buffer allocation above.
synchronized (OCRThread.class) {
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
}
return orient_deg;
}
/**
 * Creates a Tesseract handle and initialises it with the "osd" language,
 * using the directory named by the TESSDATA_PREFIX environment variable.
 * Synchronized on the class, so handles are created one at a time.
 *
 * @return the newly created handle
 */
private static synchronized ITessAPI.TessBaseAPI initDetectionScriptHandle() {
    String tessdataPath = System.getenv("TESSDATA_PREFIX");
    ITessAPI.TessBaseAPI osdHandle = TessAPI1.TessBaseAPICreate();
    TessAPI1.TessBaseAPIInit3(osdHandle, tessdataPath, "osd");
    return osdHandle;
}
@SneakyThrows
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {

View File

@ -1,55 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ProcessIOLogger extends Thread {
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
InputStream is;
String processName;
Type type;
@SneakyThrows
public void run() {
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
while (true) {
line = br.readLine();
if (line == null) {
break;
}
if (type.equals(Type.ERROR)) {
log.error(processName + "_" + type.name() + ">" + line);
} else {
log.debug(processName + "_" + type.name() + ">" + line);
}
}
}
is.close();
}
public enum Type {
ERROR,
STD_OUT
}
}

View File

@ -14,7 +14,7 @@ public class OcrServiceSettings {
int ocrThreadCount = 4; // Number of OCR threads
int imageExtractThreadCount = 2; // Number of image extraction threads
int gsProcessCount = 2; // Number of Ghostscript processes
int gsProcessCount = 1; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed

View File

@ -2,10 +2,15 @@ package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.image.BufferedImage;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@ -16,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils;
@UtilityClass
public class ImageProcessingUtils {
/**
 * Returns the image of the given extracted image in a device color space.
 * Images whose color space is PDDeviceRGB or PDDeviceGray are returned as they are (same instance);
 * all others are drawn onto a new TYPE_BYTE_GRAY image of the same size.
 *
 * @param extractedImage the extracted image with its color space
 * @return the original image or a gray copy of it
 */
public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {

    boolean alreadyDeviceColorSpace = extractedImage.colorSpace() instanceof PDDeviceRGB || extractedImage.colorSpace() instanceof PDDeviceGray;
    BufferedImage source = extractedImage.image();
    if (alreadyDeviceColorSpace) {
        return source;
    }

    BufferedImage grayCopy = new BufferedImage(source.getWidth(), source.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
    Graphics painter = grayCopy.getGraphics();
    painter.drawImage(source, 0, 0, null);
    painter.dispose();
    return grayCopy;
}
public static Pix despecklePix(Pix pix) {
assert pix.d == 8;
@ -24,7 +45,9 @@ public class ImageProcessingUtils {
// too small to properly despeckle, just binarize instead.
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
} else {
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
despeckled = LeptUtils.despeckle(pix,
LeptUtils.SEL_STR3,
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
if (despeckled == null) {
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
}
@ -57,23 +80,35 @@ public class ImageProcessingUtils {
@SneakyThrows
public static Pix convertToGrayScale(BufferedImage image) {
public static Pix convertToGrayScale(Pix pix) {
Pix pix = LeptUtils.convertImageToPix(image);
if (pix.d == 8) {
return pix;
} else if (pix.d == 32) {
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
LeptUtils.disposePix(pix);
return grayScale;
} else {
} else if (pix.d == 1) {
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
LeptUtils.disposePix(pix);
return grayScale;
} else {
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
}
}
/**
 * Undoes a detected orientation by rotating the pix in quarter turns.
 * The number of quarter turns passed to pixRotateOrth corresponds to (360 - orientDegree) / 90.
 *
 * @param orientDegree detected orientation in degrees
 * @param pix          image to rotate
 * @return the result of pixRotateOrth for 90, 180 and 270 degrees; for every other value the given pix itself
 */
public Pix deRotatePix(int orientDegree, Pix pix) {

    int quarterTurns;
    if (orientDegree == 270) {
        quarterTurns = 1;
    } else if (orientDegree == 180) {
        quarterTurns = 2;
    } else if (orientDegree == 90) {
        quarterTurns = 3;
    } else {
        // nothing to undo (or an unexpected value): hand back the same instance
        return pix;
    }
    return Leptonica1.pixRotateOrth(pix, quarterTurns);
}
public static void setAlphaChannelToWhite(BufferedImage image) {
if (image.getTransparency() == Transparency.TRANSLUCENT) {

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import static net.sourceforge.lept4j.ILeptonica.IFF_PNG;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * Manual check of {@code ImageProcessingUtils.deRotatePix}: writes the rotated variants of a local image to /tmp for visual inspection.
 * Disabled, since it depends on a local file and a locally installed native library.
 */
@Disabled
class ImageProcessingUtilsTest {

    // Points JNA to the directory given by the VCPKG_DYNAMIC_LIB environment variable.
    @BeforeEach
    public void loadLeptonica() {

        String nativeLibraryPath = System.getenv("VCPKG_DYNAMIC_LIB");
        System.setProperty("jna.library.path", nativeLibraryPath);
    }


    // Rotates the same source image for each orientation and writes one PNG per orientation.
    @Test
    public void testRotation() {

        Pix source = Leptonica1.pixRead("/home/kschuettler/Downloads/painHarold.webp");

        int[] orientations = {0, 90, 180, 270};
        String[] outputFiles = {"/tmp/0.png", "/tmp/90.png", "/tmp/180.png", "/tmp/270.png"};

        for (int i = 0; i < orientations.length; i++) {
            Pix rotated = ImageProcessingUtils.deRotatePix(orientations[i], source);
            Leptonica1.pixWrite(outputFiles[i], rotated, IFF_PNG);
        }
    }

}

View File

@ -1,10 +1,7 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
@ -19,7 +16,7 @@ import org.springframework.core.io.ClassPathResource;
import org.springframework.util.FileSystemUtils;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import lombok.SneakyThrows;
@ -50,29 +47,6 @@ public class Pdf2ImgTest {
}
// Renders a test PDF to TIFF pages with Ghostscript, logs the process output and removes the output directory afterwards.
@Test
@SneakyThrows
public void testGhostScript() {

    String outputDir = "/tmp/ghostscript_out/";
    File outputFolder = new File(outputDir);
    outputFolder.mkdirs();

    ClassPathResource pdf = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf");
    String[] command = {"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", pdf.getFile().toString(), "-c", "quit"};
    Process ghostscript = Runtime.getRuntime().exec(command);

    // Both streams must be read concurrently so the process cannot block on a full output buffer.
    // Note that both loggers are created with Type.STD_OUT.
    ProcessIOLogger stdOutLogger = new ProcessIOLogger(ghostscript.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT);
    stdOutLogger.start();
    ProcessIOLogger stdErrLogger = new ProcessIOLogger(ghostscript.getErrorStream(), "GS", ProcessIOLogger.Type.STD_OUT);
    stdErrLogger.start();

    int exitCode = ghostscript.waitFor();
    stdOutLogger.join();
    stdErrLogger.join();

    System.out.println("Ghostscript finished with exit code " + exitCode);
    FileSystemUtils.deleteRecursively(outputFolder);
}
@Test
@SneakyThrows
public void testGhostScriptParallel() {