RED-7669: optimize OCR-module performance

* move all critical stuff to its own singleton thread
* make gs process queue any image once the file has been written
This commit is contained in:
Kilian Schuettler 2023-11-23 14:56:00 +01:00
parent efd3a1d952
commit 955ff6281d
16 changed files with 353 additions and 169 deletions

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.ocr.processor.model;
import org.apache.pdfbox.pdmodel.PDPage;
/**
 * Integer page geometry taken from a PDF page: crop box height and width, the page number and the page rotation in degrees.
 */
public record PageInformation(int height, int width, int number, int rotationDegrees) {

    /**
     * Builds the page information from a PDFBox page.
     *
     * @param pageNum the page number to store; it is passed through unchanged
     * @param page    the page whose crop box and rotation are read
     * @return the page information, with crop box height and width truncated to {@code int}
     */
    public static PageInformation fromPDPage(int pageNum, PDPage page) {

        var cropBox = page.getCropBox();
        // The casts truncate the float dimensions towards zero.
        int truncatedHeight = (int) cropBox.getHeight();
        int truncatedWidth = (int) cropBox.getWidth();
        return new PageInformation(truncatedHeight, truncatedWidth, pageNum, page.getRotation());
    }

}

View File

@ -1,5 +1,14 @@
package com.knecon.fforesight.service.ocr.processor.model;
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) {
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * Reference to a rendered page image on disk: the page number it belongs to and the absolute path of the image file.
 */
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage {
/**
 * Reads the image file at {@code absoluteFilePath} with Leptonica.
 * Nothing is cached in the record, so every call reads the file again.
 * NOTE(review): pixRead presumably returns null when the file cannot be read; no null check happens here - confirm callers cope.
 */
@Override
public Pix asPix() {
return Leptonica1.pixRead(absoluteFilePath);
}
}

View File

@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
@ -16,29 +17,17 @@ import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
@Getter
@FieldDefaults(level = AccessLevel.PRIVATE)
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class RenderedPageOcrImage implements OcrImage {
final String absoluteImagePath;
final int height;
final int width;
final PageInformation pageInformation;
final Pix pix;
@Setter
int height;
int width;
PageInformation pageInformation;
Pix pix;
int rotationDegrees;
@SneakyThrows
public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) {
this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1));
this.absoluteImagePath = renderedPageImageFile.absoluteFilePath();
this.pix = Leptonica1.pixRead(absoluteImagePath);
this.height = getPix().h;
this.width = getPix().w;
}
@Override
public int getOptimalPageSegmentationMode() {
@ -107,7 +96,7 @@ public class RenderedPageOcrImage implements OcrImage {
// PDFBox always returns page height and width based on rotation
double pageWidth;
if (pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) {
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
pageWidth = pageInformation.height();
} else {
pageWidth = pageInformation.width();
@ -116,14 +105,4 @@ public class RenderedPageOcrImage implements OcrImage {
return pageWidth / width;
}
private record PageInformation(int height, int width, int number, int rotationDegrees) {
public static PageInformation fromPDPage(int pageNum, PDPage page) {
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
}
}
}

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.service.ocr.processor.model;
import net.sourceforge.lept4j.Pix;
/**
 * An image that can be handed out as a Leptonica {@link Pix}.
 * It is the element type of the image processing queue consumed by the image processing thread.
 */
public interface UnprocessedImage {
/**
 * @return this image as a Leptonica {@link Pix}
 * NOTE(review): presumably the caller owns the returned Pix and must dispose it - confirm against the implementations.
 */
Pix asPix();
}

View File

@ -4,18 +4,26 @@ import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedTransferQueue;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.azure.core.implementation.GeoObjectHelper;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
@ -24,6 +32,7 @@ import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
@Slf4j
@Service
@ -42,17 +51,19 @@ public class GhostScriptService {
String documentAbsolutePath,
Path tmpImageDir,
PDDocument document,
BlockingQueue<OcrImage> imageOutputQueue,
BlockingQueue<UnprocessedImage> imageProcessingQueue,
Statistics stats) {
BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
asyncTransferThread.start();
int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
numOfProcesses,
settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
long timestamp = System.currentTimeMillis();
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
log.info("Batch {}: Running {} gs processes with ({}) pages each",
@ -63,9 +74,9 @@ public class GhostScriptService {
int finalBatchIdx = batchIdx;
List<Process> processes = processInfos.stream()
.parallel()
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles))
.peek(s -> log.debug(String.join(" ", s)))
.map(this::executeProcess)
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath))
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
.map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue))
.toList();
List<Integer> processExitCodes = new LinkedList<>();
@ -73,14 +84,9 @@ public class GhostScriptService {
processExitCodes.add(process.waitFor());
}
stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx);
for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) {
OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document);
imageOutputQueue.put(image);
}
}
asyncTransferThread.interrupt();
}
@ -107,20 +113,28 @@ public class GhostScriptService {
@SneakyThrows
private String[] buildCmdArgs(Integer processIdx,
Integer batchIdx,
List<Integer> stitchedImagePageIndices,
Path outputDir,
String documentAbsolutePath,
List<RenderedPageImageFile> fullPageImages) {
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
Integer batchIdx,
List<Integer> stitchedImagePageIndices,
Path outputDir,
String documentAbsolutePath) {
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
Map<Integer, RenderedPageImageFile> fullPageImages = new HashMap<>();
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
Integer pageNumber = stitchedImagePageIndices.get(i);
fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
}
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
}
private String[] buildCmdArgs(List<Integer> stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) {
StringBuilder sPageList = new StringBuilder();
int i = 1;
for (Integer integer : stitchedImagePageIndices) {
@ -131,18 +145,19 @@ public class GhostScriptService {
i++;
}
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
return cmdArgs;
}
@SneakyThrows
private Process executeProcess(String[] cmdArgs) {
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue) {
Process p = Runtime.getRuntime().exec(cmdArgs);
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
InputStream stdOut = p.getInputStream();
ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT);
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue);
InputStream stdError = p.getErrorStream();
ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR);
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError);
stdOutLogger.start();
stdErrorLogger.start();
@ -150,6 +165,10 @@ public class GhostScriptService {
}
// Pairs the command line of one Ghostscript call (cmdArgs) with the image files that call is expected to write.
// The map is keyed by page number; each entry holds the output path derived from the output file pattern.
private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, RenderedPageImageFile> renderedPageImageFiles) {
}
// The page numbers assigned to one Ghostscript process, together with the index of that process.
// Both values are passed to buildCmdArgs to create the command line and the output file names.
private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {
}

View File

@ -1,8 +1,6 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
@ -19,14 +17,11 @@ import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;

View File

@ -14,6 +14,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
@ -32,7 +33,7 @@ public class OcrImageFactory {
File documentFile;
Path tmpImageDir;
GhostScriptService ghostScriptService;
BlockingQueue<ExtractedImage> imageProcessingQueue;
BlockingQueue<UnprocessedImage> imageProcessingQueue;
ImageProcessingThread imageProcessingThread;
BlockingQueue<OcrImage> imageOutputQueue;
List<ImageExtractionThread> imageExtractionThreads;
@ -45,7 +46,7 @@ public class OcrImageFactory {
Path tmpImageDir,
int numberOfThreads,
GhostScriptService ghostScriptService,
BlockingQueue<OcrImage> imageOutputQueue,
BlockingQueue<OcrImage> imageOcrQueue,
OcrProgressLogger logger,
OcrServiceSettings settings,
Statistics stats) {
@ -54,8 +55,8 @@ public class OcrImageFactory {
this.documentFile = documentFile;
this.tmpImageDir = tmpImageDir;
this.ghostScriptService = ghostScriptService;
this.imageOutputQueue = imageOutputQueue;
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity());
this.imageOutputQueue = imageOcrQueue;
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity());
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
this.stats = stats;
@ -65,7 +66,7 @@ public class OcrImageFactory {
for (int i = 0; i < balancedPageNumbers.size(); i++) {
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
}
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings);
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document);
log.info("Started {} image extraction threads, with ({}) pages each",
imageExtractionThreads.size(),
@ -91,7 +92,7 @@ public class OcrImageFactory {
}
if (!stitchedPageNumbers.isEmpty()) {
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
}
imageProcessingThread.interrupt();
log.info("All images extracted, interrupting processing thread.");

View File

@ -0,0 +1,61 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.TessAPI1;
/*
This just moves the elements from the GhostScriptOutputHandler into the image processing queue asynchronously.
*/
/**
 * Thread that forwards rendered page image files from an input queue to the image processing queue.
 * <p>
 * Interrupting the thread signals that no further images will be produced: the thread then leaves its
 * take-loop, forwards whatever is still in the input queue and terminates. An interrupt never causes an
 * image that was already taken or drained to be dropped.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class BlockingQueueFiller extends Thread {

    BlockingQueue<RenderedPageImageFile> imageInputQueue;
    BlockingQueue<UnprocessedImage> imageOutputQueue;


    /**
     * Forwards images until interrupted, then flushes the remaining content of the input queue.
     */
    @Override
    public void run() {

        // Interrupting signals that the image extraction has finished
        while (true) {
            final RenderedPageImageFile image;
            try {
                image = imageInputQueue.take();
            } catch (InterruptedException e) {
                break;
            }
            if (putRetryingOnInterrupt(image)) {
                // the interrupt arrived while forwarding; the image has been delivered, now flush the rest
                break;
            }
        }

        // empty the queue
        List<RenderedPageImageFile> remainingImages = new ArrayList<>(imageInputQueue.size());
        imageInputQueue.drainTo(remainingImages);
        remainingImages.forEach(this::putRetryingOnInterrupt);
    }


    /**
     * Puts the image into the output queue and retries whenever the blocking put is interrupted, so the
     * image is always delivered.
     *
     * @param image the image to forward
     * @return {@code true} if at least one interrupt was observed while waiting for space in the output queue
     */
    private boolean putRetryingOnInterrupt(UnprocessedImage image) {

        boolean interrupted = false;
        while (true) {
            try {
                imageOutputQueue.put(image);
                return interrupted;
            } catch (InterruptedException e) {
                // The interrupt only means "finish up"; losing the image would lose a page, so try again.
                log.debug("Interrupted while forwarding an image, retrying", e);
                interrupted = true;
            }
        }
    }

}

View File

@ -0,0 +1,122 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GhostScriptOutputHandler extends Thread {
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
final InputStream is;
final String processName;
final Type type;
final Map<Integer, RenderedPageImageFile> pagesToProcess;
final BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput;
int currentPageNumber;
public static GhostScriptOutputHandler errorHandler(InputStream is) {
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null);
}
public static GhostScriptOutputHandler stdOut(InputStream is,
Map<Integer, RenderedPageImageFile> pagesToProcess,
BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput) {
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput);
}
@SneakyThrows
public void run() {
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
while (true) {
line = br.readLine();
if (line == null) {
break;
}
if (type.equals(Type.ERROR)) {
log.error(processName + "_" + type.name() + ">" + line);
} else {
log.debug(processName + "_" + type.name() + ">" + line);
addProcessedImageToQueue(line);
}
}
}
is.close();
if (type.equals(Type.STD_OUT)) {
queueFinishedPage(currentPageNumber);
}
}
private void addProcessedImageToQueue(String line) {
/*
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
*/
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
if (pageNumberMatcher.find()) {
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
if (currentPageNumber == 0) {
currentPageNumber = pageNumber;
return;
}
queueFinishedPage(currentPageNumber);
currentPageNumber = pageNumber;
}
}
private void queueFinishedPage(int pageNumber) {
var imageFile = this.pagesToProcess.get(pageNumber);
if (imageFile == null) {
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
}
assert new File(imageFile.absoluteFilePath()).isFile();
renderedPageImageFileOutput.add(imageFile);
}
public enum Type {
ERROR,
STD_OUT
}
}

View File

@ -9,6 +9,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
@ -36,7 +37,7 @@ public class ImageExtractionThread extends Thread {
OcrServiceSettings settings;
// output is written to these lists
BlockingQueue<ExtractedImage> imageProcessingQueue;
BlockingQueue<UnprocessedImage> imageProcessingQueue;
List<Integer> stitchedPageNumbers;
@ -61,7 +62,7 @@ public class ImageExtractionThread extends Thread {
}
for (ExtractedImage image : extractedImages) {
imageProcessingQueue.put(image);
imageProcessingQueue.put((UnprocessedImage) image);
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
}
}

View File

@ -2,15 +2,22 @@ package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import java.lang.annotation.Documented;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
@ -35,11 +42,12 @@ import net.sourceforge.tess4j.TessAPI1;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageProcessingThread extends Thread {
BlockingQueue<ExtractedImage> imageInputQueue;
BlockingQueue<UnprocessedImage> imageInputQueue;
BlockingQueue<OcrImage> imageOutputQueue;
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
Statistics stats;
OcrServiceSettings settings;
PDDocument document;
@SneakyThrows
@ -49,7 +57,7 @@ public class ImageProcessingThread extends Thread {
// Interrupting signals that the image extraction has finished
while (true) {
try {
final ExtractedImage image = imageInputQueue.take();
final UnprocessedImage image = imageInputQueue.take();
OcrImage extractedOcrImage = this.process(image);
try {
imageOutputQueue.put(extractedOcrImage);
@ -62,9 +70,8 @@ public class ImageProcessingThread extends Thread {
break;
}
}
log.info("Leaving initial uninterrupted loop!");
// empty the queue
List<ExtractedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
imageInputQueue.drainTo(remainingImages);
remainingImages.forEach(image -> {
OcrImage ocrImage = this.process(image);
@ -79,21 +86,61 @@ public class ImageProcessingThread extends Thread {
}
private OcrImage process(ExtractedImage extractedImage) {
/**
 * Turns an unprocessed image into an OCR image and adds the elapsed time to the image processing statistics.
 *
 * @param unprocessedImage either a {@link RenderedPageImageFile} or an {@link ExtractedImage}
 * @return the image prepared for OCR
 * @throws UnsupportedOperationException if the image is of neither supported type; no time is recorded in that case
 */
private OcrImage process(UnprocessedImage unprocessedImage) {

    final long startedAt = System.currentTimeMillis();

    final OcrImage result;
    if (unprocessedImage instanceof RenderedPageImageFile pageFile) {
        result = processRenderedPageImageFile(pageFile);
    } else if (unprocessedImage instanceof ExtractedImage extracted) {
        result = processExtractedImage(extracted);
    } else {
        throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass()));
    }

    long elapsedMillis = System.currentTimeMillis() - startedAt;
    stats.increaseImageProcessing(elapsedMillis);
    return result;
}
/**
 * Prepares a rendered page image for OCR: loads it, converts it to gray scale, despeckles it, detects its
 * orientation and rotates it accordingly.
 *
 * @param renderedPageImageFile the rendered page; its page number is used with {@code document.getPage(pageNumber - 1)}
 * @return the OCR image holding the rotated pix, the detected orientation and the page information;
 * height and width are those of the despeckled pix before rotation
 */
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {

    Pix grayScale = ImageProcessingUtils.convertToGrayScale(renderedPageImageFile.asPix());
    Pix despeckled = ImageProcessingUtils.despecklePix(grayScale);

    int orientDegree = detectOrientation(despeckled, settings.getDpi(), detectionScriptHandle);

    // Same rotation rule as for extracted images, so use the shared helper instead of a second copy of the switch.
    Pix rotatedPix = getRotatedPix(orientDegree, despeckled);

    int pageNumber = renderedPageImageFile.pageNumber();
    PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, document.getPage(pageNumber - 1));

    OcrImage ocrImage = new RenderedPageOcrImage(despeckled.h, despeckled.w, pageInformation, rotatedPix, orientDegree);

    // When no rotation was needed both variables refer to the same pix, which is still in use by the OCR image.
    if (despeckled != rotatedPix) {
        LeptUtils.disposePix(despeckled);
    }
    return ocrImage;
}
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72));
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
Pix rotatedPix = switch (360 - orientDegree) {
case 90 -> Leptonica1.pixRotateOrth(pix, 1);
case 180 -> Leptonica1.pixRotateOrth(pix, 2);
case 270 -> Leptonica1.pixRotateOrth(pix, 3);
default -> pix;
};
OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(),
Pix rotatedPix = getRotatedPix(orientDegree, pix);
OcrImage ocrImage = new ExtractedOcrImage(extractedImage.getPageNumber(),
extractedImage.getNumberOnPage(),
extractedImage.getHeight(),
extractedImage.getWidth(),
@ -106,10 +153,18 @@ public class ImageProcessingThread extends Thread {
if (pix != rotatedPix) {
LeptUtils.disposePix(pix);
}
return ocrImage;
}
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
return extractedOcrImage;
/**
 * Rotates the pix in 90 degree steps depending on the detected orientation.
 *
 * @param orientDegree the detected orientation in degrees
 * @param pix          the pix to rotate
 * @return a pix rotated by {@code (360 - orientDegree) / 90} quarter turns for 90, 180 and 270 degrees;
 * the given pix itself for every other value
 */
private static Pix getRotatedPix(int orientDegree, Pix pix) {

    // 360 - orientDegree is 90, 180 or 270 exactly when orientDegree is 270, 180 or 90.
    int quarterTurns = switch (orientDegree) {
        case 270 -> 1;
        case 180 -> 2;
        case 90 -> 3;
        default -> 0;
    };
    if (quarterTurns == 0) {
        return pix;
    }
    return Leptonica1.pixRotateOrth(pix, quarterTurns);
}

View File

@ -1,55 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ProcessIOLogger extends Thread {
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
InputStream is;
String processName;
Type type;
@SneakyThrows
public void run() {
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
while (true) {
line = br.readLine();
if (line == null) {
break;
}
if (type.equals(Type.ERROR)) {
log.error(processName + "_" + type.name() + ">" + line);
} else {
log.debug(processName + "_" + type.name() + ">" + line);
}
}
}
is.close();
}
public enum Type {
ERROR,
STD_OUT
}
}

View File

@ -12,14 +12,14 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class OcrServiceSettings {
int ocrThreadCount = 16; // Number of OCR threads
int ocrThreadCount = 4; // Number of OCR threads
int imageExtractThreadCount = 2; // Number of image extraction threads
int gsProcessCount = 2; // Number of Ghostscript processes
int gsProcessCount = 1; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean removeWatermark; // If true, watermarks will be removed
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");

View File

@ -88,10 +88,12 @@ public class ImageProcessingUtils {
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
LeptUtils.disposePix(pix);
return grayScale;
} else {
} else if (pix.d == 1) {
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
LeptUtils.disposePix(pix);
return grayScale;
} else {
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
}
}

View File

@ -1,10 +1,7 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
@ -19,7 +16,7 @@ import org.springframework.core.io.ClassPathResource;
import org.springframework.util.FileSystemUtils;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import lombok.SneakyThrows;
@ -50,29 +47,6 @@ public class Pdf2ImgTest {
}
/**
 * Renders a test PDF with a single Ghostscript process while logging both of its output streams.
 * The output directory is removed afterwards, also when the test fails.
 */
@Test
@SneakyThrows
public void testGhostScript() {

    String outputDir = "/tmp/ghostscript_out/";
    new File(outputDir).mkdirs();
    try {
        ClassPathResource resource = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf");
        String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", resource.getFile().toString(), "-c", "quit"};
        Process p = Runtime.getRuntime().exec(cmdArgs);

        // Both streams are read in their own thread so that a full buffer cannot block the process.
        ProcessIOLogger logger = new ProcessIOLogger(p.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT);
        logger.start();
        // The error stream has to be logged as ERROR, otherwise its lines only show up on debug level.
        ProcessIOLogger errorLogger = new ProcessIOLogger(p.getErrorStream(), "GS", ProcessIOLogger.Type.ERROR);
        errorLogger.start();

        int exitcode = p.waitFor();
        logger.join();
        errorLogger.join();
        System.out.println("Ghostscript finished with exit code " + exitcode);
    } finally {
        FileSystemUtils.deleteRecursively(new File(outputDir));
    }
}
@Test
@SneakyThrows
public void testGhostScriptParallel() {

View File

@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcr() {
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
String text = testOCR("files/StitchedImagesMultiPage.pdf");
}