RED-7669: optimize OCR-module performance
* move all critical processing to its own single dedicated thread
* make the Ghostscript process queue each image as soon as its file has been written
This commit is contained in:
parent
efd3a1d952
commit
955ff6281d
@ -0,0 +1,12 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
public record PageInformation(int height, int width, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,5 +1,14 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) {
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage {
|
||||
|
||||
@Override
|
||||
public Pix asPix() {
|
||||
|
||||
return Leptonica1.pixRead(absoluteFilePath);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
@ -16,29 +17,17 @@ import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class RenderedPageOcrImage implements OcrImage {
|
||||
|
||||
final String absoluteImagePath;
|
||||
final int height;
|
||||
final int width;
|
||||
final PageInformation pageInformation;
|
||||
final Pix pix;
|
||||
@Setter
|
||||
int height;
|
||||
int width;
|
||||
PageInformation pageInformation;
|
||||
Pix pix;
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) {
|
||||
|
||||
this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1));
|
||||
this.absoluteImagePath = renderedPageImageFile.absoluteFilePath();
|
||||
this.pix = Leptonica1.pixRead(absoluteImagePath);
|
||||
this.height = getPix().h;
|
||||
this.width = getPix().w;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getOptimalPageSegmentationMode() {
|
||||
|
||||
@ -107,7 +96,7 @@ public class RenderedPageOcrImage implements OcrImage {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) {
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
@ -116,14 +105,4 @@ public class RenderedPageOcrImage implements OcrImage {
|
||||
return pageWidth / width;
|
||||
}
|
||||
|
||||
|
||||
private record PageInformation(int height, int width, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,9 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
public interface UnprocessedImage {
|
||||
|
||||
Pix asPix();
|
||||
|
||||
}
|
||||
@ -4,18 +4,26 @@ import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.LinkedTransferQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.azure.core.implementation.GeoObjectHelper;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
@ -24,6 +32,7 @@ import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -42,17 +51,19 @@ public class GhostScriptService {
|
||||
String documentAbsolutePath,
|
||||
Path tmpImageDir,
|
||||
PDDocument document,
|
||||
BlockingQueue<OcrImage> imageOutputQueue,
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue,
|
||||
Statistics stats) {
|
||||
|
||||
BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
|
||||
Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
|
||||
asyncTransferThread.start();
|
||||
int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
|
||||
numOfProcesses,
|
||||
settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
|
||||
256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
|
||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||
long timestamp = System.currentTimeMillis();
|
||||
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
|
||||
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
|
||||
|
||||
log.info("Batch {}: Running {} gs processes with ({}) pages each",
|
||||
@ -63,9 +74,9 @@ public class GhostScriptService {
|
||||
int finalBatchIdx = batchIdx;
|
||||
List<Process> processes = processInfos.stream()
|
||||
.parallel()
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles))
|
||||
.peek(s -> log.debug(String.join(" ", s)))
|
||||
.map(this::executeProcess)
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath))
|
||||
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
|
||||
.map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue))
|
||||
.toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
@ -73,14 +84,9 @@ public class GhostScriptService {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx);
|
||||
for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) {
|
||||
OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document);
|
||||
imageOutputQueue.put(image);
|
||||
}
|
||||
|
||||
}
|
||||
asyncTransferThread.interrupt();
|
||||
}
|
||||
|
||||
|
||||
@ -107,20 +113,28 @@ public class GhostScriptService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private String[] buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath,
|
||||
List<RenderedPageImageFile> fullPageImages) {
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath) {
|
||||
|
||||
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
|
||||
|
||||
Map<Integer, RenderedPageImageFile> fullPageImages = new HashMap<>();
|
||||
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
|
||||
Integer pageNumber = stitchedImagePageIndices.get(i);
|
||||
fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
}
|
||||
|
||||
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
|
||||
|
||||
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
|
||||
}
|
||||
|
||||
|
||||
private String[] buildCmdArgs(List<Integer> stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) {
|
||||
|
||||
StringBuilder sPageList = new StringBuilder();
|
||||
int i = 1;
|
||||
for (Integer integer : stitchedImagePageIndices) {
|
||||
@ -131,18 +145,19 @@ public class GhostScriptService {
|
||||
i++;
|
||||
}
|
||||
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
return cmdArgs;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Process executeProcess(String[] cmdArgs) {
|
||||
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue) {
|
||||
|
||||
Process p = Runtime.getRuntime().exec(cmdArgs);
|
||||
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
|
||||
InputStream stdOut = p.getInputStream();
|
||||
ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue);
|
||||
InputStream stdError = p.getErrorStream();
|
||||
ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR);
|
||||
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError);
|
||||
|
||||
stdOutLogger.start();
|
||||
stdErrorLogger.start();
|
||||
@ -150,6 +165,10 @@ public class GhostScriptService {
|
||||
}
|
||||
|
||||
|
||||
private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, RenderedPageImageFile> renderedPageImageFiles) {
|
||||
|
||||
}
|
||||
|
||||
private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {
|
||||
|
||||
}
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Graphics;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -19,14 +17,11 @@ import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
|
||||
@ -14,6 +14,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
@ -32,7 +33,7 @@ public class OcrImageFactory {
|
||||
File documentFile;
|
||||
Path tmpImageDir;
|
||||
GhostScriptService ghostScriptService;
|
||||
BlockingQueue<ExtractedImage> imageProcessingQueue;
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue;
|
||||
ImageProcessingThread imageProcessingThread;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
List<ImageExtractionThread> imageExtractionThreads;
|
||||
@ -45,7 +46,7 @@ public class OcrImageFactory {
|
||||
Path tmpImageDir,
|
||||
int numberOfThreads,
|
||||
GhostScriptService ghostScriptService,
|
||||
BlockingQueue<OcrImage> imageOutputQueue,
|
||||
BlockingQueue<OcrImage> imageOcrQueue,
|
||||
OcrProgressLogger logger,
|
||||
OcrServiceSettings settings,
|
||||
Statistics stats) {
|
||||
@ -54,8 +55,8 @@ public class OcrImageFactory {
|
||||
this.documentFile = documentFile;
|
||||
this.tmpImageDir = tmpImageDir;
|
||||
this.ghostScriptService = ghostScriptService;
|
||||
this.imageOutputQueue = imageOutputQueue;
|
||||
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity());
|
||||
this.imageOutputQueue = imageOcrQueue;
|
||||
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity());
|
||||
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
|
||||
this.stats = stats;
|
||||
|
||||
@ -65,7 +66,7 @@ public class OcrImageFactory {
|
||||
for (int i = 0; i < balancedPageNumbers.size(); i++) {
|
||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
|
||||
}
|
||||
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings);
|
||||
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document);
|
||||
|
||||
log.info("Started {} image extraction threads, with ({}) pages each",
|
||||
imageExtractionThreads.size(),
|
||||
@ -91,7 +92,7 @@ public class OcrImageFactory {
|
||||
}
|
||||
|
||||
if (!stitchedPageNumbers.isEmpty()) {
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
|
||||
}
|
||||
imageProcessingThread.interrupt();
|
||||
log.info("All images extracted, interrupting processing thread.");
|
||||
|
||||
@ -0,0 +1,61 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
|
||||
/*
|
||||
This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class BlockingQueueFiller extends Thread {
|
||||
|
||||
BlockingQueue<RenderedPageImageFile> imageInputQueue;
|
||||
BlockingQueue<UnprocessedImage> imageOutputQueue;
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (true) {
|
||||
try {
|
||||
final UnprocessedImage image = imageInputQueue.take();
|
||||
try {
|
||||
imageOutputQueue.put(image);
|
||||
} catch (InterruptedException e) {
|
||||
imageOutputQueue.put(image);
|
||||
break;
|
||||
}
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// empty the queue
|
||||
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
|
||||
imageInputQueue.drainTo(remainingImages);
|
||||
remainingImages.forEach(image -> {
|
||||
try {
|
||||
imageOutputQueue.put(image);
|
||||
} catch (InterruptedException e) {
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,122 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class GhostScriptOutputHandler extends Thread {
|
||||
|
||||
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
|
||||
final InputStream is;
|
||||
final String processName;
|
||||
final Type type;
|
||||
|
||||
final Map<Integer, RenderedPageImageFile> pagesToProcess;
|
||||
final BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput;
|
||||
|
||||
int currentPageNumber;
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler errorHandler(InputStream is) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null);
|
||||
}
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler stdOut(InputStream is,
|
||||
Map<Integer, RenderedPageImageFile> pagesToProcess,
|
||||
BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
while (true) {
|
||||
line = br.readLine();
|
||||
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error(processName + "_" + type.name() + ">" + line);
|
||||
} else {
|
||||
log.debug(processName + "_" + type.name() + ">" + line);
|
||||
addProcessedImageToQueue(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
if (type.equals(Type.STD_OUT)) {
|
||||
queueFinishedPage(currentPageNumber);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addProcessedImageToQueue(String line) {
|
||||
|
||||
/*
|
||||
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
|
||||
*/
|
||||
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
|
||||
if (pageNumberMatcher.find()) {
|
||||
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
|
||||
|
||||
if (currentPageNumber == 0) {
|
||||
currentPageNumber = pageNumber;
|
||||
return;
|
||||
}
|
||||
|
||||
queueFinishedPage(currentPageNumber);
|
||||
currentPageNumber = pageNumber;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void queueFinishedPage(int pageNumber) {
|
||||
|
||||
var imageFile = this.pagesToProcess.get(pageNumber);
|
||||
if (imageFile == null) {
|
||||
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
|
||||
}
|
||||
assert new File(imageFile.absoluteFilePath()).isFile();
|
||||
renderedPageImageFileOutput.add(imageFile);
|
||||
}
|
||||
|
||||
|
||||
public enum Type {
|
||||
ERROR,
|
||||
STD_OUT
|
||||
}
|
||||
|
||||
}
|
||||
@ -9,6 +9,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
@ -36,7 +37,7 @@ public class ImageExtractionThread extends Thread {
|
||||
OcrServiceSettings settings;
|
||||
|
||||
// output is written to these lists
|
||||
BlockingQueue<ExtractedImage> imageProcessingQueue;
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
|
||||
|
||||
@ -61,7 +62,7 @@ public class ImageExtractionThread extends Thread {
|
||||
}
|
||||
|
||||
for (ExtractedImage image : extractedImages) {
|
||||
imageProcessingQueue.put(image);
|
||||
imageProcessingQueue.put((UnprocessedImage) image);
|
||||
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
|
||||
}
|
||||
}
|
||||
|
||||
@ -2,15 +2,22 @@ package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
|
||||
import java.lang.annotation.Documented;
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
@ -35,11 +42,12 @@ import net.sourceforge.tess4j.TessAPI1;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageProcessingThread extends Thread {
|
||||
|
||||
BlockingQueue<ExtractedImage> imageInputQueue;
|
||||
BlockingQueue<UnprocessedImage> imageInputQueue;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
PDDocument document;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@ -49,7 +57,7 @@ public class ImageProcessingThread extends Thread {
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (true) {
|
||||
try {
|
||||
final ExtractedImage image = imageInputQueue.take();
|
||||
final UnprocessedImage image = imageInputQueue.take();
|
||||
OcrImage extractedOcrImage = this.process(image);
|
||||
try {
|
||||
imageOutputQueue.put(extractedOcrImage);
|
||||
@ -62,9 +70,8 @@ public class ImageProcessingThread extends Thread {
|
||||
break;
|
||||
}
|
||||
}
|
||||
log.info("Leaving initial uninterrupted loop!");
|
||||
// empty the queue
|
||||
List<ExtractedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
|
||||
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
|
||||
imageInputQueue.drainTo(remainingImages);
|
||||
remainingImages.forEach(image -> {
|
||||
OcrImage ocrImage = this.process(image);
|
||||
@ -79,21 +86,61 @@ public class ImageProcessingThread extends Thread {
|
||||
}
|
||||
|
||||
|
||||
private OcrImage process(ExtractedImage extractedImage) {
|
||||
private OcrImage process(UnprocessedImage unprocessedImage) {
|
||||
|
||||
long timestamp = System.currentTimeMillis();
|
||||
|
||||
OcrImage ocrImage;
|
||||
if (unprocessedImage instanceof ExtractedImage extractedImage) {
|
||||
ocrImage = processExtractedImage(extractedImage);
|
||||
} else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) {
|
||||
ocrImage = processRenderedPageImageFile(renderedPageImageFile);
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass()));
|
||||
}
|
||||
|
||||
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
|
||||
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
|
||||
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
||||
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(renderedPageImageFile.asPix());
|
||||
Pix despeckled = ImageProcessingUtils.despecklePix(grayScale);
|
||||
|
||||
int orientDegree = detectOrientation(despeckled, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = switch (360 - orientDegree) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(despeckled, 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(despeckled, 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(despeckled, 3);
|
||||
default -> despeckled;
|
||||
};
|
||||
|
||||
OcrImage ocrImage = new RenderedPageOcrImage(despeckled.h,
|
||||
despeckled.w,
|
||||
PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)),
|
||||
rotatedPix,
|
||||
orientDegree);
|
||||
|
||||
if (despeckled != rotatedPix) {
|
||||
LeptUtils.disposePix(despeckled);
|
||||
}
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
|
||||
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
|
||||
|
||||
float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72));
|
||||
|
||||
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = switch (360 - orientDegree) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(pix, 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(pix, 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(pix, 3);
|
||||
default -> pix;
|
||||
};
|
||||
OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(),
|
||||
Pix rotatedPix = getRotatedPix(orientDegree, pix);
|
||||
|
||||
OcrImage ocrImage = new ExtractedOcrImage(extractedImage.getPageNumber(),
|
||||
extractedImage.getNumberOnPage(),
|
||||
extractedImage.getHeight(),
|
||||
extractedImage.getWidth(),
|
||||
@ -106,10 +153,18 @@ public class ImageProcessingThread extends Thread {
|
||||
if (pix != rotatedPix) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
|
||||
|
||||
return extractedOcrImage;
|
||||
private static Pix getRotatedPix(int orientDegree, Pix pix) {
|
||||
|
||||
return switch (360 - orientDegree) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(pix, 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(pix, 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(pix, 3);
|
||||
default -> pix;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,55 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ProcessIOLogger extends Thread {
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
|
||||
InputStream is;
|
||||
String processName;
|
||||
Type type;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
while (true) {
|
||||
line = br.readLine();
|
||||
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error(processName + "_" + type.name() + ">" + line);
|
||||
} else {
|
||||
log.debug(processName + "_" + type.name() + ">" + line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
}
|
||||
|
||||
|
||||
public enum Type {
|
||||
ERROR,
|
||||
STD_OUT
|
||||
}
|
||||
|
||||
}
|
||||
@ -12,14 +12,14 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class OcrServiceSettings {
|
||||
|
||||
int ocrThreadCount = 16; // Number of OCR threads
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||
int gsProcessCount = 2; // Number of Ghostscript processes
|
||||
int gsProcessCount = 1; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
int minImageWidth = 20; // Minimum width for images to be processed
|
||||
boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean removeWatermark; // If true, watermarks will be removed
|
||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||
|
||||
@ -88,10 +88,12 @@ public class ImageProcessingUtils {
|
||||
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
} else {
|
||||
} else if (pix.d == 1) {
|
||||
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,10 +1,7 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
@ -19,7 +16,7 @@ import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -50,29 +47,6 @@ public class Pdf2ImgTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGhostScript() {
|
||||
|
||||
String outputDir = "/tmp/ghostscript_out/";
|
||||
new File(outputDir).mkdirs();
|
||||
ClassPathResource resource = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf");
|
||||
|
||||
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", resource.getFile().toString(), "-c", "quit"};
|
||||
Process p = Runtime.getRuntime().exec(cmdArgs);
|
||||
ProcessIOLogger logger = new ProcessIOLogger(p.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
logger.start();
|
||||
ProcessIOLogger errorLogger = new ProcessIOLogger(p.getErrorStream(), "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
errorLogger.start();
|
||||
int exitcode = p.waitFor();
|
||||
logger.join();
|
||||
errorLogger.join();
|
||||
System.out.println("Ghostscript finished with exit code " + exitcode);
|
||||
FileSystemUtils.deleteRecursively(new File(outputDir));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGhostScriptParallel() {
|
||||
|
||||
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcr() {
|
||||
|
||||
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
||||
String text = testOCR("files/StitchedImagesMultiPage.pdf");
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user