diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java new file mode 100644 index 0000000..4935eda --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java @@ -0,0 +1,12 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import org.apache.pdfbox.pdmodel.PDPage; + +public record PageInformation(int height, int width, int number, int rotationDegrees) { + + public static PageInformation fromPDPage(int pageNum, PDPage page) { + + return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation()); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java index 2b773fe..4bb78fb 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java @@ -1,5 +1,14 @@ package com.knecon.fforesight.service.ocr.processor.model; -public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) { +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; + +public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage { + + @Override + public Pix asPix() { + + return Leptonica1.pixRead(absoluteFilePath); + } } diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java index 42abff4..1141eb5 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java @@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDPage; import lombok.AccessLevel; import lombok.Getter; +import lombok.RequiredArgsConstructor; import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; @@ -16,29 +17,17 @@ import net.sourceforge.lept4j.Pix; import net.sourceforge.tess4j.ITessAPI; @Getter -@FieldDefaults(level = AccessLevel.PRIVATE) +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class RenderedPageOcrImage implements OcrImage { - final String absoluteImagePath; - final int height; - final int width; - final PageInformation pageInformation; - final Pix pix; - @Setter + int height; + int width; + PageInformation pageInformation; + Pix pix; int rotationDegrees; - @SneakyThrows - public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) { - - this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)); - this.absoluteImagePath = renderedPageImageFile.absoluteFilePath(); - this.pix = Leptonica1.pixRead(absoluteImagePath); - this.height = getPix().h; - this.width = getPix().w; - } - - @Override public int getOptimalPageSegmentationMode() { @@ -107,7 +96,7 @@ public class RenderedPageOcrImage implements OcrImage { // PDFBox always returns page height and width based on rotation double pageWidth; - if 
(pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) { + if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) { pageWidth = pageInformation.height(); } else { pageWidth = pageInformation.width(); @@ -116,14 +105,4 @@ public class RenderedPageOcrImage implements OcrImage { return pageWidth / width; } - - private record PageInformation(int height, int width, int number, int rotationDegrees) { - - public static PageInformation fromPDPage(int pageNum, PDPage page) { - - return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation()); - } - - } - } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java new file mode 100644 index 0000000..6facc56 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java @@ -0,0 +1,9 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import net.sourceforge.lept4j.Pix; + +public interface UnprocessedImage { + + Pix asPix(); + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java index 1a4b54e..a767f91 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java @@ -4,18 +4,26 @@ import java.io.InputStream; import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; 
+import java.util.HashMap; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.LinkedTransferQueue; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; +import com.azure.core.implementation.GeoObjectHelper; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.PageInformation; import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage; -import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; +import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller; +import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils; @@ -24,6 +32,7 @@ import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; +import net.sourceforge.lept4j.Pix; @Slf4j @Service @@ -42,17 +51,19 @@ public class GhostScriptService { String documentAbsolutePath, Path tmpImageDir, PDDocument document, - BlockingQueue imageOutputQueue, + BlockingQueue imageProcessingQueue, Statistics stats) { + BlockingQueue imageFileCollectorQueue = new LinkedBlockingDeque<>(); + Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue); + asyncTransferThread.start(); int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size()); List> processInfoBatches = 
buildSubListForEachProcess(stitchedPageNumbers, numOfProcesses, - settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads + 256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) { long timestamp = System.currentTimeMillis(); - List renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>()); List processInfos = processInfoBatches.get(batchIdx); log.info("Batch {}: Running {} gs processes with ({}) pages each", @@ -63,9 +74,9 @@ public class GhostScriptService { int finalBatchIdx = batchIdx; List processes = processInfos.stream() .parallel() - .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles)) - .peek(s -> log.debug(String.join(" ", s))) - .map(this::executeProcess) + .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath)) + .peek(s -> log.debug(String.join(" ", s.cmdArgs()))) + .map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue)) .toList(); List processExitCodes = new LinkedList<>(); @@ -73,14 +84,9 @@ public class GhostScriptService { processExitCodes.add(process.waitFor()); } stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp); - log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx); - for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) { - OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document); - imageOutputQueue.put(image); - } - } + asyncTransferThread.interrupt(); } @@ -107,20 +113,28 @@ public class GhostScriptService { @SneakyThrows - private String[] buildCmdArgs(Integer processIdx, - Integer batchIdx, - List 
stitchedImagePageIndices, - Path outputDir, - String documentAbsolutePath, - List fullPageImages) { + private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx, + Integer batchIdx, + List stitchedImagePageIndices, + Path outputDir, + String documentAbsolutePath) { String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString(); + Map fullPageImages = new HashMap<>(); for (int i = 0; i < stitchedImagePageIndices.size(); i++) { Integer pageNumber = stitchedImagePageIndices.get(i); - fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1))); + fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1))); } + String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat); + + return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages); + } + + + private String[] buildCmdArgs(List stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) { + StringBuilder sPageList = new StringBuilder(); int i = 1; for (Integer integer : stitchedImagePageIndices) { @@ -131,18 +145,19 @@ public class GhostScriptService { i++; } - return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"}; + String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"}; + return cmdArgs; } @SneakyThrows - private Process executeProcess(String[] cmdArgs) { + private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue imageFileCollectorQueue) { - Process p = Runtime.getRuntime().exec(cmdArgs); + Process p = Runtime.getRuntime().exec(processInfo.cmdArgs()); InputStream stdOut = p.getInputStream(); - 
ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT); + GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue); InputStream stdError = p.getErrorStream(); - ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR); + GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError); stdOutLogger.start(); stdErrorLogger.start(); @@ -150,6 +165,10 @@ public class GhostScriptService { } + private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map renderedPageImageFiles) { + + } + private record ProcessInfo(Integer processIdx, List stitchedPageNumbers) { } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index 662ae5b..0c64115 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -1,8 +1,6 @@ package com.knecon.fforesight.service.ocr.processor.service; -import java.awt.Graphics; import java.awt.geom.Rectangle2D; -import java.awt.image.BufferedImage; import java.io.IOException; import java.util.LinkedList; import java.util.List; @@ -19,14 +17,11 @@ import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import 
org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; -import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java index e762d6e..2e913e2 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java @@ -14,6 +14,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread; import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; @@ -32,7 +33,7 @@ public class OcrImageFactory { File documentFile; Path tmpImageDir; GhostScriptService ghostScriptService; - BlockingQueue imageProcessingQueue; + BlockingQueue imageProcessingQueue; ImageProcessingThread imageProcessingThread; BlockingQueue imageOutputQueue; List imageExtractionThreads; @@ -45,7 +46,7 @@ public class OcrImageFactory { Path tmpImageDir, int numberOfThreads, GhostScriptService ghostScriptService, - BlockingQueue imageOutputQueue, + BlockingQueue imageOcrQueue, 
OcrProgressLogger logger, OcrServiceSettings settings, Statistics stats) { @@ -54,8 +55,8 @@ public class OcrImageFactory { this.documentFile = documentFile; this.tmpImageDir = tmpImageDir; this.ghostScriptService = ghostScriptService; - this.imageOutputQueue = imageOutputQueue; - this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity()); + this.imageOutputQueue = imageOcrQueue; + this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity()); this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>()); this.stats = stats; @@ -65,7 +66,7 @@ public class OcrImageFactory { for (int i = 0; i < balancedPageNumbers.size(); i++) { imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers)); } - this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings); + this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document); log.info("Started {} image extraction threads, with ({}) pages each", imageExtractionThreads.size(), @@ -91,7 +92,7 @@ public class OcrImageFactory { } if (!stitchedPageNumbers.isEmpty()) { - ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats); + ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats); } imageProcessingThread.interrupt(); log.info("All images extracted, interrupting processing thread."); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java new file mode 100644 index 0000000..554e190 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java @@ -0,0 +1,61 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; + +import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +import net.sourceforge.tess4j.TessAPI1; + + +/* +Forwards the images collected by the GhostScriptOutputHandler from imageInputQueue into imageOutputQueue (the image processing queue) asynchronously. Interrupting this thread signals that no further images will arrive; it then forwards whatever is still queued and terminates. + */ +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class BlockingQueueFiller extends Thread { + + BlockingQueue<? extends UnprocessedImage> imageInputQueue; + BlockingQueue<? super UnprocessedImage> imageOutputQueue; + + @SneakyThrows + @Override + public void run() { + + // An interrupt signals that no further images will be queued, so it ends the forwarding loop + while (true) { + try { + final UnprocessedImage image = imageInputQueue.take(); + try { + imageOutputQueue.put(image); + } catch (InterruptedException e) { + imageOutputQueue.put(image); + break; + } + + } catch (InterruptedException e) { + break; + } + } + // forward whatever is still waiting in the input queue before terminating + List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size()); + imageInputQueue.drainTo(remainingImages); + remainingImages.forEach(image -> { + try { + imageOutputQueue.put(image); + } catch (InterruptedException e) { + log.error("Interrupted while forwarding a remaining image, the image is dropped", e); + } + }); + + } +} diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java new file mode 100644 index 0000000..0dd0c60 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java @@ -0,0 +1,122 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import java.io.BufferedReader; +import java.io.File; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.BlockingQueue; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class GhostScriptOutputHandler extends Thread { + + static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)"); + + // If the stdError or stdOut buffer of a process is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock. + // Since both need to read simultaneously we need to implement the readers as separate threads. 
+ + final InputStream is; + final String processName; + final Type type; + + final Map<Integer, RenderedPageImageFile> pagesToProcess; + final BlockingQueue<? super RenderedPageImageFile> renderedPageImageFileOutput; + + int currentPageNumber; + + + public static GhostScriptOutputHandler errorHandler(InputStream is) { + + return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null); + } + + + public static GhostScriptOutputHandler stdOut(InputStream is, + Map<Integer, RenderedPageImageFile> pagesToProcess, + BlockingQueue<? super RenderedPageImageFile> renderedPageImageFileOutput) { + + return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput); + } + + + @SneakyThrows + public void run() { + + try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) { + + String line; + while (true) { + line = br.readLine(); + + if (line == null) { + break; + } + + if (type.equals(Type.ERROR)) { + log.error(processName + "_" + type.name() + ">" + line); + } else { + log.debug(processName + "_" + type.name() + ">" + line); + addProcessedImageToQueue(line); + } + } + } + is.close(); + if (type.equals(Type.STD_OUT) && currentPageNumber != 0) { /* currentPageNumber stays 0 when no page number was ever read from the stream, so there is no last page to queue */ + queueFinishedPage(currentPageNumber); + } + + } + + + private void addProcessedImageToQueue(String line) { + + /* + Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in. The last page is queued by run() once the stream has ended. + */ + Matcher pageNumberMatcher = pageFinishedPattern.matcher(line); + if (pageNumberMatcher.find()) { + int pageNumber = Integer.parseInt(pageNumberMatcher.group(1)); + + if (currentPageNumber == 0) { + currentPageNumber = pageNumber; + return; + } + + queueFinishedPage(currentPageNumber); + currentPageNumber = pageNumber; + } + } + + + private void queueFinishedPage(int pageNumber) { + + var imageFile = this.pagesToProcess.get(pageNumber); + if (imageFile == null) { + throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. 
It only has pagenumbers %s", pageNumber, pagesToProcess.keySet())); + } + assert new File(imageFile.absoluteFilePath()).isFile(); + renderedPageImageFileOutput.add(imageFile); + } + + + public enum Type { + ERROR, + STD_OUT + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index 89161f6..c81067a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -9,6 +9,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; @@ -36,7 +37,7 @@ public class ImageExtractionThread extends Thread { OcrServiceSettings settings; // output is written to these lists - BlockingQueue imageProcessingQueue; + BlockingQueue imageProcessingQueue; List stitchedPageNumbers; @@ -61,7 +62,7 @@ public class ImageExtractionThread extends Thread { } for (ExtractedImage image : extractedImages) { - imageProcessingQueue.put(image); + imageProcessingQueue.put((UnprocessedImage) image); logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java index e42185f..492f571 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -2,15 +2,22 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import static net.sourceforge.tess4j.ITessAPI.TRUE; +import java.lang.annotation.Documented; import java.nio.FloatBuffer; import java.nio.IntBuffer; import java.util.ArrayList; import java.util.List; import java.util.concurrent.BlockingQueue; +import org.apache.pdfbox.pdmodel.PDDocument; + import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.PageInformation; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.Statistics; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; @@ -35,11 +42,12 @@ import net.sourceforge.tess4j.TessAPI1; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageProcessingThread extends Thread { - BlockingQueue imageInputQueue; + BlockingQueue imageInputQueue; BlockingQueue imageOutputQueue; ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); Statistics stats; OcrServiceSettings settings; + PDDocument 
document; @SneakyThrows @@ -49,7 +57,7 @@ public class ImageProcessingThread extends Thread { // Interrupting signals that the image extraction has finished while (true) { try { - final ExtractedImage image = imageInputQueue.take(); + final UnprocessedImage image = imageInputQueue.take(); OcrImage extractedOcrImage = this.process(image); try { imageOutputQueue.put(extractedOcrImage); @@ -62,9 +70,8 @@ public class ImageProcessingThread extends Thread { break; } } - log.info("Leaving initial uninterrupted loop!"); // empty the queue - List remainingImages = new ArrayList<>(imageInputQueue.size()); + List remainingImages = new ArrayList<>(imageInputQueue.size()); imageInputQueue.drainTo(remainingImages); remainingImages.forEach(image -> { OcrImage ocrImage = this.process(image); @@ -79,21 +86,61 @@ public class ImageProcessingThread extends Thread { } - private OcrImage process(ExtractedImage extractedImage) { + private OcrImage process(UnprocessedImage unprocessedImage) { long timestamp = System.currentTimeMillis(); + + OcrImage ocrImage; + if (unprocessedImage instanceof ExtractedImage extractedImage) { + ocrImage = processExtractedImage(extractedImage); + } else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) { + ocrImage = processRenderedPageImageFile(renderedPageImageFile); + } else { + throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass())); + } + + stats.increaseImageProcessing(System.currentTimeMillis() - timestamp); + + return ocrImage; + } + + + private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) { + + Pix grayScale = ImageProcessingUtils.convertToGrayScale(renderedPageImageFile.asPix()); + Pix despeckled = ImageProcessingUtils.despecklePix(grayScale); + + int orientDegree = detectOrientation(despeckled, settings.getDpi(), detectionScriptHandle); + Pix rotatedPix = switch (360 - orientDegree) { + case 90 -> 
Leptonica1.pixRotateOrth(despeckled, 1); + case 180 -> Leptonica1.pixRotateOrth(despeckled, 2); + case 270 -> Leptonica1.pixRotateOrth(despeckled, 3); + default -> despeckled; + }; + + OcrImage ocrImage = new RenderedPageOcrImage(despeckled.h, + despeckled.w, + PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)), + rotatedPix, + orientDegree); + + if (despeckled != rotatedPix) { + LeptUtils.disposePix(despeckled); + } + return ocrImage; + } + + + private OcrImage processExtractedImage(ExtractedImage extractedImage) { + float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72)); Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi()); int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); - Pix rotatedPix = switch (360 - orientDegree) { - case 90 -> Leptonica1.pixRotateOrth(pix, 1); - case 180 -> Leptonica1.pixRotateOrth(pix, 2); - case 270 -> Leptonica1.pixRotateOrth(pix, 3); - default -> pix; - }; - OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(), + Pix rotatedPix = getRotatedPix(orientDegree, pix); + + OcrImage ocrImage = new ExtractedOcrImage(extractedImage.getPageNumber(), extractedImage.getNumberOnPage(), extractedImage.getHeight(), extractedImage.getWidth(), @@ -106,10 +153,18 @@ public class ImageProcessingThread extends Thread { if (pix != rotatedPix) { LeptUtils.disposePix(pix); } + return ocrImage; + } - stats.increaseImageProcessing(System.currentTimeMillis() - timestamp); - return extractedOcrImage; + private static Pix getRotatedPix(int orientDegree, Pix pix) { + + return switch (360 - orientDegree) { + case 90 -> Leptonica1.pixRotateOrth(pix, 1); + case 180 -> Leptonica1.pixRotateOrth(pix, 2); + case 270 -> Leptonica1.pixRotateOrth(pix, 3); + default -> pix; + }; } diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java deleted file mode 100644 index b068dd0..0000000 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.knecon.fforesight.service.ocr.processor.service.threads; - -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.InputStreamReader; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@AllArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class ProcessIOLogger extends Thread { - - // If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock. - // Since both need to read simultaneously we need to implement the readers as separate threads. 
- - InputStream is; - String processName; - Type type; - - - @SneakyThrows - public void run() { - - try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) { - - String line; - while (true) { - line = br.readLine(); - - if (line == null) { - break; - } - - if (type.equals(Type.ERROR)) { - log.error(processName + "_" + type.name() + ">" + line); - } else { - log.debug(processName + "_" + type.name() + ">" + line); - } - } - } - is.close(); - } - - - public enum Type { - ERROR, - STD_OUT - } - -} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 3185982..d8e3665 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -12,14 +12,14 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class OcrServiceSettings { - int ocrThreadCount = 16; // Number of OCR threads + int ocrThreadCount = 4; // Number of OCR threads int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 2; // Number of Ghostscript processes + int gsProcessCount = 1; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed int minImageWidth = 20; // Minimum width for images to be processed - boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes + boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes 
boolean removeWatermark; // If true, watermarks will be removed String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR"); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index d41752d..118afba 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -88,10 +88,12 @@ public class ImageProcessingUtils { Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); LeptUtils.disposePix(pix); return grayScale; - } else { + } else if (pix.d == 1) { Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); LeptUtils.disposePix(pix); return grayScale; + } else { + throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d)); } } diff --git a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java index cc5f064..4388da9 100644 --- a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java +++ b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java @@ -1,10 +1,7 @@ package com.knecon.fforesight.service.ocr.processor.utils; import java.awt.image.BufferedImage; -import java.io.BufferedReader; import java.io.File; -import 
java.io.InputStream; -import java.io.InputStreamReader; import java.util.LinkedList; import java.util.List; import java.util.stream.IntStream; @@ -19,7 +16,7 @@ import org.springframework.core.io.ClassPathResource; import org.springframework.util.FileSystemUtils; import com.knecon.fforesight.service.ocr.processor.service.OsUtils; -import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger; +import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler; import lombok.SneakyThrows; @@ -50,29 +47,6 @@ public class Pdf2ImgTest { } - @Test - @SneakyThrows - public void testGhostScript() { - - String outputDir = "/tmp/ghostscript_out/"; - new File(outputDir).mkdirs(); - ClassPathResource resource = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf"); - - String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", resource.getFile().toString(), "-c", "quit"}; - Process p = Runtime.getRuntime().exec(cmdArgs); - ProcessIOLogger logger = new ProcessIOLogger(p.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT); - logger.start(); - ProcessIOLogger errorLogger = new ProcessIOLogger(p.getErrorStream(), "GS", ProcessIOLogger.Type.STD_OUT); - errorLogger.start(); - int exitcode = p.waitFor(); - logger.join(); - errorLogger.join(); - System.out.println("Ghostscript finished with exit code " + exitcode); - FileSystemUtils.deleteRecursively(new File(outputDir)); - - } - - @Test @SneakyThrows public void testGhostScriptParallel() { diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 5b5204a..62146b0 100644 --- 
a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/2009-1048395_50pages_tables.pdf"); + String text = testOCR("files/StitchedImagesMultiPage.pdf"); }