diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java new file mode 100644 index 0000000..c8d47f2 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedImage.java @@ -0,0 +1,36 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import java.awt.geom.Rectangle2D; +import java.awt.image.BufferedImage; + +import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace; +import org.apache.pdfbox.util.Matrix; + +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; + +public record ExtractedImage( + int pageNumber, QuadPoint position, int height, int width, BufferedImage image, Matrix ctm, int numberOnPage, PDColorSpace colorSpace) implements UnprocessedImage { + + @SneakyThrows + public Pix asPix() { + + BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this); + ImageProcessingUtils.setAlphaChannelToWhite(image); + return LeptUtils.convertImageToPix(image); + } + + + public QuadPoint getImageCoordinatesInInitialUserSpace() { + + return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform()); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java index 74f792d..c6abfad 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/ExtractedOcrImage.java @@ -1,20 +1,15 @@ package com.knecon.fforesight.service.ocr.processor.model; -import java.awt.AlphaComposite; -import java.awt.Color; -import java.awt.Graphics2D; -import java.awt.Transparency; +import java.awt.Graphics; import java.awt.geom.AffineTransform; import java.awt.image.BufferedImage; -import java.io.IOException; -import java.nio.IntBuffer; -import java.util.concurrent.Semaphore; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; -import com.pdftron.sdf.Obj; import lombok.AccessLevel; import lombok.Getter; @@ -23,58 +18,26 @@ import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; -import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; -import net.sourceforge.lept4j.util.LeptUtils; import net.sourceforge.tess4j.ITessAPI; @Slf4j @Getter @RequiredArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ExtractedOcrImage implements OcrImage { - final int pageNumber; - final Pix pix; - final int originalHeight; - final int originalWidth; - final int height; - final int width; - final Matrix ctm; - final int numberOnPage; - - @Setter + int pageNumber; + int numberOnPage; + int originalHeight; + int originalWidth; + Matrix ctm; + Pix pix; + int height; + int width; int rotationDegrees; - @SneakyThrows - public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) { - - this.pageNumber = pageNumber; - this.numberOnPage = numberOnPage; - this.ctm = ctm; - this.originalHeight = bufferedImage.getHeight(); - this.originalWidth = bufferedImage.getWidth(); - float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72)); - this.pix = binarize(bufferedImage, imageDPI, targetDpi); - this.height = pix.h; - this.width = pix.w; - } - - - @SneakyThrows - private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) { - - ImageProcessingUtils.setAlphaChannelToWhite(image); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script. - Pix grayScale = ImageProcessingUtils.convertToGrayScale(image); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); - } - } - - @Override public AffineTransform getImageCTM() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java index 3afb0a8..86cfd6a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java @@ -2,10 +2,12 @@ package com.knecon.fforesight.service.ocr.processor.model; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; +import java.awt.image.BufferedImage; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator; +import lombok.SneakyThrows; import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.util.LeptUtils; @@ -62,6 +64,13 @@ public interface OcrImage { } + @SneakyThrows + default BufferedImage getBufferedImage() { + + return LeptUtils.convertPixToImage(getPix()); + } + + /** * Retrieves the rotation degree of the OCR image. * @@ -78,16 +87,6 @@ public interface OcrImage { int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance - /** - * Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image. - * Currently only quadrant rotations are supported. - * Rotated partial images work, due to the CTM present in the pdf working with any rotation. - * - * @param rotationDegree The rotation degree of the OCR image. - */ - void setRotationDegrees(int rotationDegree); - - /** * Retrieves the buffered image associated with the OCR image. * @@ -96,24 +95,6 @@ public interface OcrImage { Pix getPix(); - /** - * Retrieves the rotated image of the OCR image. - * - * @return The rotated BufferedImage object of the OCR image. - */ - default Pix getRotatedPix() { - - synchronized (OCRThread.class) { - return switch (360 - getRotationDegrees()) { - case 90 -> Leptonica1.pixRotateOrth(getPix(), 1); - case 180 -> Leptonica1.pixRotateOrth(getPix(), 2); - case 270 -> Leptonica1.pixRotateOrth(getPix(), 3); - default -> getPix(); - }; - } - } - - default int getDpi() { return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth()); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java new file mode 100644 index 0000000..4935eda --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java @@ -0,0 +1,12 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import org.apache.pdfbox.pdmodel.PDPage; + +public record PageInformation(int height, int width, int number, int rotationDegrees) { + + public static PageInformation fromPDPage(int pageNum, PDPage page) { + + return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation()); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java index 53fc7b6..c40aa1d 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java @@ -97,4 +97,10 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) { d().getY()); } + + public double size() { + + return a().distance(b()) * a().distance(d()); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java index 2b773fe..4bb78fb 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageImageFile.java @@ -1,5 +1,14 @@ package com.knecon.fforesight.service.ocr.processor.model; -public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) { +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; + +public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage { + + @Override + public Pix asPix() { + + return Leptonica1.pixRead(absoluteFilePath); + } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java index 42abff4..1141eb5 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java @@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDPage; import lombok.AccessLevel; import lombok.Getter; +import lombok.RequiredArgsConstructor; import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; @@ -16,29 +17,17 @@ import net.sourceforge.lept4j.Pix; import net.sourceforge.tess4j.ITessAPI; @Getter -@FieldDefaults(level = AccessLevel.PRIVATE) +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class RenderedPageOcrImage implements OcrImage { - final String absoluteImagePath; - final int height; - final int width; - final PageInformation pageInformation; - final Pix pix; - @Setter + int height; + int width; + PageInformation pageInformation; + Pix pix; int rotationDegrees; - @SneakyThrows - public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) { - - this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)); - this.absoluteImagePath = renderedPageImageFile.absoluteFilePath(); - this.pix = Leptonica1.pixRead(absoluteImagePath); - this.height = getPix().h; - this.width = getPix().w; - } - - @Override public int getOptimalPageSegmentationMode() { @@ -107,7 +96,7 @@ public class RenderedPageOcrImage implements OcrImage { // PDFBox always returns page height and width based on rotation double pageWidth; - if (pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) { + if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) { pageWidth = pageInformation.height(); } else { pageWidth = pageInformation.width(); @@ -116,14 +105,4 @@ public class RenderedPageOcrImage implements OcrImage { return pageWidth / width; } - - private record PageInformation(int height, int width, int number, int rotationDegrees) { - - public static PageInformation fromPDPage(int pageNum, PDPage page) { - - return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation()); - } - - } - } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java new file mode 100644 index 0000000..6facc56 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/UnprocessedImage.java @@ -0,0 +1,9 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import net.sourceforge.lept4j.Pix; + +public interface UnprocessedImage { + + Pix asPix(); + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java index 18d3568..a767f91 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java @@ -4,18 +4,26 @@ import java.io.InputStream; import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.LinkedTransferQueue; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; +import com.azure.core.implementation.GeoObjectHelper; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.PageInformation; import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage; -import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; +import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller; +import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils; @@ -24,6 +32,7 @@ import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; +import net.sourceforge.lept4j.Pix; @Slf4j @Service @@ -42,17 +51,19 @@ public class GhostScriptService { String documentAbsolutePath, Path tmpImageDir, PDDocument document, - BlockingQueue imageOutputQueue, + BlockingQueue imageProcessingQueue, Statistics stats) { + BlockingQueue imageFileCollectorQueue = new LinkedBlockingDeque<>(); + Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue); + asyncTransferThread.start(); int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size()); List> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers, numOfProcesses, - 2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads + 256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) { long timestamp = System.currentTimeMillis(); - List renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>()); List processInfos = processInfoBatches.get(batchIdx); log.info("Batch {}: Running {} gs processes with ({}) pages each", @@ -63,9 +74,9 @@ public class GhostScriptService { int finalBatchIdx = batchIdx; List processes = processInfos.stream() .parallel() - .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles)) - .peek(s -> log.debug(String.join(" ", s))) - .map(this::executeProcess) + .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath)) + .peek(s -> log.debug(String.join(" ", s.cmdArgs()))) + .map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue)) .toList(); List processExitCodes = new LinkedList<>(); @@ -73,14 +84,9 @@ public class GhostScriptService { processExitCodes.add(process.waitFor()); } stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp); - log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx); - for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) { - OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document); - imageOutputQueue.put(image); - } - } + asyncTransferThread.interrupt(); } @@ -107,20 +113,28 @@ public class GhostScriptService { @SneakyThrows - private String[] buildCmdArgs(Integer processIdx, - Integer batchIdx, - List stitchedImagePageIndices, - Path outputDir, - String documentAbsolutePath, - List fullPageImages) { + private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx, + Integer batchIdx, + List stitchedImagePageIndices, + Path outputDir, + String documentAbsolutePath) { String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString(); + Map fullPageImages = new HashMap<>(); for (int i = 0; i < stitchedImagePageIndices.size(); i++) { Integer pageNumber = stitchedImagePageIndices.get(i); - fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1))); + fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1))); } + String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat); + + return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages); + } + + + private String[] buildCmdArgs(List stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) { + StringBuilder sPageList = new StringBuilder(); int i = 1; for (Integer integer : stitchedImagePageIndices) { @@ -131,18 +145,19 @@ public class GhostScriptService { i++; } - return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"}; + String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"}; + return cmdArgs; } @SneakyThrows - private Process executeProcess(String[] cmdArgs) { + private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue imageFileCollectorQueue) { - Process p = Runtime.getRuntime().exec(cmdArgs); + Process p = Runtime.getRuntime().exec(processInfo.cmdArgs()); InputStream stdOut = p.getInputStream(); - ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT); + GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue); InputStream stdError = p.getErrorStream(); - ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR); + GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError); stdOutLogger.start(); stdErrorLogger.start(); @@ -150,6 +165,10 @@ public class GhostScriptService { } + private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map renderedPageImageFiles) { + + } + private record ProcessInfo(Integer processIdx, List stitchedPageNumbers) { } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java index 9b6d2d5..0c64115 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/ImageStreamEngine.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.ocr.processor.service; -import java.awt.Graphics; -import java.awt.image.BufferedImage; +import java.awt.geom.Rectangle2D; import java.io.IOException; import java.util.LinkedList; import java.util.List; @@ -18,13 +17,12 @@ import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.util.Matrix; -import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import lombok.Getter; @@ -33,8 +31,7 @@ import lombok.SneakyThrows; @Getter public class ImageStreamEngine extends PDFStreamEngine { - private ExtractedOcrImage currentImageOnPage; - private List imagesOnCurrentPage; + private List imagesOnCurrentPage; private OcrServiceSettings settings; private int pageNum; @@ -69,22 +66,15 @@ public class ImageStreamEngine extends PDFStreamEngine { } Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix(); - if (imageXObject.getColorSpace() instanceof PDDeviceRGB) { - BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } else if (imageXObject.getColorSpace() instanceof PDDeviceGray) { - BufferedImage image = imageXObject.getImage(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } else { - BufferedImage pdfImage = imageXObject.getImage(); - BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); - Graphics g = image.getGraphics(); - g.drawImage(pdfImage, 0, 0, null); - g.dispose(); - this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi()); - } - this.imagesOnCurrentPage.add(this.currentImageOnPage); - //imagesOnPages.add(this.currentImageOnPage); + this.imagesOnCurrentPage.add(new ExtractedImage(pageNum, + QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())), + imageXObject.getHeight(), + imageXObject.getWidth(), + imageXObject.getImage(), + imageCTM, + imagesOnCurrentPage.size(), + imageXObject.getColorSpace())); + } else if (xobject instanceof PDFormXObject) { PDFormXObject form = (PDFormXObject) xobject; showForm(form); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index 3ed6193..54b8306 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -107,7 +107,7 @@ public class OCRService { int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages()); stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads); - BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads); + BlockingQueue ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads)); OcrImageFactory ocrImageFactory = new OcrImageFactory(document, documentFile, @@ -128,7 +128,7 @@ public class OCRService { .toList(); log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size()); ocrImageFactory.join(); - log.info("Extracted all images, interrupting ocr threads"); + log.info("Processed all images, interrupting ocr threads"); ocrThreads.forEach(Thread::interrupt); for (OCRThread ocrThread : ocrThreads) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java index 3ff4683..2e913e2 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrImageFactory.java @@ -6,13 +6,17 @@ import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread; +import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils; @@ -29,6 +33,8 @@ public class OcrImageFactory { File documentFile; Path tmpImageDir; GhostScriptService ghostScriptService; + BlockingQueue imageProcessingQueue; + ImageProcessingThread imageProcessingThread; BlockingQueue imageOutputQueue; List imageExtractionThreads; List stitchedPageNumbers; @@ -40,7 +46,7 @@ public class OcrImageFactory { Path tmpImageDir, int numberOfThreads, GhostScriptService ghostScriptService, - BlockingQueue imageOutputQueue, + BlockingQueue imageOcrQueue, OcrProgressLogger logger, OcrServiceSettings settings, Statistics stats) { @@ -49,7 +55,8 @@ public class OcrImageFactory { this.documentFile = documentFile; this.tmpImageDir = tmpImageDir; this.ghostScriptService = ghostScriptService; - this.imageOutputQueue = imageOutputQueue; + this.imageOutputQueue = imageOcrQueue; + this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity()); this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>()); this.stats = stats; @@ -57,8 +64,10 @@ public class OcrImageFactory { List> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads); for (int i = 0; i < balancedPageNumbers.size(); i++) { - imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers)); + imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers)); } + this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document); + log.info("Started {} image extraction threads, with ({}) pages each", imageExtractionThreads.size(), imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", "))); @@ -70,6 +79,8 @@ public class OcrImageFactory { for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) { imageExtractionThread.start(); } + imageProcessingThread.start(); + } @@ -79,11 +90,15 @@ public class OcrImageFactory { for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) { imageExtractionThread.join(); } - if (stitchedPageNumbers.isEmpty()) { - return; - } - ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats); + if (!stitchedPageNumbers.isEmpty()) { + ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats); + } + imageProcessingThread.interrupt(); + log.info("All images extracted, interrupting processing thread."); + + imageProcessingThread.join(); + } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java index 73fe284..97d44e3 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java @@ -15,6 +15,7 @@ public class Statistics { List tesseractDuration; AtomicLong pdf2ImgDuration; AtomicLong writingTextDuration; + AtomicLong imageProcessingDuration; public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) { @@ -23,6 +24,7 @@ public class Statistics { this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L))); this.pdf2ImgDuration = new AtomicLong(0); this.writingTextDuration = new AtomicLong(0); + this.imageProcessingDuration = new AtomicLong(0); } @@ -32,6 +34,12 @@ public class Statistics { } + public void increaseImageProcessing(long duration) { + + imageProcessingDuration.addAndGet(duration); + } + + public void increaseTesseractDuration(int threadId, long duration) { tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration); @@ -53,13 +61,15 @@ public class Statistics { @Override public String toString() { - return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s", + return String.format( + "imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s", ((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000), ((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), + (float) imageProcessingDuration.get() / 1000, (float) pdf2ImgDuration.get() / 1000, (float) writingTextDuration.get() / 1000); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java new file mode 100644 index 0000000..554e190 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java @@ -0,0 +1,61 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; + +import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +import net.sourceforge.tess4j.TessAPI1; + + +/* +This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously + */ +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class BlockingQueueFiller extends Thread { + + BlockingQueue imageInputQueue; + BlockingQueue imageOutputQueue; + + @SneakyThrows + @Override + public void run() { + + // Interrupting signals that the image extraction has finished + while (true) { + try { + final UnprocessedImage image = imageInputQueue.take(); + try { + imageOutputQueue.put(image); + } catch (InterruptedException e) { + imageOutputQueue.put(image); + break; + } + + } catch (InterruptedException e) { + break; + } + } + // empty the queue + List remainingImages = new ArrayList<>(imageInputQueue.size()); + imageInputQueue.drainTo(remainingImages); + remainingImages.forEach(image -> { + try { + imageOutputQueue.put(image); + } catch (InterruptedException e) { + log.error(e.getMessage()); + } + }); + + } +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java new file mode 100644 index 0000000..0dd0c60 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/GhostScriptOutputHandler.java @@ -0,0 +1,122 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import java.io.BufferedReader; +import java.io.File; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.BlockingQueue; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class GhostScriptOutputHandler extends Thread { + + static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)"); + + // If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock. + // Since both need to read simultaneously we need to implement the readers as separate threads. + + final InputStream is; + final String processName; + final Type type; + + final Map pagesToProcess; + final BlockingQueue renderedPageImageFileOutput; + + int currentPageNumber; + + + public static GhostScriptOutputHandler errorHandler(InputStream is) { + + return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null); + } + + + public static GhostScriptOutputHandler stdOut(InputStream is, + Map pagesToProcess, + BlockingQueue renderedPageImageFileOutput) { + + return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput); + } + + + @SneakyThrows + public void run() { + + try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) { + + String line; + while (true) { + line = br.readLine(); + + if (line == null) { + break; + } + + if (type.equals(Type.ERROR)) { + log.error(processName + "_" + type.name() + ">" + line); + } else { + log.debug(processName + "_" + type.name() + ">" + line); + addProcessedImageToQueue(line); + } + } + } + is.close(); + if (type.equals(Type.STD_OUT)) { + queueFinishedPage(currentPageNumber); + } + + } + + + private void addProcessedImageToQueue(String line) { + + /* + Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in. + */ + Matcher pageNumberMatcher = pageFinishedPattern.matcher(line); + if (pageNumberMatcher.find()) { + int pageNumber = Integer.parseInt(pageNumberMatcher.group(1)); + + if (currentPageNumber == 0) { + currentPageNumber = pageNumber; + return; + } + + queueFinishedPage(currentPageNumber); + currentPageNumber = pageNumber; + } + } + + + private void queueFinishedPage(int pageNumber) { + + var imageFile = this.pagesToProcess.get(pageNumber); + if (imageFile == null) { + throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet())); + } + assert new File(imageFile.absoluteFilePath()).isFile(); + renderedPageImageFileOutput.add(imageFile); + } + + + public enum Type { + ERROR, + STD_OUT + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java index 3b29836..d0340ca 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageExtractionThread.java @@ -5,12 +5,11 @@ import java.util.List; import java.util.concurrent.BlockingQueue; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; -import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; -import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; @@ -26,6 +25,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ImageExtractionThread extends Thread { + static double FULL_PAGE_IMAGE_THRESHOLD = 0.99; static double IMAGE_ALIGNMENT_THRESHOLD = 1; int id; @@ -37,9 +37,10 @@ public class ImageExtractionThread extends Thread { OcrServiceSettings settings; // output is written to these lists - BlockingQueue imageOutputQueue; + BlockingQueue imageProcessingQueue; List stitchedPageNumbers; + @SneakyThrows @Override public void run() { @@ -48,28 +49,28 @@ public class ImageExtractionThread extends Thread { for (Integer pageIndex : pageIndices) { try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low. timestamp = System.currentTimeMillis(); - List extractedOcrImages = getExtractedOcrImages(pageIndex, document); + List extractedImages = getExtractedImages(pageIndex, document); stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp); - if (extractedOcrImages.isEmpty()) { + if (extractedImages.isEmpty()) { logger.logPageSkipped(pageIndex); } - if (checkForStitchedImages(extractedOcrImages)) { + if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) { stitchedPageNumbers.add(pageIndex); logger.addImagesToProcess(pageIndex, 0); continue; } - for (ExtractedOcrImage image : extractedOcrImages) { - imageOutputQueue.put(image); - logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage()); + for (ExtractedImage image : extractedImages) { + imageProcessingQueue.put((UnprocessedImage) image); + logger.addImagesToProcess(image.pageNumber(), image.numberOnPage()); } } } } - private List getExtractedOcrImages(Integer pageIndex, PDDocument document) { + private List getExtractedImages(Integer pageIndex, PDDocument document) { PDPage page = document.getPage(pageIndex - 1); ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings); @@ -79,22 +80,22 @@ public class ImageExtractionThread extends Thread { @SneakyThrows - private boolean checkForStitchedImages(List imagesOnCurrentPage) { + private boolean checkForFullPageOrStitchedImages(List imagesOnCurrentPage, PDPage page) { - if (imagesOnCurrentPage.size() <= 1) { + if (imagesOnCurrentPage.isEmpty()) { return false; } - //checking for intersections or direct alignment of images - ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()]; - int index = 0; - for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) { - imageOnPagesArray[index] = imageOnPage; - index++; + for (ExtractedImage imageOnPage : imagesOnCurrentPage) { + if (imageOnPage.width() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.height() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) { + return true; + } } - for (int j = 0; j < imageOnPagesArray.length; j++) { - for (int i = j + 1; i < imageOnPagesArray.length; i++) { - if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) { + + //checking for intersections or direct alignment of images + for (int j = 0; j < imagesOnCurrentPage.size(); j++) { + for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) { + if (imagesOnCurrentPage.get(j).getImageCoordinatesInInitialUserSpace().aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) { // TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely return true; } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java new file mode 100644 index 0000000..5dfe1eb --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -0,0 +1,205 @@ +package com.knecon.fforesight.service.ocr.processor.service.threads; + +import static net.sourceforge.tess4j.ITessAPI.TRUE; + +import java.nio.FloatBuffer; +import java.nio.IntBuffer; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.BlockingQueue; + +import org.apache.pdfbox.pdmodel.PDDocument; + +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.PageInformation; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; +import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage; +import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; +import com.knecon.fforesight.service.ocr.processor.service.Statistics; +import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; +import com.sun.jna.ptr.PointerByReference; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; +import net.sourceforge.tess4j.ITessAPI; +import net.sourceforge.tess4j.TessAPI1; + +/* + * This thread does all the image processing. There should only be one, since Leptonica is not thread safe. + */ +@Slf4j +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ImageProcessingThread extends Thread { + + BlockingQueue imageInputQueue; + BlockingQueue imageOutputQueue; + ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); + Statistics stats; + OcrServiceSettings settings; + PDDocument document; + + + @SneakyThrows + @Override + public void run() { + + // Interrupting signals that the image extraction has finished + while (true) { + try { + final UnprocessedImage image = imageInputQueue.take(); + OcrImage extractedOcrImage = this.process(image); + try { + imageOutputQueue.put(extractedOcrImage); + } catch (InterruptedException e) { + imageOutputQueue.put(extractedOcrImage); + break; + } + + } catch (InterruptedException e) { + break; + } + } + // empty the queue + List remainingImages = new ArrayList<>(imageInputQueue.size()); + imageInputQueue.drainTo(remainingImages); + remainingImages.forEach(image -> { + OcrImage ocrImage = this.process(image); + try { + imageOutputQueue.put(ocrImage); + } catch (InterruptedException e) { + log.error(e.getMessage()); + } + }); + + TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); + } + + + private OcrImage process(UnprocessedImage unprocessedImage) { + + long timestamp = System.currentTimeMillis(); + + OcrImage ocrImage; + if (unprocessedImage instanceof ExtractedImage extractedImage) { + ocrImage = processExtractedImage(extractedImage); + } else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) { + ocrImage = processRenderedPageImageFile(renderedPageImageFile); + } else { + throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass())); + } + + stats.increaseImageProcessing(System.currentTimeMillis() - timestamp); + + return ocrImage; + } + + + private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) { + + Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi()); + + int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); + Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix); + + OcrImage ocrImage = new RenderedPageOcrImage(pix.h, + pix.w, + PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)), + rotatedPix, + orientDegree); + + if (pix != rotatedPix) { + LeptUtils.disposePix(pix); + } + + return ocrImage; + } + + + private OcrImage processExtractedImage(ExtractedImage extractedImage) { + + float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72)); + + Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi()); + + int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); + Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix); + + OcrImage ocrImage = new ExtractedOcrImage(extractedImage.pageNumber(), + extractedImage.numberOnPage(), + extractedImage.height(), + extractedImage.width(), + extractedImage.ctm(), + rotatedPix, + pix.h, + pix.w, + orientDegree); + + if (pix != rotatedPix) { + LeptUtils.disposePix(pix); + } + return ocrImage; + } + + + + + static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) { + + TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix); + TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi); + + IntBuffer orientationDegreeResultBuffer; + FloatBuffer orientationDegreeConfidenceBuffer; + PointerByReference scriptureNameBuffer; + FloatBuffer scriptureConfidenceBuffer; + + orientationDegreeResultBuffer = IntBuffer.allocate(1); + orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); + scriptureNameBuffer = new PointerByReference(); + scriptureConfidenceBuffer = FloatBuffer.allocate(1); + + int orientationDegree = 0; + int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, + orientationDegreeResultBuffer, + orientationDegreeConfidenceBuffer, + scriptureNameBuffer, + scriptureConfidenceBuffer); + if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) { + orientationDegree = orientationDegreeResultBuffer.get(); + } + + TessAPI1.TessBaseAPIClear(detectionScriptHandle); + + return orientationDegree; + } + + + @SneakyThrows + private Pix binarize(Pix pix, float imageDpi, int targetDpi) { + + Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix); + Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); + return ImageProcessingUtils.despecklePix(scaledUp); + + } + + + private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { + + ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); + String datapath = System.getenv("TESSDATA_PREFIX"); + TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); + + return handle; + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index ad567ef..9c1a0a7 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -1,6 +1,10 @@ package com.knecon.fforesight.service.ocr.processor.service.threads; import static net.sourceforge.tess4j.ITessAPI.TRUE; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode; +import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable; import java.io.File; import java.nio.FloatBuffer; @@ -16,6 +20,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult; import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger; import com.knecon.fforesight.service.ocr.processor.service.Statistics; import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2; +import com.sun.jna.StringArray; import com.sun.jna.ptr.PointerByReference; import lombok.AccessLevel; @@ -43,7 +48,6 @@ public class OCRThread extends Thread { Statistics stats; OcrServiceSettings settings; Tesseract2 instance; - ITessAPI.TessBaseAPI detectionScriptHandle; public OCRThread(int id, @@ -62,7 +66,6 @@ public class OCRThread extends Thread { this.stats = stats; this.settings = settings; this.instance = createInstance(settings); - this.detectionScriptHandle = initDetectionScriptHandle(); } @@ -87,10 +90,9 @@ public class OCRThread extends Thread { this.process(image); } } catch (NoSuchElementException e) { - log.debug("Processed all Images, finishing."); + log.debug("Executed tesseract on all Images, finishing."); } - TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); } @@ -102,15 +104,8 @@ public class OCRThread extends Thread { int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride(); - int orientDegree = detectOrientation(image); - image.setRotationDegrees(orientDegree); - Pix rotatedPix = image.getRotatedPix(); - executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName); - - synchronized (OCRThread.class) { - image.destroyPix(); - LeptUtils.disposePix(rotatedPix); - } + executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName); + image.destroyPix(); results.add(OcrResult.create(image, tesseractOutputFileName)); logger.logImageFinished(image, psm); @@ -118,51 +113,6 @@ public class OCRThread extends Thread { } - public int detectOrientation(OcrImage image) { - - IntBuffer orientationDegreeResultBuffer; - FloatBuffer orientationDegreeConfidenceBuffer; - PointerByReference scriptureNameBuffer; - FloatBuffer scriptureConfidenceBuffer; - - TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix()); - TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi()); - - synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization. - orientationDegreeResultBuffer = IntBuffer.allocate(1); - orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); - scriptureNameBuffer = new PointerByReference(); - scriptureConfidenceBuffer = FloatBuffer.allocate(1); - } - - int orient_deg = 0; - int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, - orientationDegreeResultBuffer, - orientationDegreeConfidenceBuffer, - scriptureNameBuffer, - scriptureConfidenceBuffer); - if (result == TRUE) { - orient_deg = orientationDegreeResultBuffer.get(); - } - - synchronized (OCRThread.class) { - TessAPI1.TessBaseAPIClear(detectionScriptHandle); - } - - return orient_deg; - } - - - synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { - - ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); - String datapath = System.getenv("TESSDATA_PREFIX"); - TessAPI1.TessBaseAPIInit3(handle, datapath, "osd"); - - return handle; - } - - @SneakyThrows public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java deleted file mode 100644 index b068dd0..0000000 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ProcessIOLogger.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.knecon.fforesight.service.ocr.processor.service.threads; - -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.InputStreamReader; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@AllArgsConstructor -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class ProcessIOLogger extends Thread { - - // If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock. - // Since both need to read simultaneously we need to implement the readers as separate threads. - - InputStream is; - String processName; - Type type; - - - @SneakyThrows - public void run() { - - try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) { - - String line; - while (true) { - line = br.readLine(); - - if (line == null) { - break; - } - - if (type.equals(Type.ERROR)) { - log.error(processName + "_" + type.name() + ">" + line); - } else { - log.debug(processName + "_" + type.name() + ">" + line); - } - } - } - is.close(); - } - - - public enum Type { - ERROR, - STD_OUT - } - -} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 0592808..d8e3665 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -14,7 +14,7 @@ public class OcrServiceSettings { int ocrThreadCount = 4; // Number of OCR threads int imageExtractThreadCount = 2; // Number of image extraction threads - int gsProcessCount = 2; // Number of Ghostscript processes + int gsProcessCount = 1; // Number of Ghostscript processes int dpi = 300; // Target DPI for binarized images int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index ffa9f74..cd8d7f8 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -2,10 +2,15 @@ package com.knecon.fforesight.service.ocr.processor.utils; import java.awt.AlphaComposite; import java.awt.Color; +import java.awt.Graphics; import java.awt.Graphics2D; import java.awt.Transparency; import java.awt.image.BufferedImage; -import java.io.IOException; + +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; + +import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; import lombok.SneakyThrows; import lombok.experimental.UtilityClass; @@ -16,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils; @UtilityClass public class ImageProcessingUtils { + public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) { + + BufferedImage image; + if (extractedImage.colorSpace() instanceof PDDeviceRGB || extractedImage.colorSpace() instanceof PDDeviceGray) { + image = extractedImage.image(); + } else { + BufferedImage pdfImage = extractedImage.image(); + image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY); + Graphics g = image.getGraphics(); + g.drawImage(pdfImage, 0, 0, null); + g.dispose(); + } + return image; + } + + public static Pix despecklePix(Pix pix) { assert pix.d == 8; @@ -24,7 +45,9 @@ public class ImageProcessingUtils { // too small to properly despeckle, just binarize instead. despeckled = Leptonica1.pixThresholdToBinary(pix, 180); } else { - despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... + despeckled = LeptUtils.despeckle(pix, + LeptUtils.SEL_STR3, + 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... if (despeckled == null) { despeckled = Leptonica1.pixThresholdToBinary(pix, 180); } @@ -57,23 +80,35 @@ public class ImageProcessingUtils { @SneakyThrows - public static Pix convertToGrayScale(BufferedImage image) { + public static Pix convertToGrayScale(Pix pix) { - Pix pix = LeptUtils.convertImageToPix(image); if (pix.d == 8) { return pix; } else if (pix.d == 32) { Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); LeptUtils.disposePix(pix); return grayScale; - } else { + } else if (pix.d == 1) { Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); LeptUtils.disposePix(pix); return grayScale; + } else { + throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d)); } } + public Pix deRotatePix(int orientDegree, Pix pix) { + + return switch (360 - orientDegree) { + case 90 -> Leptonica1.pixRotateOrth(pix, 1); + case 180 -> Leptonica1.pixRotateOrth(pix, 2); + case 270 -> Leptonica1.pixRotateOrth(pix, 3); + default -> pix; + }; + } + + public static void setAlphaChannelToWhite(BufferedImage image) { if (image.getTransparency() == Transparency.TRANSLUCENT) { diff --git a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java new file mode 100644 index 0000000..62703cd --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtilsTest.java @@ -0,0 +1,36 @@ +package com.knecon.fforesight.service.ocr.processor.utils; + +import static net.sourceforge.lept4j.ILeptonica.IFF_PNG; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; + +@Disabled +class ImageProcessingUtilsTest { + + @BeforeEach + public void loadLeptonica() { + + System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB")); + } + + + @Test + public void testRotation() { + + Pix pix = Leptonica1.pixRead("/home/kschuettler/Downloads/painHarold.webp"); + Pix pix2 = ImageProcessingUtils.deRotatePix(0, pix); + Leptonica1.pixWrite("/tmp/0.png", pix2, IFF_PNG); + Pix pix3 = ImageProcessingUtils.deRotatePix(90, pix); + Leptonica1.pixWrite("/tmp/90.png", pix3, IFF_PNG); + Pix pix4 = ImageProcessingUtils.deRotatePix(180, pix); + Leptonica1.pixWrite("/tmp/180.png", pix4, IFF_PNG); + Pix pix5 = ImageProcessingUtils.deRotatePix(270, pix); + Leptonica1.pixWrite("/tmp/270.png", pix5, IFF_PNG); + } + +} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java index cc5f064..4388da9 100644 --- a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java +++ b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/utils/Pdf2ImgTest.java @@ -1,10 +1,7 @@ package com.knecon.fforesight.service.ocr.processor.utils; import java.awt.image.BufferedImage; -import java.io.BufferedReader; import java.io.File; -import java.io.InputStream; -import java.io.InputStreamReader; import java.util.LinkedList; import java.util.List; import java.util.stream.IntStream; @@ -19,7 +16,7 @@ import org.springframework.core.io.ClassPathResource; import org.springframework.util.FileSystemUtils; import com.knecon.fforesight.service.ocr.processor.service.OsUtils; -import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger; +import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler; import lombok.SneakyThrows; @@ -50,29 +47,6 @@ public class Pdf2ImgTest { } - @Test - @SneakyThrows - public void testGhostScript() { - - String outputDir = "/tmp/ghostscript_out/"; - new File(outputDir).mkdirs(); - ClassPathResource resource = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf"); - - String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", resource.getFile().toString(), "-c", "quit"}; - Process p = Runtime.getRuntime().exec(cmdArgs); - ProcessIOLogger logger = new ProcessIOLogger(p.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT); - logger.start(); - ProcessIOLogger errorLogger = new ProcessIOLogger(p.getErrorStream(), "GS", ProcessIOLogger.Type.STD_OUT); - errorLogger.start(); - int exitcode = p.waitFor(); - logger.join(); - errorLogger.join(); - System.out.println("Ghostscript finished with exit code " + exitcode); - FileSystemUtils.deleteRecursively(new File(outputDir)); - - } - - @Test @SneakyThrows public void testGhostScriptParallel() {