Merge branch 'RED-7668' into 'master'
RED-7669: optimize OCR-module performance Closes RED-7668 See merge request redactmanager/ocr-service!22
This commit is contained in:
commit
a50f54676e
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
public record ExtractedImage(
|
||||
int pageNumber, QuadPoint position, int height, int width, BufferedImage image, Matrix ctm, int numberOnPage, PDColorSpace colorSpace) implements UnprocessedImage {
|
||||
|
||||
@SneakyThrows
|
||||
public Pix asPix() {
|
||||
|
||||
BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this);
|
||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||
return LeptUtils.convertImageToPix(image);
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getImageCoordinatesInInitialUserSpace() {
|
||||
|
||||
return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,20 +1,15 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.AlphaComposite;
|
||||
import java.awt.Color;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.Graphics;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
@ -23,58 +18,26 @@ import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
@Slf4j
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ExtractedOcrImage implements OcrImage {
|
||||
|
||||
final int pageNumber;
|
||||
final Pix pix;
|
||||
final int originalHeight;
|
||||
final int originalWidth;
|
||||
final int height;
|
||||
final int width;
|
||||
final Matrix ctm;
|
||||
final int numberOnPage;
|
||||
|
||||
@Setter
|
||||
int pageNumber;
|
||||
int numberOnPage;
|
||||
int originalHeight;
|
||||
int originalWidth;
|
||||
Matrix ctm;
|
||||
Pix pix;
|
||||
int height;
|
||||
int width;
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) {
|
||||
|
||||
this.pageNumber = pageNumber;
|
||||
this.numberOnPage = numberOnPage;
|
||||
this.ctm = ctm;
|
||||
this.originalHeight = bufferedImage.getHeight();
|
||||
this.originalWidth = bufferedImage.getWidth();
|
||||
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
|
||||
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
|
||||
this.height = pix.h;
|
||||
this.width = pix.w;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
|
||||
|
||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||
|
||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
|
||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
|
||||
@ -2,10 +2,12 @@ package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@ -62,6 +64,13 @@ public interface OcrImage {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default BufferedImage getBufferedImage() {
|
||||
|
||||
return LeptUtils.convertPixToImage(getPix());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotation degree of the OCR image.
|
||||
*
|
||||
@ -78,16 +87,6 @@ public interface OcrImage {
|
||||
int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
||||
|
||||
|
||||
/**
|
||||
* Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image.
|
||||
* Currently only quadrant rotations are supported.
|
||||
* Rotated partial images work, due to the CTM present in the pdf working with any rotation.
|
||||
*
|
||||
* @param rotationDegree The rotation degree of the OCR image.
|
||||
*/
|
||||
void setRotationDegrees(int rotationDegree);
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the buffered image associated with the OCR image.
|
||||
*
|
||||
@ -96,24 +95,6 @@ public interface OcrImage {
|
||||
Pix getPix();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotated image of the OCR image.
|
||||
*
|
||||
* @return The rotated BufferedImage object of the OCR image.
|
||||
*/
|
||||
default Pix getRotatedPix() {
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
return switch (360 - getRotationDegrees()) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
|
||||
default -> getPix();
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
default int getDpi() {
|
||||
|
||||
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());
|
||||
|
||||
@ -0,0 +1,12 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
public record PageInformation(int height, int width, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
@ -97,4 +97,10 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
||||
d().getY());
|
||||
}
|
||||
|
||||
|
||||
public double size() {
|
||||
|
||||
return a().distance(b()) * a().distance(d());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,14 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) {
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage {
|
||||
|
||||
@Override
|
||||
public Pix asPix() {
|
||||
|
||||
return Leptonica1.pixRead(absoluteFilePath);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
@ -16,29 +17,17 @@ import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class RenderedPageOcrImage implements OcrImage {
|
||||
|
||||
final String absoluteImagePath;
|
||||
final int height;
|
||||
final int width;
|
||||
final PageInformation pageInformation;
|
||||
final Pix pix;
|
||||
@Setter
|
||||
int height;
|
||||
int width;
|
||||
PageInformation pageInformation;
|
||||
Pix pix;
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) {
|
||||
|
||||
this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1));
|
||||
this.absoluteImagePath = renderedPageImageFile.absoluteFilePath();
|
||||
this.pix = Leptonica1.pixRead(absoluteImagePath);
|
||||
this.height = getPix().h;
|
||||
this.width = getPix().w;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getOptimalPageSegmentationMode() {
|
||||
|
||||
@ -107,7 +96,7 @@ public class RenderedPageOcrImage implements OcrImage {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) {
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
@ -116,14 +105,4 @@ public class RenderedPageOcrImage implements OcrImage {
|
||||
return pageWidth / width;
|
||||
}
|
||||
|
||||
|
||||
private record PageInformation(int height, int width, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,9 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
public interface UnprocessedImage {
|
||||
|
||||
Pix asPix();
|
||||
|
||||
}
|
||||
@ -4,18 +4,26 @@ import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.LinkedTransferQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.azure.core.implementation.GeoObjectHelper;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
@ -24,6 +32,7 @@ import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -42,17 +51,19 @@ public class GhostScriptService {
|
||||
String documentAbsolutePath,
|
||||
Path tmpImageDir,
|
||||
PDDocument document,
|
||||
BlockingQueue<OcrImage> imageOutputQueue,
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue,
|
||||
Statistics stats) {
|
||||
|
||||
BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
|
||||
Thread asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
|
||||
asyncTransferThread.start();
|
||||
int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
|
||||
numOfProcesses,
|
||||
2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
|
||||
256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
|
||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||
long timestamp = System.currentTimeMillis();
|
||||
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
|
||||
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
|
||||
|
||||
log.info("Batch {}: Running {} gs processes with ({}) pages each",
|
||||
@ -63,9 +74,9 @@ public class GhostScriptService {
|
||||
int finalBatchIdx = batchIdx;
|
||||
List<Process> processes = processInfos.stream()
|
||||
.parallel()
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles))
|
||||
.peek(s -> log.debug(String.join(" ", s)))
|
||||
.map(this::executeProcess)
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath))
|
||||
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
|
||||
.map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue))
|
||||
.toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
@ -73,14 +84,9 @@ public class GhostScriptService {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx);
|
||||
for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) {
|
||||
OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document);
|
||||
imageOutputQueue.put(image);
|
||||
}
|
||||
|
||||
}
|
||||
asyncTransferThread.interrupt();
|
||||
}
|
||||
|
||||
|
||||
@ -107,20 +113,28 @@ public class GhostScriptService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private String[] buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath,
|
||||
List<RenderedPageImageFile> fullPageImages) {
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath) {
|
||||
|
||||
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
|
||||
|
||||
Map<Integer, RenderedPageImageFile> fullPageImages = new HashMap<>();
|
||||
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
|
||||
Integer pageNumber = stitchedImagePageIndices.get(i);
|
||||
fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
}
|
||||
|
||||
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
|
||||
|
||||
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
|
||||
}
|
||||
|
||||
|
||||
private String[] buildCmdArgs(List<Integer> stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) {
|
||||
|
||||
StringBuilder sPageList = new StringBuilder();
|
||||
int i = 1;
|
||||
for (Integer integer : stitchedImagePageIndices) {
|
||||
@ -131,18 +145,19 @@ public class GhostScriptService {
|
||||
i++;
|
||||
}
|
||||
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
return cmdArgs;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Process executeProcess(String[] cmdArgs) {
|
||||
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue) {
|
||||
|
||||
Process p = Runtime.getRuntime().exec(cmdArgs);
|
||||
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
|
||||
InputStream stdOut = p.getInputStream();
|
||||
ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue);
|
||||
InputStream stdError = p.getErrorStream();
|
||||
ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR);
|
||||
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError);
|
||||
|
||||
stdOutLogger.start();
|
||||
stdErrorLogger.start();
|
||||
@ -150,6 +165,10 @@ public class GhostScriptService {
|
||||
}
|
||||
|
||||
|
||||
private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, RenderedPageImageFile> renderedPageImageFiles) {
|
||||
|
||||
}
|
||||
|
||||
private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Graphics;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -18,13 +17,12 @@ import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.Getter;
|
||||
@ -33,8 +31,7 @@ import lombok.SneakyThrows;
|
||||
@Getter
|
||||
public class ImageStreamEngine extends PDFStreamEngine {
|
||||
|
||||
private ExtractedOcrImage currentImageOnPage;
|
||||
private List<ExtractedOcrImage> imagesOnCurrentPage;
|
||||
private List<ExtractedImage> imagesOnCurrentPage;
|
||||
private OcrServiceSettings settings;
|
||||
private int pageNum;
|
||||
|
||||
@ -69,22 +66,15 @@ public class ImageStreamEngine extends PDFStreamEngine {
|
||||
}
|
||||
|
||||
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
||||
if (imageXObject.getColorSpace() instanceof PDDeviceRGB) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
} else if (imageXObject.getColorSpace() instanceof PDDeviceGray) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
} else {
|
||||
BufferedImage pdfImage = imageXObject.getImage();
|
||||
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||||
Graphics g = image.getGraphics();
|
||||
g.drawImage(pdfImage, 0, 0, null);
|
||||
g.dispose();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
}
|
||||
this.imagesOnCurrentPage.add(this.currentImageOnPage);
|
||||
//imagesOnPages.add(this.currentImageOnPage);
|
||||
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
|
||||
QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
|
||||
imageXObject.getHeight(),
|
||||
imageXObject.getWidth(),
|
||||
imageXObject.getImage(),
|
||||
imageCTM,
|
||||
imagesOnCurrentPage.size(),
|
||||
imageXObject.getColorSpace()));
|
||||
|
||||
} else if (xobject instanceof PDFormXObject) {
|
||||
PDFormXObject form = (PDFormXObject) xobject;
|
||||
showForm(form);
|
||||
|
||||
@ -107,7 +107,7 @@ public class OCRService {
|
||||
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
|
||||
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
|
||||
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads);
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));
|
||||
|
||||
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
|
||||
documentFile,
|
||||
@ -128,7 +128,7 @@ public class OCRService {
|
||||
.toList();
|
||||
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
|
||||
ocrImageFactory.join();
|
||||
log.info("Extracted all images, interrupting ocr threads");
|
||||
log.info("Processed all images, interrupting ocr threads");
|
||||
|
||||
ocrThreads.forEach(Thread::interrupt);
|
||||
for (OCRThread ocrThread : ocrThreads) {
|
||||
|
||||
@ -6,13 +6,17 @@ import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
@ -29,6 +33,8 @@ public class OcrImageFactory {
|
||||
File documentFile;
|
||||
Path tmpImageDir;
|
||||
GhostScriptService ghostScriptService;
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue;
|
||||
ImageProcessingThread imageProcessingThread;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
List<ImageExtractionThread> imageExtractionThreads;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
@ -40,7 +46,7 @@ public class OcrImageFactory {
|
||||
Path tmpImageDir,
|
||||
int numberOfThreads,
|
||||
GhostScriptService ghostScriptService,
|
||||
BlockingQueue<OcrImage> imageOutputQueue,
|
||||
BlockingQueue<OcrImage> imageOcrQueue,
|
||||
OcrProgressLogger logger,
|
||||
OcrServiceSettings settings,
|
||||
Statistics stats) {
|
||||
@ -49,7 +55,8 @@ public class OcrImageFactory {
|
||||
this.documentFile = documentFile;
|
||||
this.tmpImageDir = tmpImageDir;
|
||||
this.ghostScriptService = ghostScriptService;
|
||||
this.imageOutputQueue = imageOutputQueue;
|
||||
this.imageOutputQueue = imageOcrQueue;
|
||||
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity());
|
||||
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
|
||||
this.stats = stats;
|
||||
|
||||
@ -57,8 +64,10 @@ public class OcrImageFactory {
|
||||
|
||||
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
|
||||
for (int i = 0; i < balancedPageNumbers.size(); i++) {
|
||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers));
|
||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
|
||||
}
|
||||
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document);
|
||||
|
||||
log.info("Started {} image extraction threads, with ({}) pages each",
|
||||
imageExtractionThreads.size(),
|
||||
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
|
||||
@ -70,6 +79,8 @@ public class OcrImageFactory {
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.start();
|
||||
}
|
||||
imageProcessingThread.start();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -79,11 +90,15 @@ public class OcrImageFactory {
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.join();
|
||||
}
|
||||
if (stitchedPageNumbers.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
|
||||
if (!stitchedPageNumbers.isEmpty()) {
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
|
||||
}
|
||||
imageProcessingThread.interrupt();
|
||||
log.info("All images extracted, interrupting processing thread.");
|
||||
|
||||
imageProcessingThread.join();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -15,6 +15,7 @@ public class Statistics {
|
||||
List<Long> tesseractDuration;
|
||||
AtomicLong pdf2ImgDuration;
|
||||
AtomicLong writingTextDuration;
|
||||
AtomicLong imageProcessingDuration;
|
||||
|
||||
|
||||
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
||||
@ -23,6 +24,7 @@ public class Statistics {
|
||||
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
||||
this.pdf2ImgDuration = new AtomicLong(0);
|
||||
this.writingTextDuration = new AtomicLong(0);
|
||||
this.imageProcessingDuration = new AtomicLong(0);
|
||||
}
|
||||
|
||||
|
||||
@ -32,6 +34,12 @@ public class Statistics {
|
||||
}
|
||||
|
||||
|
||||
public void increaseImageProcessing(long duration) {
|
||||
|
||||
imageProcessingDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
public void increaseTesseractDuration(int threadId, long duration) {
|
||||
|
||||
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
|
||||
@ -53,13 +61,15 @@ public class Statistics {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s",
|
||||
return String.format(
|
||||
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
(float) imageProcessingDuration.get() / 1000,
|
||||
(float) pdf2ImgDuration.get() / 1000,
|
||||
(float) writingTextDuration.get() / 1000);
|
||||
}
|
||||
|
||||
@ -0,0 +1,61 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
|
||||
/*
|
||||
This just moves the elements produced by the GhostScriptOutputHandler into the image processing queue asynchronously.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class BlockingQueueFiller extends Thread {
|
||||
|
||||
BlockingQueue<RenderedPageImageFile> imageInputQueue;
|
||||
BlockingQueue<UnprocessedImage> imageOutputQueue;
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (true) {
|
||||
try {
|
||||
final UnprocessedImage image = imageInputQueue.take();
|
||||
try {
|
||||
imageOutputQueue.put(image);
|
||||
} catch (InterruptedException e) {
|
||||
imageOutputQueue.put(image);
|
||||
break;
|
||||
}
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// empty the queue
|
||||
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
|
||||
imageInputQueue.drainTo(remainingImages);
|
||||
remainingImages.forEach(image -> {
|
||||
try {
|
||||
imageOutputQueue.put(image);
|
||||
} catch (InterruptedException e) {
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,122 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class GhostScriptOutputHandler extends Thread {
|
||||
|
||||
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
|
||||
final InputStream is;
|
||||
final String processName;
|
||||
final Type type;
|
||||
|
||||
final Map<Integer, RenderedPageImageFile> pagesToProcess;
|
||||
final BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput;
|
||||
|
||||
int currentPageNumber;
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler errorHandler(InputStream is) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null);
|
||||
}
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler stdOut(InputStream is,
|
||||
Map<Integer, RenderedPageImageFile> pagesToProcess,
|
||||
BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
while (true) {
|
||||
line = br.readLine();
|
||||
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error(processName + "_" + type.name() + ">" + line);
|
||||
} else {
|
||||
log.debug(processName + "_" + type.name() + ">" + line);
|
||||
addProcessedImageToQueue(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
if (type.equals(Type.STD_OUT)) {
|
||||
queueFinishedPage(currentPageNumber);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addProcessedImageToQueue(String line) {
|
||||
|
||||
/*
|
||||
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
|
||||
*/
|
||||
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
|
||||
if (pageNumberMatcher.find()) {
|
||||
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
|
||||
|
||||
if (currentPageNumber == 0) {
|
||||
currentPageNumber = pageNumber;
|
||||
return;
|
||||
}
|
||||
|
||||
queueFinishedPage(currentPageNumber);
|
||||
currentPageNumber = pageNumber;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void queueFinishedPage(int pageNumber) {
|
||||
|
||||
var imageFile = this.pagesToProcess.get(pageNumber);
|
||||
if (imageFile == null) {
|
||||
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
|
||||
}
|
||||
assert new File(imageFile.absoluteFilePath()).isFile();
|
||||
renderedPageImageFileOutput.add(imageFile);
|
||||
}
|
||||
|
||||
|
||||
public enum Type {
|
||||
ERROR,
|
||||
STD_OUT
|
||||
}
|
||||
|
||||
}
|
||||
@ -5,12 +5,11 @@ import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
@ -26,6 +25,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageExtractionThread extends Thread {
|
||||
|
||||
static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
|
||||
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
|
||||
|
||||
int id;
|
||||
@ -37,9 +37,10 @@ public class ImageExtractionThread extends Thread {
|
||||
OcrServiceSettings settings;
|
||||
|
||||
// output is written to these lists
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
@ -48,28 +49,28 @@ public class ImageExtractionThread extends Thread {
|
||||
for (Integer pageIndex : pageIndices) {
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
|
||||
timestamp = System.currentTimeMillis();
|
||||
List<ExtractedOcrImage> extractedOcrImages = getExtractedOcrImages(pageIndex, document);
|
||||
List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
|
||||
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
|
||||
if (extractedOcrImages.isEmpty()) {
|
||||
if (extractedImages.isEmpty()) {
|
||||
logger.logPageSkipped(pageIndex);
|
||||
}
|
||||
|
||||
if (checkForStitchedImages(extractedOcrImages)) {
|
||||
if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
|
||||
stitchedPageNumbers.add(pageIndex);
|
||||
logger.addImagesToProcess(pageIndex, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (ExtractedOcrImage image : extractedOcrImages) {
|
||||
imageOutputQueue.put(image);
|
||||
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
|
||||
for (ExtractedImage image : extractedImages) {
|
||||
imageProcessingQueue.put((UnprocessedImage) image);
|
||||
logger.addImagesToProcess(image.pageNumber(), image.numberOnPage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<ExtractedOcrImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
|
||||
private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {
|
||||
|
||||
PDPage page = document.getPage(pageIndex - 1);
|
||||
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
|
||||
@ -79,22 +80,22 @@ public class ImageExtractionThread extends Thread {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean checkForStitchedImages(List<ExtractedOcrImage> imagesOnCurrentPage) {
|
||||
private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
|
||||
|
||||
if (imagesOnCurrentPage.size() <= 1) {
|
||||
if (imagesOnCurrentPage.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//checking for intersections or direct alignment of images
|
||||
ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()];
|
||||
int index = 0;
|
||||
for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) {
|
||||
imageOnPagesArray[index] = imageOnPage;
|
||||
index++;
|
||||
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
|
||||
if (imageOnPage.width() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.height() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < imageOnPagesArray.length; j++) {
|
||||
for (int i = j + 1; i < imageOnPagesArray.length; i++) {
|
||||
if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
|
||||
|
||||
//checking for intersections or direct alignment of images
|
||||
for (int j = 0; j < imagesOnCurrentPage.size(); j++) {
|
||||
for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) {
|
||||
if (imagesOnCurrentPage.get(j).getImageCoordinatesInInitialUserSpace().aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
|
||||
// TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -0,0 +1,205 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
/*
|
||||
* This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageProcessingThread extends Thread {
|
||||
|
||||
BlockingQueue<UnprocessedImage> imageInputQueue;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
PDDocument document;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (true) {
|
||||
try {
|
||||
final UnprocessedImage image = imageInputQueue.take();
|
||||
OcrImage extractedOcrImage = this.process(image);
|
||||
try {
|
||||
imageOutputQueue.put(extractedOcrImage);
|
||||
} catch (InterruptedException e) {
|
||||
imageOutputQueue.put(extractedOcrImage);
|
||||
break;
|
||||
}
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// empty the queue
|
||||
List<UnprocessedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
|
||||
imageInputQueue.drainTo(remainingImages);
|
||||
remainingImages.forEach(image -> {
|
||||
OcrImage ocrImage = this.process(image);
|
||||
try {
|
||||
imageOutputQueue.put(ocrImage);
|
||||
} catch (InterruptedException e) {
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
});
|
||||
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
}
|
||||
|
||||
|
||||
private OcrImage process(UnprocessedImage unprocessedImage) {
|
||||
|
||||
long timestamp = System.currentTimeMillis();
|
||||
|
||||
OcrImage ocrImage;
|
||||
if (unprocessedImage instanceof ExtractedImage extractedImage) {
|
||||
ocrImage = processExtractedImage(extractedImage);
|
||||
} else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) {
|
||||
ocrImage = processRenderedPageImageFile(renderedPageImageFile);
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass()));
|
||||
}
|
||||
|
||||
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
|
||||
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
|
||||
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
||||
|
||||
Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||
|
||||
OcrImage ocrImage = new RenderedPageOcrImage(pix.h,
|
||||
pix.w,
|
||||
PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)),
|
||||
rotatedPix,
|
||||
orientDegree);
|
||||
|
||||
if (pix != rotatedPix) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
|
||||
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
|
||||
|
||||
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
|
||||
|
||||
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||
|
||||
OcrImage ocrImage = new ExtractedOcrImage(extractedImage.pageNumber(),
|
||||
extractedImage.numberOnPage(),
|
||||
extractedImage.height(),
|
||||
extractedImage.width(),
|
||||
extractedImage.ctm(),
|
||||
rotatedPix,
|
||||
pix.h,
|
||||
pix.w,
|
||||
orientDegree);
|
||||
|
||||
if (pix != rotatedPix) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
|
||||
|
||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
|
||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);
|
||||
|
||||
IntBuffer orientationDegreeResultBuffer;
|
||||
FloatBuffer orientationDegreeConfidenceBuffer;
|
||||
PointerByReference scriptureNameBuffer;
|
||||
FloatBuffer scriptureConfidenceBuffer;
|
||||
|
||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
scriptureNameBuffer = new PointerByReference();
|
||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
|
||||
int orientationDegree = 0;
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
||||
orientationDegreeResultBuffer,
|
||||
orientationDegreeConfidenceBuffer,
|
||||
scriptureNameBuffer,
|
||||
scriptureConfidenceBuffer);
|
||||
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) {
|
||||
orientationDegree = orientationDegreeResultBuffer.get();
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
|
||||
return orientationDegree;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {
|
||||
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix);
|
||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,6 +1,10 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.FloatBuffer;
|
||||
@ -16,6 +20,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
|
||||
import com.sun.jna.StringArray;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -43,7 +48,6 @@ public class OCRThread extends Thread {
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
Tesseract2 instance;
|
||||
ITessAPI.TessBaseAPI detectionScriptHandle;
|
||||
|
||||
|
||||
public OCRThread(int id,
|
||||
@ -62,7 +66,6 @@ public class OCRThread extends Thread {
|
||||
this.stats = stats;
|
||||
this.settings = settings;
|
||||
this.instance = createInstance(settings);
|
||||
this.detectionScriptHandle = initDetectionScriptHandle();
|
||||
}
|
||||
|
||||
|
||||
@ -87,10 +90,9 @@ public class OCRThread extends Thread {
|
||||
this.process(image);
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("Processed all Images, finishing.");
|
||||
log.debug("Executed tesseract on all Images, finishing.");
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
}
|
||||
|
||||
|
||||
@ -102,15 +104,8 @@ public class OCRThread extends Thread {
|
||||
|
||||
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
|
||||
|
||||
int orientDegree = detectOrientation(image);
|
||||
image.setRotationDegrees(orientDegree);
|
||||
Pix rotatedPix = image.getRotatedPix();
|
||||
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
image.destroyPix();
|
||||
LeptUtils.disposePix(rotatedPix);
|
||||
}
|
||||
executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);
|
||||
image.destroyPix();
|
||||
|
||||
results.add(OcrResult.create(image, tesseractOutputFileName));
|
||||
logger.logImageFinished(image, psm);
|
||||
@ -118,51 +113,6 @@ public class OCRThread extends Thread {
|
||||
}
|
||||
|
||||
|
||||
public int detectOrientation(OcrImage image) {
|
||||
|
||||
IntBuffer orientationDegreeResultBuffer;
|
||||
FloatBuffer orientationDegreeConfidenceBuffer;
|
||||
PointerByReference scriptureNameBuffer;
|
||||
FloatBuffer scriptureConfidenceBuffer;
|
||||
|
||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
|
||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
|
||||
|
||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization.
|
||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
scriptureNameBuffer = new PointerByReference();
|
||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
}
|
||||
|
||||
int orient_deg = 0;
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
||||
orientationDegreeResultBuffer,
|
||||
orientationDegreeConfidenceBuffer,
|
||||
scriptureNameBuffer,
|
||||
scriptureConfidenceBuffer);
|
||||
if (result == TRUE) {
|
||||
orient_deg = orientationDegreeResultBuffer.get();
|
||||
}
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
}
|
||||
|
||||
return orient_deg;
|
||||
}
|
||||
|
||||
|
||||
synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
||||
|
||||
|
||||
@ -1,55 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ProcessIOLogger extends Thread {
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
|
||||
InputStream is;
|
||||
String processName;
|
||||
Type type;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
while (true) {
|
||||
line = br.readLine();
|
||||
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error(processName + "_" + type.name() + ">" + line);
|
||||
} else {
|
||||
log.debug(processName + "_" + type.name() + ">" + line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
}
|
||||
|
||||
|
||||
public enum Type {
|
||||
ERROR,
|
||||
STD_OUT
|
||||
}
|
||||
|
||||
}
|
||||
@ -14,7 +14,7 @@ public class OcrServiceSettings {
|
||||
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||
int gsProcessCount = 2; // Number of Ghostscript processes
|
||||
int gsProcessCount = 1; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
|
||||
@ -2,10 +2,15 @@ package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.AlphaComposite;
|
||||
import java.awt.Color;
|
||||
import java.awt.Graphics;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -16,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@UtilityClass
|
||||
public class ImageProcessingUtils {
|
||||
|
||||
public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {
|
||||
|
||||
BufferedImage image;
|
||||
if (extractedImage.colorSpace() instanceof PDDeviceRGB || extractedImage.colorSpace() instanceof PDDeviceGray) {
|
||||
image = extractedImage.image();
|
||||
} else {
|
||||
BufferedImage pdfImage = extractedImage.image();
|
||||
image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||||
Graphics g = image.getGraphics();
|
||||
g.drawImage(pdfImage, 0, 0, null);
|
||||
g.dispose();
|
||||
}
|
||||
return image;
|
||||
}
|
||||
|
||||
|
||||
public static Pix despecklePix(Pix pix) {
|
||||
|
||||
assert pix.d == 8;
|
||||
@ -24,7 +45,9 @@ public class ImageProcessingUtils {
|
||||
// too small to properly despeckle, just binarize instead.
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
} else {
|
||||
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||
despeckled = LeptUtils.despeckle(pix,
|
||||
LeptUtils.SEL_STR3,
|
||||
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||
if (despeckled == null) {
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
}
|
||||
@ -57,23 +80,35 @@ public class ImageProcessingUtils {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Pix convertToGrayScale(BufferedImage image) {
|
||||
public static Pix convertToGrayScale(Pix pix) {
|
||||
|
||||
Pix pix = LeptUtils.convertImageToPix(image);
|
||||
if (pix.d == 8) {
|
||||
return pix;
|
||||
} else if (pix.d == 32) {
|
||||
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
} else {
|
||||
} else if (pix.d == 1) {
|
||||
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Pix deRotatePix(int orientDegree, Pix pix) {
|
||||
|
||||
return switch (360 - orientDegree) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(pix, 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(pix, 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(pix, 3);
|
||||
default -> pix;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public static void setAlphaChannelToWhite(BufferedImage image) {
|
||||
|
||||
if (image.getTransparency() == Transparency.TRANSLUCENT) {
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import static net.sourceforge.lept4j.ILeptonica.IFF_PNG;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Disabled
|
||||
class ImageProcessingUtilsTest {
|
||||
|
||||
@BeforeEach
|
||||
public void loadLeptonica() {
|
||||
|
||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testRotation() {
|
||||
|
||||
Pix pix = Leptonica1.pixRead("/home/kschuettler/Downloads/painHarold.webp");
|
||||
Pix pix2 = ImageProcessingUtils.deRotatePix(0, pix);
|
||||
Leptonica1.pixWrite("/tmp/0.png", pix2, IFF_PNG);
|
||||
Pix pix3 = ImageProcessingUtils.deRotatePix(90, pix);
|
||||
Leptonica1.pixWrite("/tmp/90.png", pix3, IFF_PNG);
|
||||
Pix pix4 = ImageProcessingUtils.deRotatePix(180, pix);
|
||||
Leptonica1.pixWrite("/tmp/180.png", pix4, IFF_PNG);
|
||||
Pix pix5 = ImageProcessingUtils.deRotatePix(270, pix);
|
||||
Leptonica1.pixWrite("/tmp/270.png", pix5, IFF_PNG);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,7 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
@ -19,7 +16,7 @@ import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -50,29 +47,6 @@ public class Pdf2ImgTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGhostScript() {
|
||||
|
||||
String outputDir = "/tmp/ghostscript_out/";
|
||||
new File(outputDir).mkdirs();
|
||||
ClassPathResource resource = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf");
|
||||
|
||||
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", resource.getFile().toString(), "-c", "quit"};
|
||||
Process p = Runtime.getRuntime().exec(cmdArgs);
|
||||
ProcessIOLogger logger = new ProcessIOLogger(p.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
logger.start();
|
||||
ProcessIOLogger errorLogger = new ProcessIOLogger(p.getErrorStream(), "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
errorLogger.start();
|
||||
int exitcode = p.waitFor();
|
||||
logger.join();
|
||||
errorLogger.join();
|
||||
System.out.println("Ghostscript finished with exit code " + exitcode);
|
||||
FileSystemUtils.deleteRecursively(new File(outputDir));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGhostScriptParallel() {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user