RED-7669: optimize OCR-module performance
* move all non-thread-safe work to a separate thread in the middle (between image extraction and OCR)
This commit is contained in:
parent
bb5b4a2fd8
commit
efd3a1d952
@ -1,14 +1,20 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.model;
|
package com.knecon.fforesight.service.ocr.processor.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
|
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
|
||||||
import org.apache.pdfbox.util.Matrix;
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@ -24,4 +30,19 @@ public class ExtractedImage {
|
|||||||
int numberOnPage;
|
int numberOnPage;
|
||||||
PDColorSpace colorSpace;
|
PDColorSpace colorSpace;
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public Pix asPix() {
|
||||||
|
|
||||||
|
BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this);
|
||||||
|
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||||
|
return LeptUtils.convertImageToPix(image);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public QuadPoint getImageCoordinatesInInitialUserSpace() {
|
||||||
|
|
||||||
|
return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,18 +1,15 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.model;
|
package com.knecon.fforesight.service.ocr.processor.model;
|
||||||
|
|
||||||
|
import java.awt.Graphics;
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.IntBuffer;
|
|
||||||
import java.util.concurrent.Semaphore;
|
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||||
import org.apache.pdfbox.util.Matrix;
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||||
import com.pdftron.sdf.Obj;
|
|
||||||
import com.sun.jna.StringArray;
|
|
||||||
import com.sun.jna.ptr.PointerByReference;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
@ -27,63 +24,20 @@ import net.sourceforge.tess4j.ITessAPI;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
@Getter
|
@Getter
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class ExtractedOcrImage implements OcrImage {
|
public class ExtractedOcrImage implements OcrImage {
|
||||||
|
|
||||||
final int pageNumber;
|
int pageNumber;
|
||||||
final Pix pix;
|
int numberOnPage;
|
||||||
final int originalHeight;
|
int originalHeight;
|
||||||
final int originalWidth;
|
int originalWidth;
|
||||||
final int height;
|
Matrix ctm;
|
||||||
final int width;
|
Pix pix;
|
||||||
final Matrix ctm;
|
int height;
|
||||||
final int numberOnPage;
|
int width;
|
||||||
|
|
||||||
@Setter
|
|
||||||
int rotationDegrees;
|
int rotationDegrees;
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) {
|
|
||||||
|
|
||||||
this.pageNumber = pageNumber;
|
|
||||||
this.numberOnPage = numberOnPage;
|
|
||||||
this.ctm = ctm;
|
|
||||||
this.originalHeight = bufferedImage.getHeight();
|
|
||||||
this.originalWidth = bufferedImage.getWidth();
|
|
||||||
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
|
|
||||||
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
|
|
||||||
this.height = pix.h;
|
|
||||||
this.width = pix.w;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public ExtractedOcrImage(ExtractedImage image, int targetDpi) {
|
|
||||||
this.pageNumber = image.getPageNumber();
|
|
||||||
this.numberOnPage = image.getNumberOnPage();
|
|
||||||
this.ctm = image.getCtm();
|
|
||||||
this.originalHeight = image.getImage().getHeight();
|
|
||||||
this.originalWidth = image.getImage().getWidth();
|
|
||||||
float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72));
|
|
||||||
this.pix = binarize(image.getImage(), imageDPI, targetDpi);
|
|
||||||
this.height = pix.h;
|
|
||||||
this.width = pix.w;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
|
|
||||||
|
|
||||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
|
||||||
|
|
||||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
|
|
||||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
|
|
||||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
|
||||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public AffineTransform getImageCTM() {
|
public AffineTransform getImageCTM() {
|
||||||
|
|
||||||
|
|||||||
@ -71,13 +71,6 @@ public interface OcrImage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
default BufferedImage getRotatedBufferedImage() {
|
|
||||||
|
|
||||||
return LeptUtils.convertPixToImage(getRotatedPix());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves the rotation degree of the OCR image.
|
* Retrieves the rotation degree of the OCR image.
|
||||||
*
|
*
|
||||||
@ -94,16 +87,6 @@ public interface OcrImage {
|
|||||||
int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image.
|
|
||||||
* Currently only quadrant rotations are supported.
|
|
||||||
* Rotated partial images work, due to the CTM present in the pdf working with any rotation.
|
|
||||||
*
|
|
||||||
* @param rotationDegree The rotation degree of the OCR image.
|
|
||||||
*/
|
|
||||||
void setRotationDegrees(int rotationDegree);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves the buffered image associated with the OCR image.
|
* Retrieves the buffered image associated with the OCR image.
|
||||||
*
|
*
|
||||||
@ -112,24 +95,6 @@ public interface OcrImage {
|
|||||||
Pix getPix();
|
Pix getPix();
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves the rotated image of the OCR image.
|
|
||||||
*
|
|
||||||
* @return The rotated BufferedImage object of the OCR image.
|
|
||||||
*/
|
|
||||||
default Pix getRotatedPix() {
|
|
||||||
|
|
||||||
synchronized (OCRThread.class) {
|
|
||||||
return switch (360 - getRotationDegrees()) {
|
|
||||||
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
|
|
||||||
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
|
|
||||||
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
|
|
||||||
default -> getPix();
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
default int getDpi() {
|
default int getDpi() {
|
||||||
|
|
||||||
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());
|
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());
|
||||||
|
|||||||
@ -49,7 +49,7 @@ public class GhostScriptService {
|
|||||||
|
|
||||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
|
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
|
||||||
numOfProcesses,
|
numOfProcesses,
|
||||||
2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
|
settings.getOcrThreadCount()); // use the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
|
||||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||||
long timestamp = System.currentTimeMillis();
|
long timestamp = System.currentTimeMillis();
|
||||||
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
|
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.service;
|
package com.knecon.fforesight.service.ocr.processor.service;
|
||||||
|
|
||||||
import java.awt.Graphics;
|
import java.awt.Graphics;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
@ -26,6 +27,7 @@ import org.apache.pdfbox.util.Matrix;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
@ -34,7 +36,6 @@ import lombok.SneakyThrows;
|
|||||||
@Getter
|
@Getter
|
||||||
public class ImageStreamEngine extends PDFStreamEngine {
|
public class ImageStreamEngine extends PDFStreamEngine {
|
||||||
|
|
||||||
private ExtractedOcrImage currentImageOnPage;
|
|
||||||
private List<ExtractedImage> imagesOnCurrentPage;
|
private List<ExtractedImage> imagesOnCurrentPage;
|
||||||
private OcrServiceSettings settings;
|
private OcrServiceSettings settings;
|
||||||
private int pageNum;
|
private int pageNum;
|
||||||
@ -71,6 +72,7 @@ public class ImageStreamEngine extends PDFStreamEngine {
|
|||||||
|
|
||||||
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
||||||
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
|
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
|
||||||
|
QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
|
||||||
imageXObject.getHeight(),
|
imageXObject.getHeight(),
|
||||||
imageXObject.getWidth(),
|
imageXObject.getWidth(),
|
||||||
imageXObject.getImage(),
|
imageXObject.getImage(),
|
||||||
@ -78,7 +80,6 @@ public class ImageStreamEngine extends PDFStreamEngine {
|
|||||||
imagesOnCurrentPage.size(),
|
imagesOnCurrentPage.size(),
|
||||||
imageXObject.getColorSpace()));
|
imageXObject.getColorSpace()));
|
||||||
|
|
||||||
//imagesOnPages.add(this.currentImageOnPage);
|
|
||||||
} else if (xobject instanceof PDFormXObject) {
|
} else if (xobject instanceof PDFormXObject) {
|
||||||
PDFormXObject form = (PDFormXObject) xobject;
|
PDFormXObject form = (PDFormXObject) xobject;
|
||||||
showForm(form);
|
showForm(form);
|
||||||
|
|||||||
@ -107,7 +107,7 @@ public class OCRService {
|
|||||||
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
|
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
|
||||||
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
|
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
|
||||||
|
|
||||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads);
|
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));
|
||||||
|
|
||||||
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
|
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
|
||||||
documentFile,
|
documentFile,
|
||||||
@ -128,7 +128,7 @@ public class OCRService {
|
|||||||
.toList();
|
.toList();
|
||||||
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
|
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
|
||||||
ocrImageFactory.join();
|
ocrImageFactory.join();
|
||||||
log.info("Extracted all images, interrupting ocr threads");
|
log.info("Processed all images, interrupting ocr threads");
|
||||||
|
|
||||||
ocrThreads.forEach(Thread::interrupt);
|
ocrThreads.forEach(Thread::interrupt);
|
||||||
for (OCRThread ocrThread : ocrThreads) {
|
for (OCRThread ocrThread : ocrThreads) {
|
||||||
|
|||||||
@ -6,13 +6,16 @@ import java.util.ArrayList;
|
|||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ArrayBlockingQueue;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
|
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
|
||||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||||
|
|
||||||
@ -29,6 +32,8 @@ public class OcrImageFactory {
|
|||||||
File documentFile;
|
File documentFile;
|
||||||
Path tmpImageDir;
|
Path tmpImageDir;
|
||||||
GhostScriptService ghostScriptService;
|
GhostScriptService ghostScriptService;
|
||||||
|
BlockingQueue<ExtractedImage> imageProcessingQueue;
|
||||||
|
ImageProcessingThread imageProcessingThread;
|
||||||
BlockingQueue<OcrImage> imageOutputQueue;
|
BlockingQueue<OcrImage> imageOutputQueue;
|
||||||
List<ImageExtractionThread> imageExtractionThreads;
|
List<ImageExtractionThread> imageExtractionThreads;
|
||||||
List<Integer> stitchedPageNumbers;
|
List<Integer> stitchedPageNumbers;
|
||||||
@ -50,6 +55,7 @@ public class OcrImageFactory {
|
|||||||
this.tmpImageDir = tmpImageDir;
|
this.tmpImageDir = tmpImageDir;
|
||||||
this.ghostScriptService = ghostScriptService;
|
this.ghostScriptService = ghostScriptService;
|
||||||
this.imageOutputQueue = imageOutputQueue;
|
this.imageOutputQueue = imageOutputQueue;
|
||||||
|
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity());
|
||||||
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
|
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
|
||||||
this.stats = stats;
|
this.stats = stats;
|
||||||
|
|
||||||
@ -57,8 +63,10 @@ public class OcrImageFactory {
|
|||||||
|
|
||||||
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
|
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
|
||||||
for (int i = 0; i < balancedPageNumbers.size(); i++) {
|
for (int i = 0; i < balancedPageNumbers.size(); i++) {
|
||||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers));
|
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
|
||||||
}
|
}
|
||||||
|
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings);
|
||||||
|
|
||||||
log.info("Started {} image extraction threads, with ({}) pages each",
|
log.info("Started {} image extraction threads, with ({}) pages each",
|
||||||
imageExtractionThreads.size(),
|
imageExtractionThreads.size(),
|
||||||
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
|
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
|
||||||
@ -70,6 +78,8 @@ public class OcrImageFactory {
|
|||||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||||
imageExtractionThread.start();
|
imageExtractionThread.start();
|
||||||
}
|
}
|
||||||
|
imageProcessingThread.start();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -79,11 +89,15 @@ public class OcrImageFactory {
|
|||||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||||
imageExtractionThread.join();
|
imageExtractionThread.join();
|
||||||
}
|
}
|
||||||
if (stitchedPageNumbers.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
|
if (!stitchedPageNumbers.isEmpty()) {
|
||||||
|
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
|
||||||
|
}
|
||||||
|
imageProcessingThread.interrupt();
|
||||||
|
log.info("All images extracted, interrupting processing thread.");
|
||||||
|
|
||||||
|
imageProcessingThread.join();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,6 +15,7 @@ public class Statistics {
|
|||||||
List<Long> tesseractDuration;
|
List<Long> tesseractDuration;
|
||||||
AtomicLong pdf2ImgDuration;
|
AtomicLong pdf2ImgDuration;
|
||||||
AtomicLong writingTextDuration;
|
AtomicLong writingTextDuration;
|
||||||
|
AtomicLong imageProcessingDuration;
|
||||||
|
|
||||||
|
|
||||||
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
||||||
@ -23,6 +24,7 @@ public class Statistics {
|
|||||||
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
||||||
this.pdf2ImgDuration = new AtomicLong(0);
|
this.pdf2ImgDuration = new AtomicLong(0);
|
||||||
this.writingTextDuration = new AtomicLong(0);
|
this.writingTextDuration = new AtomicLong(0);
|
||||||
|
this.imageProcessingDuration = new AtomicLong(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -32,6 +34,12 @@ public class Statistics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void increaseImageProcessing(long duration) {
|
||||||
|
|
||||||
|
imageProcessingDuration.addAndGet(duration);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public void increaseTesseractDuration(int threadId, long duration) {
|
public void increaseTesseractDuration(int threadId, long duration) {
|
||||||
|
|
||||||
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
|
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
|
||||||
@ -53,13 +61,15 @@ public class Statistics {
|
|||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s",
|
return String.format(
|
||||||
|
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
|
||||||
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||||
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||||
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||||
|
(float) imageProcessingDuration.get() / 1000,
|
||||||
(float) pdf2ImgDuration.get() / 1000,
|
(float) pdf2ImgDuration.get() / 1000,
|
||||||
(float) writingTextDuration.get() / 1000);
|
(float) writingTextDuration.get() / 1000);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,8 +9,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||||
@ -26,7 +24,7 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class ImageExtractionThread extends Thread {
|
public class ImageExtractionThread extends Thread {
|
||||||
|
|
||||||
static double FULL_PAGE_IMAGE_THRESHOLD = 0.98;
|
static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
|
||||||
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
|
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
|
||||||
|
|
||||||
int id;
|
int id;
|
||||||
@ -38,7 +36,7 @@ public class ImageExtractionThread extends Thread {
|
|||||||
OcrServiceSettings settings;
|
OcrServiceSettings settings;
|
||||||
|
|
||||||
// output is written to these lists
|
// output is written to these lists
|
||||||
BlockingQueue<OcrImage> imageOutputQueue;
|
BlockingQueue<ExtractedImage> imageProcessingQueue;
|
||||||
List<Integer> stitchedPageNumbers;
|
List<Integer> stitchedPageNumbers;
|
||||||
|
|
||||||
|
|
||||||
@ -50,21 +48,20 @@ public class ImageExtractionThread extends Thread {
|
|||||||
for (Integer pageIndex : pageIndices) {
|
for (Integer pageIndex : pageIndices) {
|
||||||
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
|
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
|
||||||
timestamp = System.currentTimeMillis();
|
timestamp = System.currentTimeMillis();
|
||||||
List<ExtractedImage> extractedImages = getExtractedOcrImages(pageIndex, document);
|
List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
|
||||||
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
|
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
|
||||||
if (extractedImages.isEmpty()) {
|
if (extractedImages.isEmpty()) {
|
||||||
logger.logPageSkipped(pageIndex);
|
logger.logPageSkipped(pageIndex);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
|
if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
|
||||||
stitchedPageNumbers.add(pageIndex);
|
stitchedPageNumbers.add(pageIndex);
|
||||||
logger.addImagesToProcess(pageIndex, 0);
|
logger.addImagesToProcess(pageIndex, 0);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (ExtractedImage image : extractedImages) {
|
for (ExtractedImage image : extractedImages) {
|
||||||
ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi());
|
imageProcessingQueue.put(image);
|
||||||
imageOutputQueue.put(ocrImage);
|
|
||||||
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
|
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -72,7 +69,7 @@ public class ImageExtractionThread extends Thread {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<ExtractedImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
|
private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {
|
||||||
|
|
||||||
PDPage page = document.getPage(pageIndex - 1);
|
PDPage page = document.getPage(pageIndex - 1);
|
||||||
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
|
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
|
||||||
@ -82,14 +79,14 @@ public class ImageExtractionThread extends Thread {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private boolean checkForStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
|
private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
|
||||||
|
|
||||||
if (imagesOnCurrentPage.isEmpty()) {
|
if (imagesOnCurrentPage.isEmpty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
|
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
|
||||||
if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) {
|
if (imageOnPage.getWidth() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.getHeight() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,166 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||||
|
|
||||||
|
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||||
|
|
||||||
|
import java.nio.FloatBuffer;
|
||||||
|
import java.nio.IntBuffer;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.BlockingQueue;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||||
|
import com.sun.jna.ptr.PointerByReference;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
|
import net.sourceforge.tess4j.ITessAPI;
|
||||||
|
import net.sourceforge.tess4j.TessAPI1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
|
||||||
|
*/
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class ImageProcessingThread extends Thread {
|
||||||
|
|
||||||
|
BlockingQueue<ExtractedImage> imageInputQueue;
|
||||||
|
BlockingQueue<OcrImage> imageOutputQueue;
|
||||||
|
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||||
|
Statistics stats;
|
||||||
|
OcrServiceSettings settings;
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
|
||||||
|
// Interrupting signals that the image extraction has finished
|
||||||
|
while (true) {
|
||||||
|
try {
|
||||||
|
final ExtractedImage image = imageInputQueue.take();
|
||||||
|
OcrImage extractedOcrImage = this.process(image);
|
||||||
|
try {
|
||||||
|
imageOutputQueue.put(extractedOcrImage);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
imageOutputQueue.put(extractedOcrImage);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.info("Leaving initial uninterrupted loop!");
|
||||||
|
// empty the queue
|
||||||
|
List<ExtractedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
|
||||||
|
imageInputQueue.drainTo(remainingImages);
|
||||||
|
remainingImages.forEach(image -> {
|
||||||
|
OcrImage ocrImage = this.process(image);
|
||||||
|
try {
|
||||||
|
imageOutputQueue.put(ocrImage);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.error(e.getMessage());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private OcrImage process(ExtractedImage extractedImage) {
|
||||||
|
|
||||||
|
long timestamp = System.currentTimeMillis();
|
||||||
|
float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72));
|
||||||
|
|
||||||
|
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||||
|
|
||||||
|
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||||
|
Pix rotatedPix = switch (360 - orientDegree) {
|
||||||
|
case 90 -> Leptonica1.pixRotateOrth(pix, 1);
|
||||||
|
case 180 -> Leptonica1.pixRotateOrth(pix, 2);
|
||||||
|
case 270 -> Leptonica1.pixRotateOrth(pix, 3);
|
||||||
|
default -> pix;
|
||||||
|
};
|
||||||
|
OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(),
|
||||||
|
extractedImage.getNumberOnPage(),
|
||||||
|
extractedImage.getHeight(),
|
||||||
|
extractedImage.getWidth(),
|
||||||
|
extractedImage.getCtm(),
|
||||||
|
rotatedPix,
|
||||||
|
pix.h,
|
||||||
|
pix.w,
|
||||||
|
orientDegree);
|
||||||
|
|
||||||
|
if (pix != rotatedPix) {
|
||||||
|
LeptUtils.disposePix(pix);
|
||||||
|
}
|
||||||
|
|
||||||
|
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
|
||||||
|
|
||||||
|
return extractedOcrImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
|
||||||
|
|
||||||
|
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
|
||||||
|
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);
|
||||||
|
|
||||||
|
IntBuffer orientationDegreeResultBuffer;
|
||||||
|
FloatBuffer orientationDegreeConfidenceBuffer;
|
||||||
|
PointerByReference scriptureNameBuffer;
|
||||||
|
FloatBuffer scriptureConfidenceBuffer;
|
||||||
|
|
||||||
|
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||||
|
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||||
|
scriptureNameBuffer = new PointerByReference();
|
||||||
|
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||||
|
|
||||||
|
int orientationDegree = 0;
|
||||||
|
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
||||||
|
orientationDegreeResultBuffer,
|
||||||
|
orientationDegreeConfidenceBuffer,
|
||||||
|
scriptureNameBuffer,
|
||||||
|
scriptureConfidenceBuffer);
|
||||||
|
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) {
|
||||||
|
orientationDegree = orientationDegreeResultBuffer.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||||
|
|
||||||
|
return orientationDegree;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {
|
||||||
|
|
||||||
|
Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix);
|
||||||
|
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||||
|
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||||
|
|
||||||
|
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||||
|
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||||
|
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||||
|
|
||||||
|
return handle;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -47,8 +47,7 @@ public class OCRThread extends Thread {
|
|||||||
OcrProgressLogger logger;
|
OcrProgressLogger logger;
|
||||||
Statistics stats;
|
Statistics stats;
|
||||||
OcrServiceSettings settings;
|
OcrServiceSettings settings;
|
||||||
ITessAPI.TessBaseAPI detectionScriptHandle;
|
Tesseract2 instance;
|
||||||
ITessAPI.TessBaseAPI tesseractHandle;
|
|
||||||
|
|
||||||
|
|
||||||
public OCRThread(int id,
|
public OCRThread(int id,
|
||||||
@ -66,8 +65,7 @@ public class OCRThread extends Thread {
|
|||||||
this.logger = logger;
|
this.logger = logger;
|
||||||
this.stats = stats;
|
this.stats = stats;
|
||||||
this.settings = settings;
|
this.settings = settings;
|
||||||
this.detectionScriptHandle = initDetectionScriptHandle();
|
this.instance = createInstance(settings);
|
||||||
this.tesseractHandle = initTesseractHandle(settings);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -92,10 +90,9 @@ public class OCRThread extends Thread {
|
|||||||
this.process(image);
|
this.process(image);
|
||||||
}
|
}
|
||||||
} catch (NoSuchElementException e) {
|
} catch (NoSuchElementException e) {
|
||||||
log.debug("Processed all Images, finishing.");
|
log.debug("Executed tesseract on all Images, finishing.");
|
||||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
|
||||||
TessAPI1.TessBaseAPIDelete(this.tesseractHandle);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -107,13 +104,8 @@ public class OCRThread extends Thread {
|
|||||||
|
|
||||||
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
|
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
|
||||||
|
|
||||||
int orientDegree = detectOrientation(image);
|
executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);
|
||||||
image.setRotationDegrees(orientDegree);
|
|
||||||
Pix rotatedPix = image.getRotatedPix();
|
|
||||||
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
|
|
||||||
|
|
||||||
image.destroyPix();
|
image.destroyPix();
|
||||||
LeptUtils.disposePix(rotatedPix);
|
|
||||||
|
|
||||||
results.add(OcrResult.create(image, tesseractOutputFileName));
|
results.add(OcrResult.create(image, tesseractOutputFileName));
|
||||||
logger.logImageFinished(image, psm);
|
logger.logImageFinished(image, psm);
|
||||||
@ -121,67 +113,6 @@ public class OCRThread extends Thread {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public int detectOrientation(OcrImage image) {
|
|
||||||
|
|
||||||
IntBuffer orientationDegreeResultBuffer;
|
|
||||||
FloatBuffer orientationDegreeConfidenceBuffer;
|
|
||||||
PointerByReference scriptureNameBuffer;
|
|
||||||
FloatBuffer scriptureConfidenceBuffer;
|
|
||||||
|
|
||||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
|
|
||||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
|
|
||||||
|
|
||||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization.
|
|
||||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
|
||||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
|
||||||
scriptureNameBuffer = new PointerByReference();
|
|
||||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
int orient_deg = 0;
|
|
||||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
|
||||||
orientationDegreeResultBuffer,
|
|
||||||
orientationDegreeConfidenceBuffer,
|
|
||||||
scriptureNameBuffer,
|
|
||||||
scriptureConfidenceBuffer);
|
|
||||||
if (result == TRUE) {
|
|
||||||
orient_deg = orientationDegreeResultBuffer.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
|
||||||
|
|
||||||
return orient_deg;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
|
||||||
|
|
||||||
synchronized (OCRThread.class) {
|
|
||||||
|
|
||||||
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
|
|
||||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
|
||||||
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
|
|
||||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
|
||||||
|
|
||||||
return handle;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) {
|
|
||||||
|
|
||||||
synchronized (OCRThread.class) {
|
|
||||||
|
|
||||||
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
|
|
||||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
|
||||||
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
|
|
||||||
TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0);
|
|
||||||
|
|
||||||
return handle;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
||||||
|
|
||||||
@ -192,14 +123,19 @@ public class OCRThread extends Thread {
|
|||||||
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
|
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
|
||||||
}
|
}
|
||||||
|
|
||||||
TessBaseAPISetPageSegMode(tesseractHandle, psm);
|
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
||||||
|
instance.setPageSegMode(psm);
|
||||||
|
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
||||||
|
}
|
||||||
|
|
||||||
Tesseract2.createDocumentsWithResults(pix,
|
|
||||||
null,
|
private static Tesseract2 createInstance(OcrServiceSettings settings) {
|
||||||
tesseractOutputFileName,
|
|
||||||
List.of(ITesseract.RenderedFormat.HOCR),
|
Tesseract2 instance = new Tesseract2();
|
||||||
ITessAPI.TessPageIteratorLevel.RIL_BLOCK,
|
instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
|
||||||
tesseractHandle);
|
instance.setOcrEngineMode(1); // set to LSTM based Engine
|
||||||
|
instance.setLanguage(settings.getLanguages());
|
||||||
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -13,13 +13,13 @@ import lombok.experimental.FieldDefaults;
|
|||||||
public class OcrServiceSettings {
|
public class OcrServiceSettings {
|
||||||
|
|
||||||
int ocrThreadCount = 16; // Number of OCR threads
|
int ocrThreadCount = 16; // Number of OCR threads
|
||||||
int imageExtractThreadCount = 5; // Number of image extraction threads
|
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||||
int gsProcessCount = 5; // Number of Ghostscript processes
|
int gsProcessCount = 2; // Number of Ghostscript processes
|
||||||
int dpi = 300; // Target DPI for binarized images
|
int dpi = 300; // Target DPI for binarized images
|
||||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||||
int minImageHeight = 20; // Minimum height for images to be processed
|
int minImageHeight = 20; // Minimum height for images to be processed
|
||||||
int minImageWidth = 20; // Minimum width for images to be processed
|
int minImageWidth = 20; // Minimum width for images to be processed
|
||||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||||
boolean removeWatermark; // If true, watermarks will be removed
|
boolean removeWatermark; // If true, watermarks will be removed
|
||||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||||
|
|||||||
@ -2,10 +2,16 @@ package com.knecon.fforesight.service.ocr.processor.utils;
|
|||||||
|
|
||||||
import java.awt.AlphaComposite;
|
import java.awt.AlphaComposite;
|
||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
|
import java.awt.Graphics;
|
||||||
import java.awt.Graphics2D;
|
import java.awt.Graphics2D;
|
||||||
import java.awt.Transparency;
|
import java.awt.Transparency;
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
import net.sourceforge.lept4j.Leptonica1;
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
@ -15,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
|||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class ImageProcessingUtils {
|
public class ImageProcessingUtils {
|
||||||
|
|
||||||
|
public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {
|
||||||
|
|
||||||
|
BufferedImage image;
|
||||||
|
if (extractedImage.getColorSpace() instanceof PDDeviceRGB || extractedImage.getColorSpace() instanceof PDDeviceGray) {
|
||||||
|
image = extractedImage.getImage();
|
||||||
|
} else {
|
||||||
|
BufferedImage pdfImage = extractedImage.getImage();
|
||||||
|
image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||||||
|
Graphics g = image.getGraphics();
|
||||||
|
g.drawImage(pdfImage, 0, 0, null);
|
||||||
|
g.dispose();
|
||||||
|
}
|
||||||
|
return image;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Pix despecklePix(Pix pix) {
|
public static Pix despecklePix(Pix pix) {
|
||||||
|
|
||||||
assert pix.d == 8;
|
assert pix.d == 8;
|
||||||
@ -23,7 +45,9 @@ public class ImageProcessingUtils {
|
|||||||
// too small to properly despeckle, just binarize instead.
|
// too small to properly despeckle, just binarize instead.
|
||||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||||
} else {
|
} else {
|
||||||
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
despeckled = LeptUtils.despeckle(pix,
|
||||||
|
LeptUtils.SEL_STR3,
|
||||||
|
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||||
if (despeckled == null) {
|
if (despeckled == null) {
|
||||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||||
}
|
}
|
||||||
@ -56,9 +80,8 @@ public class ImageProcessingUtils {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static Pix convertToGrayScale(BufferedImage image) {
|
public static Pix convertToGrayScale(Pix pix) {
|
||||||
|
|
||||||
Pix pix = LeptUtils.convertImageToPix(image);
|
|
||||||
if (pix.d == 8) {
|
if (pix.d == 8) {
|
||||||
return pix;
|
return pix;
|
||||||
} else if (pix.d == 32) {
|
} else if (pix.d == 32) {
|
||||||
|
|||||||
@ -1,54 +1,45 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||||
|
|
||||||
import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE;
|
|
||||||
|
|
||||||
import java.awt.Rectangle;
|
import java.awt.Rectangle;
|
||||||
import java.nio.IntBuffer;
|
import java.nio.IntBuffer;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
|
||||||
import com.sun.jna.Pointer;
|
import com.sun.jna.Pointer;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import net.sourceforge.lept4j.Pix;
|
import net.sourceforge.lept4j.Pix;
|
||||||
import net.sourceforge.tess4j.ITessAPI;
|
|
||||||
import net.sourceforge.tess4j.ITesseract;
|
|
||||||
import net.sourceforge.tess4j.OCRResult;
|
import net.sourceforge.tess4j.OCRResult;
|
||||||
import net.sourceforge.tess4j.TessAPI1;
|
import net.sourceforge.tess4j.TessAPI1;
|
||||||
|
import net.sourceforge.tess4j.Tesseract1;
|
||||||
|
import net.sourceforge.tess4j.TesseractException;
|
||||||
import net.sourceforge.tess4j.Word;
|
import net.sourceforge.tess4j.Word;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
/**
|
/**
|
||||||
* Overridden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All functions are copied and then the BufferedImage -> Pix conversion deleted.
|
* Overridden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All functions are copied and then the BufferedImage -> Pix conversion deleted.
|
||||||
*/
|
*/
|
||||||
@UtilityClass
|
public class Tesseract2 extends Tesseract1 {
|
||||||
public class Tesseract2 extends TessAPI1 {
|
|
||||||
|
|
||||||
private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) {
|
|
||||||
|
|
||||||
String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE);
|
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
|
||||||
|
|
||||||
|
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
|
||||||
TessResultRendererBeginDocument(renderer, title);
|
TessResultRendererBeginDocument(renderer, title);
|
||||||
int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer);
|
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
|
||||||
TessResultRendererEndDocument(renderer);
|
TessResultRendererEndDocument(renderer);
|
||||||
|
|
||||||
// if (result == ITessAPI.FALSE) {
|
// if (result == ITessAPI.FALSE) {
|
||||||
// throw new TesseractException("Error during processing page.");
|
// throw new TesseractException("Error during processing page.");
|
||||||
// }
|
// }
|
||||||
|
|
||||||
return TessBaseAPIMeanTextConf(handle);
|
return TessBaseAPIMeanTextConf(getHandle());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public OCRResult createDocumentsWithResults(Pix bi,
|
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
|
||||||
String filename,
|
|
||||||
String outputbase,
|
|
||||||
List<ITesseract.RenderedFormat> formats,
|
|
||||||
int pageIteratorLevel,
|
|
||||||
ITessAPI.TessBaseAPI handle) {
|
|
||||||
|
|
||||||
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle);
|
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
|
||||||
if (!results.isEmpty()) {
|
if (!results.isEmpty()) {
|
||||||
return results.get(0);
|
return results.get(0);
|
||||||
} else {
|
} else {
|
||||||
@ -57,26 +48,24 @@ public class Tesseract2 extends TessAPI1 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<OCRResult> createDocumentsWithResults(Pix[] pixs,
|
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
|
||||||
String[] filenames,
|
|
||||||
String[] outputbases,
|
|
||||||
List<ITesseract.RenderedFormat> formats,
|
|
||||||
int pageIteratorLevel,
|
|
||||||
ITessAPI.TessBaseAPI handle) {
|
|
||||||
|
|
||||||
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
|
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
|
||||||
throw new RuntimeException("The three arrays must match in length.");
|
throw new RuntimeException("The three arrays must match in length.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
init();
|
||||||
|
setVariables();
|
||||||
|
|
||||||
List<OCRResult> results = new ArrayList<OCRResult>();
|
List<OCRResult> results = new ArrayList<OCRResult>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
for (int i = 0; i < pixs.length; i++) {
|
for (int i = 0; i < pixs.length; i++) {
|
||||||
try {
|
try {
|
||||||
ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer);
|
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
|
||||||
TessDeleteResultRenderer(renderer);
|
TessDeleteResultRenderer(renderer);
|
||||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList<Word>();
|
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
|
||||||
results.add(new OCRResult(meanTextConfidence, words));
|
results.add(new OCRResult(meanTextConfidence, words));
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// skip the problematic image file
|
// skip the problematic image file
|
||||||
@ -84,22 +73,20 @@ public class Tesseract2 extends TessAPI1 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
synchronized (OCRThread.class) {
|
dispose();
|
||||||
TessAPI1.TessBaseAPIClear(handle);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Word> getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) {
|
private List<Word> getRecognizedWords(int pageIteratorLevel) {
|
||||||
|
|
||||||
List<Word> words = new ArrayList<>();
|
List<Word> words = new ArrayList<>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle);
|
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
|
||||||
ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||||
TessPageIteratorBegin(pi);
|
TessPageIteratorBegin(pi);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
@ -132,11 +119,11 @@ public class Tesseract2 extends TessAPI1 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private ITessAPI.TessResultRenderer createRenderers(String outputbase, List<ITesseract.RenderedFormat> formats) {
|
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
|
||||||
|
|
||||||
ITessAPI.TessResultRenderer renderer = null;
|
TessResultRenderer renderer = null;
|
||||||
|
|
||||||
for (ITesseract.RenderedFormat format : formats) {
|
for (RenderedFormat format : formats) {
|
||||||
switch (format) {
|
switch (format) {
|
||||||
|
|
||||||
case HOCR:
|
case HOCR:
|
||||||
|
|||||||
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testOcr() {
|
public void testOcr() {
|
||||||
|
|
||||||
String text = testOCR("files/VV-352892.pdf");
|
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
|
|
||||||
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
|
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
|
||||||
List<File> foundFiles = Files.walk(Path.of(dir))
|
List<File> foundFiles = Files.walk(Path.of(dir))
|
||||||
// .sorted(Comparator.comparingLong(this::getFileSize))
|
.sorted(Comparator.comparingLong(this::getFileSize))
|
||||||
.map(Path::toFile)
|
.map(Path::toFile)
|
||||||
.filter(file -> file.getName().endsWith(".pdf"))
|
.filter(file -> file.getName().endsWith(".pdf"))
|
||||||
.peek(System.out::println)
|
.peek(System.out::println)
|
||||||
@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testOcrForSpecificFile() {
|
public void testOcrForSpecificFile() {
|
||||||
|
|
||||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf"));
|
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
|
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
|
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
|
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user