RED-7669: optimize OCR-module performance
* move all non-thread-safe stuff to a separate thread in the middle
This commit is contained in:
parent
bb5b4a2fd8
commit
efd3a1d952
@ -1,14 +1,20 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@ -24,4 +30,19 @@ public class ExtractedImage {
|
||||
int numberOnPage;
|
||||
PDColorSpace colorSpace;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Pix asPix() {
|
||||
|
||||
BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this);
|
||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||
return LeptUtils.convertImageToPix(image);
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getImageCoordinatesInInitialUserSpace() {
|
||||
|
||||
return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,18 +1,15 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.Graphics;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.sun.jna.StringArray;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
@ -27,63 +24,20 @@ import net.sourceforge.tess4j.ITessAPI;
|
||||
@Slf4j
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ExtractedOcrImage implements OcrImage {
|
||||
|
||||
final int pageNumber;
|
||||
final Pix pix;
|
||||
final int originalHeight;
|
||||
final int originalWidth;
|
||||
final int height;
|
||||
final int width;
|
||||
final Matrix ctm;
|
||||
final int numberOnPage;
|
||||
|
||||
@Setter
|
||||
int pageNumber;
|
||||
int numberOnPage;
|
||||
int originalHeight;
|
||||
int originalWidth;
|
||||
Matrix ctm;
|
||||
Pix pix;
|
||||
int height;
|
||||
int width;
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) {
|
||||
|
||||
this.pageNumber = pageNumber;
|
||||
this.numberOnPage = numberOnPage;
|
||||
this.ctm = ctm;
|
||||
this.originalHeight = bufferedImage.getHeight();
|
||||
this.originalWidth = bufferedImage.getWidth();
|
||||
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
|
||||
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
|
||||
this.height = pix.h;
|
||||
this.width = pix.w;
|
||||
}
|
||||
|
||||
|
||||
public ExtractedOcrImage(ExtractedImage image, int targetDpi) {
|
||||
this.pageNumber = image.getPageNumber();
|
||||
this.numberOnPage = image.getNumberOnPage();
|
||||
this.ctm = image.getCtm();
|
||||
this.originalHeight = image.getImage().getHeight();
|
||||
this.originalWidth = image.getImage().getWidth();
|
||||
float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72));
|
||||
this.pix = binarize(image.getImage(), imageDPI, targetDpi);
|
||||
this.height = pix.h;
|
||||
this.width = pix.w;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
|
||||
|
||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||
|
||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
|
||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
|
||||
@ -71,13 +71,6 @@ public interface OcrImage {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default BufferedImage getRotatedBufferedImage() {
|
||||
|
||||
return LeptUtils.convertPixToImage(getRotatedPix());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotation degree of the OCR image.
|
||||
*
|
||||
@ -94,16 +87,6 @@ public interface OcrImage {
|
||||
int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
||||
|
||||
|
||||
/**
|
||||
* Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image.
|
||||
* Currently only quadrant rotations are supported.
|
||||
* Rotated partial images work, due to the CTM present in the pdf working with any rotation.
|
||||
*
|
||||
* @param rotationDegree The rotation degree of the OCR image.
|
||||
*/
|
||||
void setRotationDegrees(int rotationDegree);
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the buffered image associated with the OCR image.
|
||||
*
|
||||
@ -112,24 +95,6 @@ public interface OcrImage {
|
||||
Pix getPix();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotated image of the OCR image.
|
||||
*
|
||||
* @return The rotated BufferedImage object of the OCR image.
|
||||
*/
|
||||
default Pix getRotatedPix() {
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
return switch (360 - getRotationDegrees()) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
|
||||
default -> getPix();
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
default int getDpi() {
|
||||
|
||||
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());
|
||||
|
||||
@ -49,7 +49,7 @@ public class GhostScriptService {
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
|
||||
numOfProcesses,
|
||||
2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
|
||||
settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
|
||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||
long timestamp = System.currentTimeMillis();
|
||||
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Graphics;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
@ -26,6 +27,7 @@ import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.Getter;
|
||||
@ -34,7 +36,6 @@ import lombok.SneakyThrows;
|
||||
@Getter
|
||||
public class ImageStreamEngine extends PDFStreamEngine {
|
||||
|
||||
private ExtractedOcrImage currentImageOnPage;
|
||||
private List<ExtractedImage> imagesOnCurrentPage;
|
||||
private OcrServiceSettings settings;
|
||||
private int pageNum;
|
||||
@ -71,6 +72,7 @@ public class ImageStreamEngine extends PDFStreamEngine {
|
||||
|
||||
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
||||
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
|
||||
QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
|
||||
imageXObject.getHeight(),
|
||||
imageXObject.getWidth(),
|
||||
imageXObject.getImage(),
|
||||
@ -78,7 +80,6 @@ public class ImageStreamEngine extends PDFStreamEngine {
|
||||
imagesOnCurrentPage.size(),
|
||||
imageXObject.getColorSpace()));
|
||||
|
||||
//imagesOnPages.add(this.currentImageOnPage);
|
||||
} else if (xobject instanceof PDFormXObject) {
|
||||
PDFormXObject form = (PDFormXObject) xobject;
|
||||
showForm(form);
|
||||
|
||||
@ -107,7 +107,7 @@ public class OCRService {
|
||||
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
|
||||
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
|
||||
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads);
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));
|
||||
|
||||
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
|
||||
documentFile,
|
||||
@ -128,7 +128,7 @@ public class OCRService {
|
||||
.toList();
|
||||
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
|
||||
ocrImageFactory.join();
|
||||
log.info("Extracted all images, interrupting ocr threads");
|
||||
log.info("Processed all images, interrupting ocr threads");
|
||||
|
||||
ocrThreads.forEach(Thread::interrupt);
|
||||
for (OCRThread ocrThread : ocrThreads) {
|
||||
|
||||
@ -6,13 +6,16 @@ import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
@ -29,6 +32,8 @@ public class OcrImageFactory {
|
||||
File documentFile;
|
||||
Path tmpImageDir;
|
||||
GhostScriptService ghostScriptService;
|
||||
BlockingQueue<ExtractedImage> imageProcessingQueue;
|
||||
ImageProcessingThread imageProcessingThread;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
List<ImageExtractionThread> imageExtractionThreads;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
@ -50,6 +55,7 @@ public class OcrImageFactory {
|
||||
this.tmpImageDir = tmpImageDir;
|
||||
this.ghostScriptService = ghostScriptService;
|
||||
this.imageOutputQueue = imageOutputQueue;
|
||||
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOutputQueue.remainingCapacity());
|
||||
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
|
||||
this.stats = stats;
|
||||
|
||||
@ -57,8 +63,10 @@ public class OcrImageFactory {
|
||||
|
||||
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
|
||||
for (int i = 0; i < balancedPageNumbers.size(); i++) {
|
||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers));
|
||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
|
||||
}
|
||||
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOutputQueue, stats, settings);
|
||||
|
||||
log.info("Started {} image extraction threads, with ({}) pages each",
|
||||
imageExtractionThreads.size(),
|
||||
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
|
||||
@ -70,6 +78,8 @@ public class OcrImageFactory {
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.start();
|
||||
}
|
||||
imageProcessingThread.start();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -79,11 +89,15 @@ public class OcrImageFactory {
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.join();
|
||||
}
|
||||
if (stitchedPageNumbers.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
|
||||
if (!stitchedPageNumbers.isEmpty()) {
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
|
||||
}
|
||||
imageProcessingThread.interrupt();
|
||||
log.info("All images extracted, interrupting processing thread.");
|
||||
|
||||
imageProcessingThread.join();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -15,6 +15,7 @@ public class Statistics {
|
||||
List<Long> tesseractDuration;
|
||||
AtomicLong pdf2ImgDuration;
|
||||
AtomicLong writingTextDuration;
|
||||
AtomicLong imageProcessingDuration;
|
||||
|
||||
|
||||
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
||||
@ -23,6 +24,7 @@ public class Statistics {
|
||||
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
||||
this.pdf2ImgDuration = new AtomicLong(0);
|
||||
this.writingTextDuration = new AtomicLong(0);
|
||||
this.imageProcessingDuration = new AtomicLong(0);
|
||||
}
|
||||
|
||||
|
||||
@ -32,6 +34,12 @@ public class Statistics {
|
||||
}
|
||||
|
||||
|
||||
public void increaseImageProcessing(long duration) {
|
||||
|
||||
imageProcessingDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
public void increaseTesseractDuration(int threadId, long duration) {
|
||||
|
||||
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
|
||||
@ -53,13 +61,15 @@ public class Statistics {
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s",
|
||||
return String.format(
|
||||
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
(float) imageProcessingDuration.get() / 1000,
|
||||
(float) pdf2ImgDuration.get() / 1000,
|
||||
(float) writingTextDuration.get() / 1000);
|
||||
}
|
||||
|
||||
@ -9,8 +9,6 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
@ -26,7 +24,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageExtractionThread extends Thread {
|
||||
|
||||
static double FULL_PAGE_IMAGE_THRESHOLD = 0.98;
|
||||
static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
|
||||
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
|
||||
|
||||
int id;
|
||||
@ -38,7 +36,7 @@ public class ImageExtractionThread extends Thread {
|
||||
OcrServiceSettings settings;
|
||||
|
||||
// output is written to these lists
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
BlockingQueue<ExtractedImage> imageProcessingQueue;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
|
||||
|
||||
@ -50,21 +48,20 @@ public class ImageExtractionThread extends Thread {
|
||||
for (Integer pageIndex : pageIndices) {
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
|
||||
timestamp = System.currentTimeMillis();
|
||||
List<ExtractedImage> extractedImages = getExtractedOcrImages(pageIndex, document);
|
||||
List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
|
||||
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
|
||||
if (extractedImages.isEmpty()) {
|
||||
logger.logPageSkipped(pageIndex);
|
||||
}
|
||||
|
||||
if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
|
||||
if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
|
||||
stitchedPageNumbers.add(pageIndex);
|
||||
logger.addImagesToProcess(pageIndex, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (ExtractedImage image : extractedImages) {
|
||||
ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi());
|
||||
imageOutputQueue.put(ocrImage);
|
||||
imageProcessingQueue.put(image);
|
||||
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
|
||||
}
|
||||
}
|
||||
@ -72,7 +69,7 @@ public class ImageExtractionThread extends Thread {
|
||||
}
|
||||
|
||||
|
||||
private List<ExtractedImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
|
||||
private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {
|
||||
|
||||
PDPage page = document.getPage(pageIndex - 1);
|
||||
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
|
||||
@ -82,14 +79,14 @@ public class ImageExtractionThread extends Thread {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean checkForStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
|
||||
private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
|
||||
|
||||
if (imagesOnCurrentPage.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
|
||||
if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) {
|
||||
if (imageOnPage.getWidth() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.getHeight() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,166 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
/*
|
||||
* This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageProcessingThread extends Thread {
|
||||
|
||||
BlockingQueue<ExtractedImage> imageInputQueue;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (true) {
|
||||
try {
|
||||
final ExtractedImage image = imageInputQueue.take();
|
||||
OcrImage extractedOcrImage = this.process(image);
|
||||
try {
|
||||
imageOutputQueue.put(extractedOcrImage);
|
||||
} catch (InterruptedException e) {
|
||||
imageOutputQueue.put(extractedOcrImage);
|
||||
break;
|
||||
}
|
||||
|
||||
} catch (InterruptedException e) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
log.info("Leaving initial uninterrupted loop!");
|
||||
// empty the queue
|
||||
List<ExtractedImage> remainingImages = new ArrayList<>(imageInputQueue.size());
|
||||
imageInputQueue.drainTo(remainingImages);
|
||||
remainingImages.forEach(image -> {
|
||||
OcrImage ocrImage = this.process(image);
|
||||
try {
|
||||
imageOutputQueue.put(ocrImage);
|
||||
} catch (InterruptedException e) {
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
});
|
||||
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
}
|
||||
|
||||
|
||||
private OcrImage process(ExtractedImage extractedImage) {
|
||||
|
||||
long timestamp = System.currentTimeMillis();
|
||||
float imageDPI = Math.abs(extractedImage.getImage().getWidth() / (extractedImage.getCtm().getScalingFactorX() / 72));
|
||||
|
||||
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = switch (360 - orientDegree) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(pix, 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(pix, 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(pix, 3);
|
||||
default -> pix;
|
||||
};
|
||||
OcrImage extractedOcrImage = new ExtractedOcrImage(extractedImage.getPageNumber(),
|
||||
extractedImage.getNumberOnPage(),
|
||||
extractedImage.getHeight(),
|
||||
extractedImage.getWidth(),
|
||||
extractedImage.getCtm(),
|
||||
rotatedPix,
|
||||
pix.h,
|
||||
pix.w,
|
||||
orientDegree);
|
||||
|
||||
if (pix != rotatedPix) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
|
||||
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
|
||||
|
||||
return extractedOcrImage;
|
||||
}
|
||||
|
||||
|
||||
static public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
|
||||
|
||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
|
||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);
|
||||
|
||||
IntBuffer orientationDegreeResultBuffer;
|
||||
FloatBuffer orientationDegreeConfidenceBuffer;
|
||||
PointerByReference scriptureNameBuffer;
|
||||
FloatBuffer scriptureConfidenceBuffer;
|
||||
|
||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
scriptureNameBuffer = new PointerByReference();
|
||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
|
||||
int orientationDegree = 0;
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
||||
orientationDegreeResultBuffer,
|
||||
orientationDegreeConfidenceBuffer,
|
||||
scriptureNameBuffer,
|
||||
scriptureConfidenceBuffer);
|
||||
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > 10) {
|
||||
orientationDegree = orientationDegreeResultBuffer.get();
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
|
||||
return orientationDegree;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {
|
||||
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix);
|
||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
}
|
||||
@ -47,8 +47,7 @@ public class OCRThread extends Thread {
|
||||
OcrProgressLogger logger;
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
ITessAPI.TessBaseAPI detectionScriptHandle;
|
||||
ITessAPI.TessBaseAPI tesseractHandle;
|
||||
Tesseract2 instance;
|
||||
|
||||
|
||||
public OCRThread(int id,
|
||||
@ -66,8 +65,7 @@ public class OCRThread extends Thread {
|
||||
this.logger = logger;
|
||||
this.stats = stats;
|
||||
this.settings = settings;
|
||||
this.detectionScriptHandle = initDetectionScriptHandle();
|
||||
this.tesseractHandle = initTesseractHandle(settings);
|
||||
this.instance = createInstance(settings);
|
||||
}
|
||||
|
||||
|
||||
@ -92,10 +90,9 @@ public class OCRThread extends Thread {
|
||||
this.process(image);
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("Processed all Images, finishing.");
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
TessAPI1.TessBaseAPIDelete(this.tesseractHandle);
|
||||
log.debug("Executed tesseract on all Images, finishing.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -107,13 +104,8 @@ public class OCRThread extends Thread {
|
||||
|
||||
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
|
||||
|
||||
int orientDegree = detectOrientation(image);
|
||||
image.setRotationDegrees(orientDegree);
|
||||
Pix rotatedPix = image.getRotatedPix();
|
||||
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
|
||||
|
||||
executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);
|
||||
image.destroyPix();
|
||||
LeptUtils.disposePix(rotatedPix);
|
||||
|
||||
results.add(OcrResult.create(image, tesseractOutputFileName));
|
||||
logger.logImageFinished(image, psm);
|
||||
@ -121,67 +113,6 @@ public class OCRThread extends Thread {
|
||||
}
|
||||
|
||||
|
||||
public int detectOrientation(OcrImage image) {
|
||||
|
||||
IntBuffer orientationDegreeResultBuffer;
|
||||
FloatBuffer orientationDegreeConfidenceBuffer;
|
||||
PointerByReference scriptureNameBuffer;
|
||||
FloatBuffer scriptureConfidenceBuffer;
|
||||
|
||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
|
||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
|
||||
|
||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization.
|
||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
scriptureNameBuffer = new PointerByReference();
|
||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
}
|
||||
|
||||
int orient_deg = 0;
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
||||
orientationDegreeResultBuffer,
|
||||
orientationDegreeConfidenceBuffer,
|
||||
scriptureNameBuffer,
|
||||
scriptureConfidenceBuffer);
|
||||
if (result == TRUE) {
|
||||
orient_deg = orientationDegreeResultBuffer.get();
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
|
||||
return orient_deg;
|
||||
}
|
||||
|
||||
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
|
||||
return handle;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) {
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
|
||||
TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0);
|
||||
|
||||
return handle;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
||||
|
||||
@ -192,14 +123,19 @@ public class OCRThread extends Thread {
|
||||
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
|
||||
}
|
||||
|
||||
TessBaseAPISetPageSegMode(tesseractHandle, psm);
|
||||
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
||||
instance.setPageSegMode(psm);
|
||||
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
||||
}
|
||||
|
||||
Tesseract2.createDocumentsWithResults(pix,
|
||||
null,
|
||||
tesseractOutputFileName,
|
||||
List.of(ITesseract.RenderedFormat.HOCR),
|
||||
ITessAPI.TessPageIteratorLevel.RIL_BLOCK,
|
||||
tesseractHandle);
|
||||
|
||||
private static Tesseract2 createInstance(OcrServiceSettings settings) {
|
||||
|
||||
Tesseract2 instance = new Tesseract2();
|
||||
instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
|
||||
instance.setOcrEngineMode(1); // set to LSTM based Engine
|
||||
instance.setLanguage(settings.getLanguages());
|
||||
return instance;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -13,13 +13,13 @@ import lombok.experimental.FieldDefaults;
|
||||
public class OcrServiceSettings {
|
||||
|
||||
int ocrThreadCount = 16; // Number of OCR threads
|
||||
int imageExtractThreadCount = 5; // Number of image extraction threads
|
||||
int gsProcessCount = 5; // Number of Ghostscript processes
|
||||
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||
int gsProcessCount = 2; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
int minImageWidth = 20; // Minimum width for images to be processed
|
||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean debug = true; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean removeWatermark; // If true, watermarks will be removed
|
||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||
|
||||
@ -2,10 +2,16 @@ package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.AlphaComposite;
|
||||
import java.awt.Color;
|
||||
import java.awt.Graphics;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
@ -15,6 +21,22 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@UtilityClass
|
||||
public class ImageProcessingUtils {
|
||||
|
||||
public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {
|
||||
|
||||
BufferedImage image;
|
||||
if (extractedImage.getColorSpace() instanceof PDDeviceRGB || extractedImage.getColorSpace() instanceof PDDeviceGray) {
|
||||
image = extractedImage.getImage();
|
||||
} else {
|
||||
BufferedImage pdfImage = extractedImage.getImage();
|
||||
image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||||
Graphics g = image.getGraphics();
|
||||
g.drawImage(pdfImage, 0, 0, null);
|
||||
g.dispose();
|
||||
}
|
||||
return image;
|
||||
}
|
||||
|
||||
|
||||
public static Pix despecklePix(Pix pix) {
|
||||
|
||||
assert pix.d == 8;
|
||||
@ -23,7 +45,9 @@ public class ImageProcessingUtils {
|
||||
// too small to properly despeckle, just binarize instead.
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
} else {
|
||||
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||
despeckled = LeptUtils.despeckle(pix,
|
||||
LeptUtils.SEL_STR3,
|
||||
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||
if (despeckled == null) {
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
}
|
||||
@ -56,9 +80,8 @@ public class ImageProcessingUtils {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Pix convertToGrayScale(BufferedImage image) {
|
||||
public static Pix convertToGrayScale(Pix pix) {
|
||||
|
||||
Pix pix = LeptUtils.convertImageToPix(image);
|
||||
if (pix.d == 8) {
|
||||
return pix;
|
||||
} else if (pix.d == 32) {
|
||||
|
||||
@ -1,54 +1,45 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.sun.jna.Pointer;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
import net.sourceforge.tess4j.ITesseract;
|
||||
import net.sourceforge.tess4j.OCRResult;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
import net.sourceforge.tess4j.Tesseract1;
|
||||
import net.sourceforge.tess4j.TesseractException;
|
||||
import net.sourceforge.tess4j.Word;
|
||||
|
||||
@Slf4j
|
||||
/**
|
||||
* Overridden version that exists only so Tesseract1 can be used with Pix objects instead of BufferedImages. All functions are copied, with the BufferedImage -> Pix conversion removed.
|
||||
*/
|
||||
@UtilityClass
|
||||
public class Tesseract2 extends TessAPI1 {
|
||||
public class Tesseract2 extends Tesseract1 {
|
||||
|
||||
private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) {
|
||||
|
||||
String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE);
|
||||
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
|
||||
|
||||
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
|
||||
TessResultRendererBeginDocument(renderer, title);
|
||||
int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer);
|
||||
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
|
||||
TessResultRendererEndDocument(renderer);
|
||||
|
||||
// if (result == ITessAPI.FALSE) {
|
||||
// throw new TesseractException("Error during processing page.");
|
||||
// }
|
||||
|
||||
return TessBaseAPIMeanTextConf(handle);
|
||||
return TessBaseAPIMeanTextConf(getHandle());
|
||||
}
|
||||
|
||||
|
||||
public OCRResult createDocumentsWithResults(Pix bi,
|
||||
String filename,
|
||||
String outputbase,
|
||||
List<ITesseract.RenderedFormat> formats,
|
||||
int pageIteratorLevel,
|
||||
ITessAPI.TessBaseAPI handle) {
|
||||
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
|
||||
|
||||
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle);
|
||||
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
|
||||
if (!results.isEmpty()) {
|
||||
return results.get(0);
|
||||
} else {
|
||||
@ -57,26 +48,24 @@ public class Tesseract2 extends TessAPI1 {
|
||||
}
|
||||
|
||||
|
||||
public List<OCRResult> createDocumentsWithResults(Pix[] pixs,
|
||||
String[] filenames,
|
||||
String[] outputbases,
|
||||
List<ITesseract.RenderedFormat> formats,
|
||||
int pageIteratorLevel,
|
||||
ITessAPI.TessBaseAPI handle) {
|
||||
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
|
||||
|
||||
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
|
||||
throw new RuntimeException("The three arrays must match in length.");
|
||||
}
|
||||
|
||||
init();
|
||||
setVariables();
|
||||
|
||||
List<OCRResult> results = new ArrayList<OCRResult>();
|
||||
|
||||
try {
|
||||
for (int i = 0; i < pixs.length; i++) {
|
||||
try {
|
||||
ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer);
|
||||
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
|
||||
TessDeleteResultRenderer(renderer);
|
||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList<Word>();
|
||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
|
||||
results.add(new OCRResult(meanTextConfidence, words));
|
||||
} catch (Exception e) {
|
||||
// skip the problematic image file
|
||||
@ -84,22 +73,20 @@ public class Tesseract2 extends TessAPI1 {
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
synchronized (OCRThread.class) {
|
||||
TessAPI1.TessBaseAPIClear(handle);
|
||||
}
|
||||
dispose();
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
private List<Word> getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) {
|
||||
private List<Word> getRecognizedWords(int pageIteratorLevel) {
|
||||
|
||||
List<Word> words = new ArrayList<>();
|
||||
|
||||
try {
|
||||
ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle);
|
||||
ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
|
||||
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||
TessPageIteratorBegin(pi);
|
||||
|
||||
do {
|
||||
@ -132,11 +119,11 @@ public class Tesseract2 extends TessAPI1 {
|
||||
}
|
||||
|
||||
|
||||
private ITessAPI.TessResultRenderer createRenderers(String outputbase, List<ITesseract.RenderedFormat> formats) {
|
||||
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
|
||||
|
||||
ITessAPI.TessResultRenderer renderer = null;
|
||||
TessResultRenderer renderer = null;
|
||||
|
||||
for (ITesseract.RenderedFormat format : formats) {
|
||||
for (RenderedFormat format : formats) {
|
||||
switch (format) {
|
||||
|
||||
case HOCR:
|
||||
|
||||
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcr() {
|
||||
|
||||
String text = testOCR("files/VV-352892.pdf");
|
||||
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
||||
}
|
||||
|
||||
|
||||
@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
|
||||
List<File> foundFiles = Files.walk(Path.of(dir))
|
||||
// .sorted(Comparator.comparingLong(this::getFileSize))
|
||||
.sorted(Comparator.comparingLong(this::getFileSize))
|
||||
.map(Path::toFile)
|
||||
.filter(file -> file.getName().endsWith(".pdf"))
|
||||
.peek(System.out::println)
|
||||
@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcrForSpecificFile() {
|
||||
|
||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf"));
|
||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user