RED-7669: optimize OCR-module performance
* binarize images after reading
This commit is contained in:
parent
6f99664906
commit
bb5b4a2fd8
@ -0,0 +1,27 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Immutable holder for a single image taken from a PDF page together with the metadata needed to
 * turn it into an {@code ExtractedOcrImage} later on.
 * <p>
 * Lombok generates the getters and an all-fields constructor whose parameter order follows the
 * field declaration order below.
 * <p>
 * NOTE(review): within the same change, ImageStreamEngine constructs this class with seven
 * arguments (no QuadPoint) and ImageExtractionThread calls getImageCoordinatesInInitialUserSpace(),
 * which is not declared here - confirm that the declaration and the call sites agree.
 */
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ExtractedImage {
|
||||
|
||||
// Number of the page the image was found on (presumably 1-based, matching the page indices used by the extraction thread - TODO confirm).
int pageNumber;
|
||||
// Presumably the quadrilateral occupied by the image on the page - TODO confirm coordinate space.
QuadPoint position;
|
||||
// Image height; populated from the PDF image XObject by the stream engine.
int height;
|
||||
// Image width; populated from the PDF image XObject by the stream engine.
int width;
|
||||
// Decoded raster of the image; this is what ExtractedOcrImage binarizes.
BufferedImage image;
|
||||
// Transformation matrix that was current when the image was drawn; its X scaling factor is used to derive the image DPI.
Matrix ctm;
|
||||
// Running number of the image on its page (the stream engine passes the count of images collected so far).
int numberOnPage;
|
||||
// PDFBox color space reported by the image XObject.
PDColorSpace colorSpace;
|
||||
|
||||
}
|
||||
@ -11,6 +11,8 @@ import org.apache.pdfbox.util.Matrix;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.sun.jna.StringArray;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
@ -56,6 +58,19 @@ public class ExtractedOcrImage implements OcrImage {
|
||||
}
|
||||
|
||||
|
||||
public ExtractedOcrImage(ExtractedImage image, int targetDpi) {
|
||||
this.pageNumber = image.getPageNumber();
|
||||
this.numberOnPage = image.getNumberOnPage();
|
||||
this.ctm = image.getCtm();
|
||||
this.originalHeight = image.getImage().getHeight();
|
||||
this.originalWidth = image.getImage().getWidth();
|
||||
float imageDPI = Math.abs(image.getImage().getWidth() / (ctm.getScalingFactorX() / 72));
|
||||
this.pix = binarize(image.getImage(), imageDPI, targetDpi);
|
||||
this.height = pix.h;
|
||||
this.width = pix.w;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
|
||||
|
||||
|
||||
@ -2,10 +2,12 @@ package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@ -62,6 +64,20 @@ public interface OcrImage {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default BufferedImage getBufferedImage() {
|
||||
|
||||
return LeptUtils.convertPixToImage(getPix());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default BufferedImage getRotatedBufferedImage() {
|
||||
|
||||
return LeptUtils.convertPixToImage(getRotatedPix());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotation degree of the OCR image.
|
||||
*
|
||||
|
||||
@ -97,4 +97,10 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
||||
d().getY());
|
||||
}
|
||||
|
||||
|
||||
public double size() {
|
||||
|
||||
return a().distance(b()) * a().distance(d());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -24,6 +24,7 @@ import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
@ -34,7 +35,7 @@ import lombok.SneakyThrows;
|
||||
public class ImageStreamEngine extends PDFStreamEngine {
|
||||
|
||||
private ExtractedOcrImage currentImageOnPage;
|
||||
private List<ExtractedOcrImage> imagesOnCurrentPage;
|
||||
private List<ExtractedImage> imagesOnCurrentPage;
|
||||
private OcrServiceSettings settings;
|
||||
private int pageNum;
|
||||
|
||||
@ -69,21 +70,14 @@ public class ImageStreamEngine extends PDFStreamEngine {
|
||||
}
|
||||
|
||||
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
||||
if (imageXObject.getColorSpace() instanceof PDDeviceRGB) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
} else if (imageXObject.getColorSpace() instanceof PDDeviceGray) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
} else {
|
||||
BufferedImage pdfImage = imageXObject.getImage();
|
||||
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||||
Graphics g = image.getGraphics();
|
||||
g.drawImage(pdfImage, 0, 0, null);
|
||||
g.dispose();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
}
|
||||
this.imagesOnCurrentPage.add(this.currentImageOnPage);
|
||||
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
|
||||
imageXObject.getHeight(),
|
||||
imageXObject.getWidth(),
|
||||
imageXObject.getImage(),
|
||||
imageCTM,
|
||||
imagesOnCurrentPage.size(),
|
||||
imageXObject.getColorSpace()));
|
||||
|
||||
//imagesOnPages.add(this.currentImageOnPage);
|
||||
} else if (xobject instanceof PDFormXObject) {
|
||||
PDFormXObject form = (PDFormXObject) xobject;
|
||||
|
||||
@ -107,7 +107,7 @@ public class OCRService {
|
||||
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
|
||||
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
|
||||
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads);
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads);
|
||||
|
||||
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
|
||||
documentFile,
|
||||
|
||||
@ -5,10 +5,10 @@ import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
||||
@ -26,6 +26,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageExtractionThread extends Thread {
|
||||
|
||||
static double FULL_PAGE_IMAGE_THRESHOLD = 0.98;
|
||||
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
|
||||
|
||||
int id;
|
||||
@ -40,6 +41,7 @@ public class ImageExtractionThread extends Thread {
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
@ -48,20 +50,21 @@ public class ImageExtractionThread extends Thread {
|
||||
for (Integer pageIndex : pageIndices) {
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
|
||||
timestamp = System.currentTimeMillis();
|
||||
List<ExtractedOcrImage> extractedOcrImages = getExtractedOcrImages(pageIndex, document);
|
||||
List<ExtractedImage> extractedImages = getExtractedOcrImages(pageIndex, document);
|
||||
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
|
||||
if (extractedOcrImages.isEmpty()) {
|
||||
if (extractedImages.isEmpty()) {
|
||||
logger.logPageSkipped(pageIndex);
|
||||
}
|
||||
|
||||
if (checkForStitchedImages(extractedOcrImages)) {
|
||||
if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
|
||||
stitchedPageNumbers.add(pageIndex);
|
||||
logger.addImagesToProcess(pageIndex, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (ExtractedOcrImage image : extractedOcrImages) {
|
||||
imageOutputQueue.put(image);
|
||||
for (ExtractedImage image : extractedImages) {
|
||||
ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi());
|
||||
imageOutputQueue.put(ocrImage);
|
||||
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
|
||||
}
|
||||
}
|
||||
@ -69,7 +72,7 @@ public class ImageExtractionThread extends Thread {
|
||||
}
|
||||
|
||||
|
||||
private List<ExtractedOcrImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
|
||||
private List<ExtractedImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
|
||||
|
||||
PDPage page = document.getPage(pageIndex - 1);
|
||||
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
|
||||
@ -79,22 +82,22 @@ public class ImageExtractionThread extends Thread {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean checkForStitchedImages(List<ExtractedOcrImage> imagesOnCurrentPage) {
|
||||
private boolean checkForStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
|
||||
|
||||
if (imagesOnCurrentPage.size() <= 1) {
|
||||
if (imagesOnCurrentPage.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//checking for intersections or direct alignment of images
|
||||
ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()];
|
||||
int index = 0;
|
||||
for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) {
|
||||
imageOnPagesArray[index] = imageOnPage;
|
||||
index++;
|
||||
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
|
||||
if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
for (int j = 0; j < imageOnPagesArray.length; j++) {
|
||||
for (int i = j + 1; i < imageOnPagesArray.length; i++) {
|
||||
if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
|
||||
|
||||
//checking for intersections or direct alignment of images
|
||||
for (int j = 0; j < imagesOnCurrentPage.size(); j++) {
|
||||
for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) {
|
||||
if (imagesOnCurrentPage.get(j).getImageCoordinatesInInitialUserSpace().aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
|
||||
// TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -1,6 +1,10 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.FloatBuffer;
|
||||
@ -16,6 +20,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
|
||||
import com.sun.jna.StringArray;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -42,8 +47,8 @@ public class OCRThread extends Thread {
|
||||
OcrProgressLogger logger;
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
Tesseract2 instance;
|
||||
ITessAPI.TessBaseAPI detectionScriptHandle;
|
||||
ITessAPI.TessBaseAPI tesseractHandle;
|
||||
|
||||
|
||||
public OCRThread(int id,
|
||||
@ -61,8 +66,8 @@ public class OCRThread extends Thread {
|
||||
this.logger = logger;
|
||||
this.stats = stats;
|
||||
this.settings = settings;
|
||||
this.instance = createInstance(settings);
|
||||
this.detectionScriptHandle = initDetectionScriptHandle();
|
||||
this.tesseractHandle = initTesseractHandle(settings);
|
||||
}
|
||||
|
||||
|
||||
@ -88,9 +93,9 @@ public class OCRThread extends Thread {
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("Processed all Images, finishing.");
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
TessAPI1.TessBaseAPIDelete(this.tesseractHandle);
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
}
|
||||
|
||||
|
||||
@ -107,10 +112,8 @@ public class OCRThread extends Thread {
|
||||
Pix rotatedPix = image.getRotatedPix();
|
||||
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
image.destroyPix();
|
||||
LeptUtils.disposePix(rotatedPix);
|
||||
}
|
||||
image.destroyPix();
|
||||
LeptUtils.disposePix(rotatedPix);
|
||||
|
||||
results.add(OcrResult.create(image, tesseractOutputFileName));
|
||||
logger.logImageFinished(image, psm);
|
||||
@ -145,21 +148,37 @@ public class OCRThread extends Thread {
|
||||
orient_deg = orientationDegreeResultBuffer.get();
|
||||
}
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
}
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
|
||||
return orient_deg;
|
||||
}
|
||||
|
||||
|
||||
synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
synchronized (OCRThread.class) {
|
||||
|
||||
return handle;
|
||||
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
|
||||
return handle;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
synchronized private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) {
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
|
||||
TessBaseAPIInit1(handle, datapath, settings.getLanguages(), 1, new PointerByReference(), 0);
|
||||
|
||||
return handle;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -173,19 +192,14 @@ public class OCRThread extends Thread {
|
||||
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
|
||||
}
|
||||
|
||||
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
||||
instance.setPageSegMode(psm);
|
||||
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
||||
}
|
||||
TessBaseAPISetPageSegMode(tesseractHandle, psm);
|
||||
|
||||
|
||||
private static Tesseract2 createInstance(OcrServiceSettings settings) {
|
||||
|
||||
Tesseract2 instance = new Tesseract2();
|
||||
instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
|
||||
instance.setOcrEngineMode(1); // set to LSTM based Engine
|
||||
instance.setLanguage(settings.getLanguages());
|
||||
return instance;
|
||||
Tesseract2.createDocumentsWithResults(pix,
|
||||
null,
|
||||
tesseractOutputFileName,
|
||||
List.of(ITesseract.RenderedFormat.HOCR),
|
||||
ITessAPI.TessPageIteratorLevel.RIL_BLOCK,
|
||||
tesseractHandle);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class OcrServiceSettings {
|
||||
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||
int gsProcessCount = 2; // Number of Ghostscript processes
|
||||
int ocrThreadCount = 16; // Number of OCR threads
|
||||
int imageExtractThreadCount = 5; // Number of image extraction threads
|
||||
int gsProcessCount = 5; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
|
||||
@ -1,45 +1,54 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.sun.jna.Pointer;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
import net.sourceforge.tess4j.ITesseract;
|
||||
import net.sourceforge.tess4j.OCRResult;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
import net.sourceforge.tess4j.Tesseract1;
|
||||
import net.sourceforge.tess4j.TesseractException;
|
||||
import net.sourceforge.tess4j.Word;
|
||||
|
||||
@Slf4j
|
||||
/**
|
||||
* Overridden version that exists only so that Tesseract1 can be used with Pix objects instead of BufferedImages. All functions are copied, with the BufferedImage -> Pix conversion removed.
|
||||
*/
|
||||
public class Tesseract2 extends Tesseract1 {
|
||||
@UtilityClass
|
||||
public class Tesseract2 extends TessAPI1 {
|
||||
|
||||
private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) {
|
||||
|
||||
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
|
||||
|
||||
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
|
||||
String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE);
|
||||
TessResultRendererBeginDocument(renderer, title);
|
||||
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
|
||||
int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer);
|
||||
TessResultRendererEndDocument(renderer);
|
||||
|
||||
// if (result == ITessAPI.FALSE) {
|
||||
// throw new TesseractException("Error during processing page.");
|
||||
// }
|
||||
|
||||
return TessBaseAPIMeanTextConf(getHandle());
|
||||
return TessBaseAPIMeanTextConf(handle);
|
||||
}
|
||||
|
||||
|
||||
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
|
||||
public OCRResult createDocumentsWithResults(Pix bi,
|
||||
String filename,
|
||||
String outputbase,
|
||||
List<ITesseract.RenderedFormat> formats,
|
||||
int pageIteratorLevel,
|
||||
ITessAPI.TessBaseAPI handle) {
|
||||
|
||||
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
|
||||
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle);
|
||||
if (!results.isEmpty()) {
|
||||
return results.get(0);
|
||||
} else {
|
||||
@ -48,24 +57,26 @@ public class Tesseract2 extends Tesseract1 {
|
||||
}
|
||||
|
||||
|
||||
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
|
||||
public List<OCRResult> createDocumentsWithResults(Pix[] pixs,
|
||||
String[] filenames,
|
||||
String[] outputbases,
|
||||
List<ITesseract.RenderedFormat> formats,
|
||||
int pageIteratorLevel,
|
||||
ITessAPI.TessBaseAPI handle) {
|
||||
|
||||
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
|
||||
throw new RuntimeException("The three arrays must match in length.");
|
||||
}
|
||||
|
||||
init();
|
||||
setVariables();
|
||||
|
||||
List<OCRResult> results = new ArrayList<OCRResult>();
|
||||
|
||||
try {
|
||||
for (int i = 0; i < pixs.length; i++) {
|
||||
try {
|
||||
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
|
||||
ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer);
|
||||
TessDeleteResultRenderer(renderer);
|
||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
|
||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList<Word>();
|
||||
results.add(new OCRResult(meanTextConfidence, words));
|
||||
} catch (Exception e) {
|
||||
// skip the problematic image file
|
||||
@ -73,20 +84,22 @@ public class Tesseract2 extends Tesseract1 {
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
dispose();
|
||||
synchronized (OCRThread.class) {
|
||||
TessAPI1.TessBaseAPIClear(handle);
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
private List<Word> getRecognizedWords(int pageIteratorLevel) {
|
||||
private List<Word> getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) {
|
||||
|
||||
List<Word> words = new ArrayList<>();
|
||||
|
||||
try {
|
||||
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
|
||||
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||
ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle);
|
||||
ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||
TessPageIteratorBegin(pi);
|
||||
|
||||
do {
|
||||
@ -119,11 +132,11 @@ public class Tesseract2 extends Tesseract1 {
|
||||
}
|
||||
|
||||
|
||||
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
|
||||
private ITessAPI.TessResultRenderer createRenderers(String outputbase, List<ITesseract.RenderedFormat> formats) {
|
||||
|
||||
TessResultRenderer renderer = null;
|
||||
ITessAPI.TessResultRenderer renderer = null;
|
||||
|
||||
for (RenderedFormat format : formats) {
|
||||
for (ITesseract.RenderedFormat format : formats) {
|
||||
switch (format) {
|
||||
|
||||
case HOCR:
|
||||
|
||||
@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
|
||||
//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
|
||||
@SpringBootTest()
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcr() {
|
||||
|
||||
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
||||
String text = testOCR("files/VV-352892.pdf");
|
||||
}
|
||||
|
||||
|
||||
@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
|
||||
List<File> foundFiles = Files.walk(Path.of(dir))
|
||||
.sorted(Comparator.comparingLong(this::getFileSize))
|
||||
// .sorted(Comparator.comparingLong(this::getFileSize))
|
||||
.map(Path::toFile)
|
||||
.filter(file -> file.getName().endsWith(".pdf"))
|
||||
.peek(System.out::println)
|
||||
@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcrForSpecificFile() {
|
||||
|
||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
|
||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user