RED-7669: optimize OCR-module performance

* binarize images after reading
This commit is contained in:
Kilian Schuettler 2023-11-21 14:37:18 +01:00
parent 6f99664906
commit bb5b4a2fd8
11 changed files with 182 additions and 94 deletions

View File

@ -0,0 +1,27 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.util.Matrix;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Immutable holder for a raw image found on a PDF page, together with the placement data that was
 * captured alongside it. Instances carry the un-binarized {@link BufferedImage}; conversion into
 * an OCR-ready image happens later.
 *
 * <p>All fields are made {@code private final} by {@code @FieldDefaults}. The constructor is
 * generated by {@code @RequiredArgsConstructor} and takes every field in declaration order, so
 * reordering the fields changes the constructor signature.
 *
 * <p>NOTE(review): the generated constructor expects eight arguments including {@code position};
 * verify that all call sites pass a {@link QuadPoint}.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ExtractedImage {

    /** Number of the page the image was found on. */
    int pageNumber;

    /** Quadrilateral describing where the image is placed. */
    QuadPoint position;

    /** Height reported for the image. */
    int height;

    /** Width reported for the image. */
    int width;

    /** Decoded raster data of the image. */
    BufferedImage image;

    /** Transformation matrix that was associated with the image when it was extracted. */
    Matrix ctm;

    /** Running index of the image among the images collected for the same page. */
    int numberOnPage;

    /** Color space reported for the image. */
    PDColorSpace colorSpace;


    /**
     * Returns the placement quadrilateral under the accessor name used for OCR images, so that
     * code written against that name (for example the stitched-image check) also works with raw
     * extracted images.
     *
     * <p>NOTE(review): assumes {@code position} is already expressed in initial user space -
     * confirm against the code that constructs this object.
     *
     * @return the same {@link QuadPoint} as {@code getPosition()}
     */
    public QuadPoint getImageCoordinatesInInitialUserSpace() {

        return position;
    }

}

View File

@ -11,6 +11,8 @@ import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.pdftron.sdf.Obj;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.Getter;
@ -56,6 +58,19 @@ public class ExtractedOcrImage implements OcrImage {
}
/**
 * Builds an OCR image from a raw extracted image by binarizing its raster data.
 *
 * <p>The source resolution is derived from the pixel width of the raster and the horizontal
 * scaling factor of the image's transformation matrix, divided by 72 (presumably PDF user-space
 * units per inch). The resulting pix determines {@code height} and {@code width}.
 *
 * @param image     raw extracted image providing raster data, page info and transformation matrix
 * @param targetDpi resolution handed to the binarization step
 */
public ExtractedOcrImage(ExtractedImage image, int targetDpi) {

    BufferedImage raster = image.getImage();

    this.pageNumber = image.getPageNumber();
    this.numberOnPage = image.getNumberOnPage();
    this.ctm = image.getCtm();
    this.originalHeight = raster.getHeight();
    this.originalWidth = raster.getWidth();

    // pixel width divided by the placed width in inches yields the effective source DPI
    float sourceDpi = Math.abs(raster.getWidth() / (ctm.getScalingFactorX() / 72));

    this.pix = binarize(raster, sourceDpi, targetDpi);
    this.height = pix.h;
    this.width = pix.w;
}
@SneakyThrows
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {

View File

@ -2,10 +2,12 @@ package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
import lombok.SneakyThrows;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@ -62,6 +64,20 @@ public interface OcrImage {
}
/**
 * Converts the pix returned by {@code getPix()} into a {@link BufferedImage}.
 *
 * @return the converted image; checked exceptions from the conversion are rethrown unchecked
 */
@SneakyThrows
default BufferedImage getBufferedImage() {

    var sourcePix = getPix();
    return LeptUtils.convertPixToImage(sourcePix);
}
/**
 * Converts the pix returned by {@code getRotatedPix()} into a {@link BufferedImage}.
 *
 * @return the converted image; checked exceptions from the conversion are rethrown unchecked
 */
@SneakyThrows
default BufferedImage getRotatedBufferedImage() {

    var rotatedPix = getRotatedPix();
    return LeptUtils.convertPixToImage(rotatedPix);
}
/**
* Retrieves the rotation degree of the OCR image.
*

View File

@ -97,4 +97,10 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
d().getY());
}
/**
 * Computes the product of the lengths of the edges a-b and a-d.
 * For a rectangular quad this equals its area.
 *
 * @return length of edge a-b multiplied by length of edge a-d
 */
public double size() {

    double edgeAb = a().distance(b());
    double edgeAd = a().distance(d());
    return edgeAb * edgeAd;
}
}

View File

@ -24,6 +24,7 @@ import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
@ -34,7 +35,7 @@ import lombok.SneakyThrows;
public class ImageStreamEngine extends PDFStreamEngine {
private ExtractedOcrImage currentImageOnPage;
private List<ExtractedOcrImage> imagesOnCurrentPage;
private List<ExtractedImage> imagesOnCurrentPage;
private OcrServiceSettings settings;
private int pageNum;
@ -69,21 +70,14 @@ public class ImageStreamEngine extends PDFStreamEngine {
}
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
if (imageXObject.getColorSpace() instanceof PDDeviceRGB) {
BufferedImage image = imageXObject.getImage();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
} else if (imageXObject.getColorSpace() instanceof PDDeviceGray) {
BufferedImage image = imageXObject.getImage();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
} else {
BufferedImage pdfImage = imageXObject.getImage();
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
Graphics g = image.getGraphics();
g.drawImage(pdfImage, 0, 0, null);
g.dispose();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
}
this.imagesOnCurrentPage.add(this.currentImageOnPage);
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
imageXObject.getHeight(),
imageXObject.getWidth(),
imageXObject.getImage(),
imageCTM,
imagesOnCurrentPage.size(),
imageXObject.getColorSpace()));
//imagesOnPages.add(this.currentImageOnPage);
} else if (xobject instanceof PDFormXObject) {
PDFormXObject form = (PDFormXObject) xobject;

View File

@ -107,7 +107,7 @@ public class OCRService {
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads);
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(2 * numberOfOcrThreads);
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
documentFile,

View File

@ -5,10 +5,10 @@ import java.util.List;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
@ -26,6 +26,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageExtractionThread extends Thread {
static double FULL_PAGE_IMAGE_THRESHOLD = 0.98;
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
int id;
@ -40,6 +41,7 @@ public class ImageExtractionThread extends Thread {
BlockingQueue<OcrImage> imageOutputQueue;
List<Integer> stitchedPageNumbers;
@SneakyThrows
@Override
public void run() {
@ -48,20 +50,21 @@ public class ImageExtractionThread extends Thread {
for (Integer pageIndex : pageIndices) {
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
timestamp = System.currentTimeMillis();
List<ExtractedOcrImage> extractedOcrImages = getExtractedOcrImages(pageIndex, document);
List<ExtractedImage> extractedImages = getExtractedOcrImages(pageIndex, document);
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
if (extractedOcrImages.isEmpty()) {
if (extractedImages.isEmpty()) {
logger.logPageSkipped(pageIndex);
}
if (checkForStitchedImages(extractedOcrImages)) {
if (checkForStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
stitchedPageNumbers.add(pageIndex);
logger.addImagesToProcess(pageIndex, 0);
continue;
}
for (ExtractedOcrImage image : extractedOcrImages) {
imageOutputQueue.put(image);
for (ExtractedImage image : extractedImages) {
ExtractedOcrImage ocrImage = new ExtractedOcrImage(image, settings.getDpi());
imageOutputQueue.put(ocrImage);
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
}
}
@ -69,7 +72,7 @@ public class ImageExtractionThread extends Thread {
}
private List<ExtractedOcrImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
private List<ExtractedImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
PDPage page = document.getPage(pageIndex - 1);
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
@ -79,22 +82,22 @@ public class ImageExtractionThread extends Thread {
@SneakyThrows
private boolean checkForStitchedImages(List<ExtractedOcrImage> imagesOnCurrentPage) {
private boolean checkForStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
if (imagesOnCurrentPage.size() <= 1) {
if (imagesOnCurrentPage.isEmpty()) {
return false;
}
//checking for intersections or direct alignment of images
ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()];
int index = 0;
for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) {
imageOnPagesArray[index] = imageOnPage;
index++;
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
if (imageOnPage.getImageCoordinatesInInitialUserSpace().size() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight() * page.getCropBox().getWidth()) {
return true;
}
}
for (int j = 0; j < imageOnPagesArray.length; j++) {
for (int i = j + 1; i < imageOnPagesArray.length; i++) {
if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
//checking for intersections or direct alignment of images
for (int j = 0; j < imagesOnCurrentPage.size(); j++) {
for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) {
if (imagesOnCurrentPage.get(j).getImageCoordinatesInInitialUserSpace().aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
// TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
return true;
}

View File

@ -1,6 +1,10 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable;
import java.io.File;
import java.nio.FloatBuffer;
@ -16,6 +20,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
@ -42,8 +47,8 @@ public class OCRThread extends Thread {
OcrProgressLogger logger;
Statistics stats;
OcrServiceSettings settings;
Tesseract2 instance;
ITessAPI.TessBaseAPI detectionScriptHandle;
ITessAPI.TessBaseAPI tesseractHandle;
public OCRThread(int id,
@ -61,8 +66,8 @@ public class OCRThread extends Thread {
this.logger = logger;
this.stats = stats;
this.settings = settings;
this.instance = createInstance(settings);
this.detectionScriptHandle = initDetectionScriptHandle();
this.tesseractHandle = initTesseractHandle(settings);
}
@ -88,9 +93,9 @@ public class OCRThread extends Thread {
}
} catch (NoSuchElementException e) {
log.debug("Processed all Images, finishing.");
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
TessAPI1.TessBaseAPIDelete(this.tesseractHandle);
}
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
}
@ -107,10 +112,8 @@ public class OCRThread extends Thread {
Pix rotatedPix = image.getRotatedPix();
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
synchronized (OCRThread.class) {
image.destroyPix();
LeptUtils.disposePix(rotatedPix);
}
image.destroyPix();
LeptUtils.disposePix(rotatedPix);
results.add(OcrResult.create(image, tesseractOutputFileName));
logger.logImageFinished(image, psm);
@ -145,21 +148,37 @@ public class OCRThread extends Thread {
orient_deg = orientationDegreeResultBuffer.get();
}
synchronized (OCRThread.class) {
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
}
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
return orient_deg;
}
synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
String datapath = System.getenv("TESSDATA_PREFIX");
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
synchronized (OCRThread.class) {
return handle;
ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
String datapath = System.getenv("TESSDATA_PREFIX");
// TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
return handle;
}
}
/**
 * Creates a Tesseract base API handle and initialises it for the configured languages.
 *
 * <p>Creation and initialisation run while holding the {@code OCRThread.class} monitor. The
 * former {@code synchronized} method modifier locked the same monitor as the inner block and
 * was therefore redundant.
 *
 * @param settings OCR settings providing the language string passed to Tesseract
 * @return an initialised handle; the caller is responsible for deleting it
 * @throws IllegalStateException if Tesseract reports a non-zero result from initialisation
 */
private static ITessAPI.TessBaseAPI initTesseractHandle(OcrServiceSettings settings) {

    // engine mode 1 selects the LSTM based engine
    final int lstmEngineMode = 1;
    String datapath = System.getenv("TESSDATA_PREFIX");

    synchronized (OCRThread.class) {
        ITessAPI.TessBaseAPI handle = TessBaseAPICreate();
        // TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
        int initResult = TessBaseAPIInit1(handle, datapath, settings.getLanguages(), lstmEngineMode, new PointerByReference(), 0);
        if (initResult != 0) {
            // release the native handle instead of handing out an unusable one
            TessAPI1.TessBaseAPIDelete(handle);
            throw new IllegalStateException("Tesseract initialisation failed with code " + initResult + " (datapath=" + datapath + ", languages=" + settings.getLanguages() + ")");
        }
        return handle;
    }
}
@ -173,19 +192,14 @@ public class OCRThread extends Thread {
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
}
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
instance.setPageSegMode(psm);
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
}
TessBaseAPISetPageSegMode(tesseractHandle, psm);
private static Tesseract2 createInstance(OcrServiceSettings settings) {
Tesseract2 instance = new Tesseract2();
instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
instance.setOcrEngineMode(1); // set to LSTM based Engine
instance.setLanguage(settings.getLanguages());
return instance;
Tesseract2.createDocumentsWithResults(pix,
null,
tesseractOutputFileName,
List.of(ITesseract.RenderedFormat.HOCR),
ITessAPI.TessPageIteratorLevel.RIL_BLOCK,
tesseractHandle);
}
}

View File

@ -12,9 +12,9 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class OcrServiceSettings {
int ocrThreadCount = 4; // Number of OCR threads
int imageExtractThreadCount = 2; // Number of image extraction threads
int gsProcessCount = 2; // Number of Ghostscript processes
int ocrThreadCount = 16; // Number of OCR threads
int imageExtractThreadCount = 5; // Number of image extraction threads
int gsProcessCount = 5; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed

View File

@ -1,45 +1,54 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import static net.sourceforge.tess4j.ITesseract.DOCUMENT_TITLE;
import java.awt.Rectangle;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.sun.jna.Pointer;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.OCRResult;
import net.sourceforge.tess4j.TessAPI1;
import net.sourceforge.tess4j.Tesseract1;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word;
@Slf4j
/**
* Overridden version that exists only so Tesseract1 can be used with Pix instead of BufferedImage. All functions are copied, with the BufferedImage -> Pix conversion removed.
*/
public class Tesseract2 extends Tesseract1 {
@UtilityClass
public class Tesseract2 extends TessAPI1 {
private int createDocuments(Pix pix, String filename, ITessAPI.TessBaseAPI handle, ITessAPI.TessResultRenderer renderer) {
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
String title = TessBaseAPIGetStringVariable(handle, DOCUMENT_TITLE);
TessResultRendererBeginDocument(renderer, title);
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
int result = TessBaseAPIProcessPage(handle, pix, 0, filename, null, 0, renderer);
TessResultRendererEndDocument(renderer);
// if (result == ITessAPI.FALSE) {
// throw new TesseractException("Error during processing page.");
// }
return TessBaseAPIMeanTextConf(getHandle());
return TessBaseAPIMeanTextConf(handle);
}
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
public OCRResult createDocumentsWithResults(Pix bi,
String filename,
String outputbase,
List<ITesseract.RenderedFormat> formats,
int pageIteratorLevel,
ITessAPI.TessBaseAPI handle) {
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel, handle);
if (!results.isEmpty()) {
return results.get(0);
} else {
@ -48,24 +57,26 @@ public class Tesseract2 extends Tesseract1 {
}
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
public List<OCRResult> createDocumentsWithResults(Pix[] pixs,
String[] filenames,
String[] outputbases,
List<ITesseract.RenderedFormat> formats,
int pageIteratorLevel,
ITessAPI.TessBaseAPI handle) {
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
throw new RuntimeException("The three arrays must match in length.");
}
init();
setVariables();
List<OCRResult> results = new ArrayList<OCRResult>();
try {
for (int i = 0; i < pixs.length; i++) {
try {
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats);
int meanTextConfidence = createDocuments(pixs[i], filenames[i], handle, renderer);
TessDeleteResultRenderer(renderer);
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel, handle) : new ArrayList<Word>();
results.add(new OCRResult(meanTextConfidence, words));
} catch (Exception e) {
// skip the problematic image file
@ -73,20 +84,22 @@ public class Tesseract2 extends Tesseract1 {
}
}
} finally {
dispose();
synchronized (OCRThread.class) {
TessAPI1.TessBaseAPIClear(handle);
}
}
return results;
}
private List<Word> getRecognizedWords(int pageIteratorLevel) {
private List<Word> getRecognizedWords(int pageIteratorLevel, ITessAPI.TessBaseAPI handle) {
List<Word> words = new ArrayList<>();
try {
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(handle);
ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
TessPageIteratorBegin(pi);
do {
@ -119,11 +132,11 @@ public class Tesseract2 extends Tesseract1 {
}
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
private ITessAPI.TessResultRenderer createRenderers(String outputbase, List<ITesseract.RenderedFormat> formats) {
TessResultRenderer renderer = null;
ITessAPI.TessResultRenderer renderer = null;
for (RenderedFormat format : formats) {
for (ITesseract.RenderedFormat format : formats) {
switch (format) {
case HOCR:

View File

@ -31,7 +31,7 @@ import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows;
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
@SpringBootTest()
public class OcrServiceIntegrationTest extends AbstractTest {
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcr() {
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
String text = testOCR("files/VV-352892.pdf");
}
@ -139,7 +139,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
List<File> foundFiles = Files.walk(Path.of(dir))
.sorted(Comparator.comparingLong(this::getFileSize))
// .sorted(Comparator.comparingLong(this::getFileSize))
.map(Path::toFile)
.filter(file -> file.getName().endsWith(".pdf"))
.peek(System.out::println)
@ -162,7 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrForSpecificFile() {
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/Item 17_Toxicidade Inalatoria.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));