RED-7669: optimize OCR-module performance

This commit is contained in:
Kilian Schüttler 2023-11-20 09:55:48 +01:00
parent aa45fa84bb
commit 759bae6499
7 changed files with 217 additions and 80 deletions

View File

@ -8,9 +8,14 @@ import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.nio.IntBuffer;
import java.util.concurrent.Semaphore;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.pdftron.sdf.Obj;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
@ -41,8 +46,9 @@ public class ExtractedOcrImage implements OcrImage {
@Setter
int rotationDegrees;
@SneakyThrows
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi, boolean isGray) {
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) {
this.pageNumber = pageNumber;
this.numberOnPage = numberOnPage;
@ -50,72 +56,21 @@ public class ExtractedOcrImage implements OcrImage {
this.originalHeight = bufferedImage.getHeight();
this.originalWidth = bufferedImage.getWidth();
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
this.pix = binarize(bufferedImage, imageDPI, targetDpi, isGray);
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
this.height = pix.h;
this.width = pix.w;
}
@SneakyThrows
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi, boolean isGray) {
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
setAlphaChannelToWhite(image);
Pix grayScale = convertToGrayScale(image, isGray);
Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale);
Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3);
LeptUtils.disposePix(scaledUp);
return despeckled;
}
ImageProcessingUtils.setAlphaChannelToWhite(image);
private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
Pix scaledUp;
float targetFactor = targetDpi / imageDpi;
if (targetFactor > 3) {
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
LeptUtils.disposePix(grayScale);
} else if (targetFactor > 1.9) {
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
LeptUtils.disposePix(grayScale);
} else {
scaledUp = grayScale;
}
return scaledUp;
}
private static Pix convertToGrayScale(BufferedImage image, boolean isGray) throws IOException {
Pix pix = LeptUtils.convertImageToPix(image);
Pix grayScale;
if (isGray) {
grayScale = pix;
} else {
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
LeptUtils.disposePix(pix);
}
return grayScale;
}
private static void setAlphaChannelToWhite(BufferedImage image) {
if (image.getTransparency() == Transparency.TRANSLUCENT) {
// NOTE: For BITMASK images, the color model is likely IndexColorModel,
// and this model will contain the "real" color of the transparent parts
// which is likely a better fit than unconditionally setting it to white.
// Fill background with white
Graphics2D graphics = image.createGraphics();
try {
graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
graphics.setPaint(Color.WHITE);
graphics.fillRect(0, 0, image.getWidth(), image.getHeight());
} finally {
graphics.dispose();
}
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
return ImageProcessingUtils.despecklePix(scaledUp);
}
}

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
import net.sourceforge.lept4j.Leptonica1;
@ -102,12 +103,14 @@ public interface OcrImage {
*/
default Pix getRotatedPix() {
return switch (360 - getRotationDegrees()) {
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
default -> getPix();
};
synchronized (OCRThread.class) {
return switch (360 - getRotationDegrees()) {
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
default -> getPix();
};
}
}

View File

@ -71,18 +71,17 @@ public class ImageStreamEngine extends PDFStreamEngine {
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
if (imageXObject.getColorSpace() instanceof PDDeviceRGB) {
BufferedImage image = imageXObject.getImage();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), false);
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
} else if (imageXObject.getColorSpace() instanceof PDDeviceGray) {
BufferedImage image = imageXObject.getImage();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true);
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
} else {
BufferedImage pdfImage = imageXObject.getImage();
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(),
BufferedImage.TYPE_BYTE_GRAY);
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
Graphics g = image.getGraphics();
g.drawImage(pdfImage, 0, 0, null);
g.dispose();
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true);
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
}
this.imagesOnCurrentPage.add(this.currentImageOnPage);
//imagesOnPages.add(this.currentImageOnPage);

View File

@ -89,6 +89,8 @@ public class OCRThread extends Thread {
} catch (NoSuchElementException e) {
log.debug("Processed all Images, finishing.");
}
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
}
@ -104,8 +106,11 @@ public class OCRThread extends Thread {
image.setRotationDegrees(orientDegree);
Pix rotatedPix = image.getRotatedPix();
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
image.destroyPix();
LeptUtils.disposePix(rotatedPix);
synchronized (OCRThread.class) {
image.destroyPix();
LeptUtils.disposePix(rotatedPix);
}
results.add(OcrResult.create(image, tesseractOutputFileName));
logger.logImageFinished(image, psm);
@ -115,26 +120,40 @@ public class OCRThread extends Thread {
public int detectOrientation(OcrImage image) {
IntBuffer orientationDegreeResultBuffer;
FloatBuffer orientationDegreeConfidenceBuffer;
PointerByReference scriptureNameBuffer;
FloatBuffer scriptureConfidenceBuffer;
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
IntBuffer orient_degB = IntBuffer.allocate(1);
FloatBuffer orient_confB = FloatBuffer.allocate(1);
PointerByReference script_nameB = new PointerByReference();
FloatBuffer script_confB = FloatBuffer.allocate(1);
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization.
orientationDegreeResultBuffer = IntBuffer.allocate(1);
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
scriptureNameBuffer = new PointerByReference();
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
}
int orient_deg = 0;
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, orient_degB, orient_confB, script_nameB, script_confB);
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
orientationDegreeResultBuffer,
orientationDegreeConfidenceBuffer,
scriptureNameBuffer,
scriptureConfidenceBuffer);
if (result == TRUE) {
orient_deg = orient_degB.get();
orient_deg = orientationDegreeResultBuffer.get();
}
synchronized (OCRThread.class) {
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
}
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
return orient_deg;
}
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
String datapath = System.getenv("TESSDATA_PREFIX");

View File

@ -0,0 +1,96 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.image.BufferedImage;
import java.io.IOException;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Static helpers that prepare a {@link BufferedImage} for OCR as a Leptonica {@link Pix}:
 * flattening transparency onto white, converting to 8 bpp gray, upscaling towards a target DPI
 * and despeckling/binarizing.
 *
 * <p>Every method that receives a {@link Pix} takes ownership of it: when a new Pix is returned,
 * the input Pix has already been disposed and must not be used by the caller anymore.
 */
@UtilityClass
public class ImageProcessingUtils {

    /** Images with a width or height below this many pixels are only thresholded, not despeckled. */
    private static final int MIN_DESPECKLE_SIDE_LENGTH = 100;

    /** Threshold value handed to {@code pixThresholdToBinary} when binarizing without despeckling. */
    private static final int BINARIZATION_THRESHOLD = 180;


    /**
     * Despeckles the given 8 bpp Pix, falling back to plain thresholding when the image is too
     * small or when despeckling yields no result.
     *
     * @param pix 8 bpp input; disposed by this method unless it is the very object returned
     * @return the despeckled or thresholded Pix; may be {@code null} if the fallback thresholding
     *         itself returns {@code null}
     */
    public static Pix despecklePix(Pix pix) {

        assert pix.d == 8;

        Pix despeckled = null;
        if (pix.w >= MIN_DESPECKLE_SIDE_LENGTH && pix.h >= MIN_DESPECKLE_SIDE_LENGTH) {
            // despeckle has been observed to return null for some inputs; the cause is unknown
            // (possibly related to the image size), so the result is checked below.
            despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3);
        }
        if (despeckled == null) {
            // Either too small to despeckle properly or despeckling failed: just binarize instead.
            despeckled = Leptonica1.pixThresholdToBinary(pix, BINARIZATION_THRESHOLD);
        }

        if (pix != despeckled) {
            LeptUtils.disposePix(pix);
        }
        return despeckled;
    }


    /**
     * Upscales a gray Pix depending on the ratio {@code targetDpi / imageDpi}: above 3 it calls
     * {@code pixScaleGray4xLI}, above 1.9 it calls {@code pixScaleGray2xLI}, otherwise the input is
     * returned unchanged.
     *
     * @param imageDpi  resolution of the input image
     * @param targetDpi resolution the OCR should run at
     * @param grayScale gray input; disposed by this method when a scaled copy is returned
     * @return the scaled Pix, or {@code grayScale} itself when no scaling is applied
     */
    public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {

        float targetFactor = targetDpi / imageDpi;

        Pix scaledUp;
        if (targetFactor > 3) {
            scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
        } else if (targetFactor > 1.9) {
            scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
        } else {
            return grayScale;
        }
        LeptUtils.disposePix(grayScale);
        return scaledUp;
    }


    /**
     * Converts a {@link BufferedImage} to an 8 bpp Pix.
     *
     * <p>8 bpp results of {@code LeptUtils.convertImageToPix} are returned as they are, 32 bpp is
     * converted with {@code pixConvertRGBToGrayFast}, 1 bpp with {@code pixConvert1To8}. Any other
     * depth is handed to the general purpose {@code pixConvertTo8}, because
     * {@code pixConvert1To8} only accepts 1 bpp input.
     *
     * @param image the image to convert
     * @return the gray Pix; the intermediate Pix is disposed when a conversion took place
     */
    @SneakyThrows
    public static Pix convertToGrayScale(BufferedImage image) {

        Pix pix = LeptUtils.convertImageToPix(image);
        if (pix.d == 8) {
            return pix;
        }

        Pix grayScale;
        if (pix.d == 32) {
            grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
        } else if (pix.d == 1) {
            // NOTE(review): Leptonica's own pixConvertTo8 uses val0=255/val1=0 for 1 bpp input;
            // the values here are the opposite. Kept as is - confirm the polarity is intended.
            grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
        } else {
            // 2, 4, 16 or 24 bpp: cmapflag 0 requests a result without colormap.
            grayScale = Leptonica1.pixConvertTo8(pix, 0);
        }
        LeptUtils.disposePix(pix);
        return grayScale;
    }


    /**
     * Paints white behind all pixels of a translucent image, modifying the image in place.
     * Images whose transparency is not {@link Transparency#TRANSLUCENT} are left untouched.
     *
     * @param image the image to modify
     */
    public static void setAlphaChannelToWhite(BufferedImage image) {

        if (image.getTransparency() != Transparency.TRANSLUCENT) {
            // NOTE: For BITMASK images, the color model is likely IndexColorModel,
            // and this model will contain the "real" color of the transparent parts
            // which is likely a better fit than unconditionally setting it to white.
            return;
        }

        // Fill background with white
        Graphics2D graphics = image.createGraphics();
        try {
            graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
            graphics.setPaint(Color.WHITE);
            graphics.fillRect(0, 0, image.getWidth(), image.getHeight());
        } finally {
            graphics.dispose();
        }
    }

}

View File

@ -16,6 +16,9 @@ import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word;
@Slf4j
/**
 * Overridden version whose only purpose is to use Tesseract1 with Pix objects instead of BufferedImages. All functions are copied, with the BufferedImage -> Pix conversion removed.
 */
public class Tesseract2 extends Tesseract1 {

View File

@ -4,10 +4,15 @@ import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTex
import static com.knecon.fforesight.service.ocr.processor.service.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@ -127,4 +132,61 @@ public class OcrServiceIntegrationTest extends AbstractTest {
}
}
/**
 * Runs OCR on every regular file ending in ".pdf" below a local test-file directory,
 * smallest file first, printing the progress to stdout.
 *
 * NOTE(review): the directory is an absolute path on a single developer machine, so this test
 * can only succeed there - consider disabling it or making the path configurable.
 */
@Test
@SneakyThrows
public void testOcrForAllDMFiles() {

    String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";

    List<File> foundFiles;
    // Files.walk holds open directory handles until the stream is closed.
    try (var paths = Files.walk(Path.of(dir))) {
        // Filter before sorting so that the file size is only queried for actual PDF files.
        foundFiles = paths.filter(Files::isRegularFile)
                .filter(path -> path.toString().endsWith(".pdf"))
                .sorted(Comparator.comparingLong(this::getFileSize))
                .map(Path::toFile)
                .toList();
    }
    foundFiles.forEach(System.out::println);

    int fileCount = foundFiles.size();
    System.out.printf("Found %s files, starting OCR for each.%n%n", fileCount);

    int processedCount = 0;
    for (File file : foundFiles) {
        processedCount++; // 1-based, so the last file is reported as "fileCount/fileCount"
        System.out.printf("%s/%s: %s%n", processedCount, fileCount, file);
        testOCRForFile(file);
    }
}
/**
 * Looks up the size of the file at the given path via {@link Files#size(Path)}.
 * Checked exceptions are rethrown unchanged through {@code @SneakyThrows}, which allows the
 * method to be used as a method reference in stream pipelines.
 *
 * @param path the file to measure
 * @return the file size in bytes
 */
@SneakyThrows
public long getFileSize(Path path) {

    long sizeInBytes = Files.size(path);
    return sizeInBytes;
}
/**
 * Runs OCR on a single, hand-picked local file. Swap the path below for one of the listed
 * alternatives to try a different document.
 */
@Test
@SneakyThrows
public void testOcrForSpecificFile() {

    // Other local sample documents that have been used with this test:
    //   /home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf
    //   /home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf
    //   /home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf
    //   /home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf
    //   /home/kschuettler/Dokumente/TestFiles/OcrMassTest/ITEM 23_A15149W - Dermal absorption of formulated product.pdf
    //   /home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 16_Toxicidade Cutanea Aguda.pdf
    File selectedFile = new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf");
    testOCRForFile(selectedFile);
}
/**
 * Stores the given file as the ORIGIN file of the test dossier, runs OCR on it and writes the
 * OCR output to a file of the same name inside the temporary directory. The output path is
 * printed to stdout.
 *
 * @param file the document to run OCR on
 */
@SneakyThrows
private void testOCRForFile(File file) {

    var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
    try (var source = new FileInputStream(file)) {
        storageService.storeObject(TenantContext.getTenantId(), originId, source);
    }

    // The output keeps the input's file name but lives in the temporary directory.
    Path temporaryDirectory = Path.of(getTemporaryDirectory());
    Path inputFileName = Path.of(file.getAbsolutePath()).getFileName();
    Path tmpFileName = temporaryDirectory.resolve(inputFileName);

    try (var target = new FileOutputStream(tmpFileName.toFile())) {
        ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", target);
        System.out.println("File:" + tmpFileName);
    }
    System.out.println("\n\n");
}
}