RED-7669: optimize OCR-module performance
This commit is contained in:
parent
aa45fa84bb
commit
759bae6499
@ -8,9 +8,14 @@ import java.awt.geom.AffineTransform;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.concurrent.Semaphore;
|
||||
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -41,8 +46,9 @@ public class ExtractedOcrImage implements OcrImage {
|
||||
@Setter
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi, boolean isGray) {
|
||||
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi) {
|
||||
|
||||
this.pageNumber = pageNumber;
|
||||
this.numberOnPage = numberOnPage;
|
||||
@ -50,72 +56,21 @@ public class ExtractedOcrImage implements OcrImage {
|
||||
this.originalHeight = bufferedImage.getHeight();
|
||||
this.originalWidth = bufferedImage.getWidth();
|
||||
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
|
||||
this.pix = binarize(bufferedImage, imageDPI, targetDpi, isGray);
|
||||
this.pix = binarize(bufferedImage, imageDPI, targetDpi);
|
||||
this.height = pix.h;
|
||||
this.width = pix.w;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi, boolean isGray) {
|
||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi) {
|
||||
|
||||
setAlphaChannelToWhite(image);
|
||||
Pix grayScale = convertToGrayScale(image, isGray);
|
||||
Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3);
|
||||
LeptUtils.disposePix(scaledUp);
|
||||
return despeckled;
|
||||
}
|
||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||
|
||||
|
||||
private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
|
||||
|
||||
Pix scaledUp;
|
||||
float targetFactor = targetDpi / imageDpi;
|
||||
|
||||
if (targetFactor > 3) {
|
||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
} else if (targetFactor > 1.9) {
|
||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
} else {
|
||||
scaledUp = grayScale;
|
||||
}
|
||||
return scaledUp;
|
||||
}
|
||||
|
||||
|
||||
private static Pix convertToGrayScale(BufferedImage image, boolean isGray) throws IOException {
|
||||
|
||||
Pix pix = LeptUtils.convertImageToPix(image);
|
||||
Pix grayScale;
|
||||
if (isGray) {
|
||||
grayScale = pix;
|
||||
} else {
|
||||
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
return grayScale;
|
||||
}
|
||||
|
||||
|
||||
private static void setAlphaChannelToWhite(BufferedImage image) {
|
||||
|
||||
if (image.getTransparency() == Transparency.TRANSLUCENT) {
|
||||
// NOTE: For BITMASK images, the color model is likely IndexColorModel,
|
||||
// and this model will contain the "real" color of the transparent parts
|
||||
// which is likely a better fit than unconditionally setting it to white.
|
||||
|
||||
// Fill background with white
|
||||
Graphics2D graphics = image.createGraphics();
|
||||
try {
|
||||
graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
|
||||
graphics.setPaint(Color.WHITE);
|
||||
graphics.fillRect(0, 0, image.getWidth(), image.getHeight());
|
||||
} finally {
|
||||
graphics.dispose();
|
||||
}
|
||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs tesseract detection script.
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(image);
|
||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.ocr.processor.model;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
@ -102,12 +103,14 @@ public interface OcrImage {
|
||||
*/
|
||||
default Pix getRotatedPix() {
|
||||
|
||||
return switch (360 - getRotationDegrees()) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
|
||||
default -> getPix();
|
||||
};
|
||||
synchronized (OCRThread.class) {
|
||||
return switch (360 - getRotationDegrees()) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
|
||||
default -> getPix();
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -71,18 +71,17 @@ public class ImageStreamEngine extends PDFStreamEngine {
|
||||
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
||||
if (imageXObject.getColorSpace() instanceof PDDeviceRGB) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), false);
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
} else if (imageXObject.getColorSpace() instanceof PDDeviceGray) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true);
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
} else {
|
||||
BufferedImage pdfImage = imageXObject.getImage();
|
||||
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(),
|
||||
BufferedImage.TYPE_BYTE_GRAY);
|
||||
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||||
Graphics g = image.getGraphics();
|
||||
g.drawImage(pdfImage, 0, 0, null);
|
||||
g.dispose();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true);
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi());
|
||||
}
|
||||
this.imagesOnCurrentPage.add(this.currentImageOnPage);
|
||||
//imagesOnPages.add(this.currentImageOnPage);
|
||||
|
||||
@ -89,6 +89,8 @@ public class OCRThread extends Thread {
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("Processed all Images, finishing.");
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
}
|
||||
|
||||
|
||||
@ -104,8 +106,11 @@ public class OCRThread extends Thread {
|
||||
image.setRotationDegrees(orientDegree);
|
||||
Pix rotatedPix = image.getRotatedPix();
|
||||
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
|
||||
image.destroyPix();
|
||||
LeptUtils.disposePix(rotatedPix);
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
image.destroyPix();
|
||||
LeptUtils.disposePix(rotatedPix);
|
||||
}
|
||||
|
||||
results.add(OcrResult.create(image, tesseractOutputFileName));
|
||||
logger.logImageFinished(image, psm);
|
||||
@ -115,26 +120,40 @@ public class OCRThread extends Thread {
|
||||
|
||||
public int detectOrientation(OcrImage image) {
|
||||
|
||||
IntBuffer orientationDegreeResultBuffer;
|
||||
FloatBuffer orientationDegreeConfidenceBuffer;
|
||||
PointerByReference scriptureNameBuffer;
|
||||
FloatBuffer scriptureConfidenceBuffer;
|
||||
|
||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
|
||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
|
||||
|
||||
IntBuffer orient_degB = IntBuffer.allocate(1);
|
||||
FloatBuffer orient_confB = FloatBuffer.allocate(1);
|
||||
PointerByReference script_nameB = new PointerByReference();
|
||||
FloatBuffer script_confB = FloatBuffer.allocate(1);
|
||||
synchronized (OCRThread.class) { // must synchronize the mallocs here with the mallocs in leptonica binarization.
|
||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
scriptureNameBuffer = new PointerByReference();
|
||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
}
|
||||
|
||||
int orient_deg = 0;
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, orient_degB, orient_confB, script_nameB, script_confB);
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
||||
orientationDegreeResultBuffer,
|
||||
orientationDegreeConfidenceBuffer,
|
||||
scriptureNameBuffer,
|
||||
scriptureConfidenceBuffer);
|
||||
if (result == TRUE) {
|
||||
orient_deg = orient_degB.get();
|
||||
orient_deg = orientationDegreeResultBuffer.get();
|
||||
}
|
||||
|
||||
synchronized (OCRThread.class) {
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
}
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
|
||||
return orient_deg;
|
||||
}
|
||||
|
||||
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
synchronized private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
|
||||
@ -0,0 +1,96 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.AlphaComposite;
|
||||
import java.awt.Color;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@UtilityClass
|
||||
public class ImageProcessingUtils {
|
||||
|
||||
public static Pix despecklePix(Pix pix) {
|
||||
|
||||
assert pix.d == 8;
|
||||
Pix despeckled;
|
||||
if (pix.w < 100 || pix.h < 100) {
|
||||
// too small to properly despeckle, just binarize instead.
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
} else {
|
||||
despeckled = LeptUtils.despeckle(pix, LeptUtils.SEL_STR3, 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||
if (despeckled == null) {
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
}
|
||||
}
|
||||
if (pix != despeckled) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
return despeckled;
|
||||
}
|
||||
|
||||
|
||||
public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
|
||||
|
||||
float targetFactor = targetDpi / imageDpi;
|
||||
|
||||
if (targetFactor > 3) {
|
||||
Pix scaledUp;
|
||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
return scaledUp;
|
||||
} else if (targetFactor > 1.9) {
|
||||
Pix scaledUp;
|
||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
return scaledUp;
|
||||
} else {
|
||||
return grayScale;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Pix convertToGrayScale(BufferedImage image) {
|
||||
|
||||
Pix pix = LeptUtils.convertImageToPix(image);
|
||||
if (pix.d == 8) {
|
||||
return pix;
|
||||
} else if (pix.d == 32) {
|
||||
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
} else {
|
||||
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static void setAlphaChannelToWhite(BufferedImage image) {
|
||||
|
||||
if (image.getTransparency() == Transparency.TRANSLUCENT) {
|
||||
// NOTE: For BITMASK images, the color model is likely IndexColorModel,
|
||||
// and this model will contain the "real" color of the transparent parts
|
||||
// which is likely a better fit than unconditionally setting it to white.
|
||||
|
||||
// Fill background with white
|
||||
Graphics2D graphics = image.createGraphics();
|
||||
try {
|
||||
graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
|
||||
graphics.setPaint(Color.WHITE);
|
||||
graphics.fillRect(0, 0, image.getWidth(), image.getHeight());
|
||||
} finally {
|
||||
graphics.dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -16,6 +16,9 @@ import net.sourceforge.tess4j.TesseractException;
|
||||
import net.sourceforge.tess4j.Word;
|
||||
|
||||
@Slf4j
|
||||
/**
|
||||
* Overridden version that exists only so Tesseract1 can be used with Pix instances instead of BufferedImages. All functions are copied, with the BufferedImage -> Pix conversion removed.
|
||||
*/
|
||||
public class Tesseract2 extends Tesseract1 {
|
||||
|
||||
|
||||
|
||||
@ -4,10 +4,15 @@ import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTex
|
||||
import static com.knecon.fforesight.service.ocr.processor.service.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
@ -127,4 +132,61 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testOcrForAllDMFiles() {
|
||||
|
||||
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
|
||||
List<File> foundFiles = Files.walk(Path.of(dir))
|
||||
.sorted(Comparator.comparingLong(this::getFileSize))
|
||||
.map(Path::toFile)
|
||||
.filter(file -> file.getName().endsWith(".pdf"))
|
||||
.peek(System.out::println)
|
||||
.toList();
|
||||
int fileCount = foundFiles.size();
|
||||
AtomicInteger processedCount = new AtomicInteger();
|
||||
System.out.printf("Found %s files, starting OCR for each.%n%n", fileCount);
|
||||
foundFiles.stream().peek(file -> System.out.printf("%s/%s: %s%n", processedCount.getAndIncrement(), fileCount, file)).forEach(this::testOCRForFile);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public long getFileSize(Path path) {
|
||||
|
||||
return Files.size(path);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testOcrForSpecificFile() {
|
||||
|
||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/ITEM 23_A15149W - Dermal absorption of formulated product.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 16_Toxicidade Cutanea Aguda.pdf"));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void testOCRForFile(File file) {
|
||||
|
||||
var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
|
||||
try (var fileStream = new FileInputStream(file)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
|
||||
}
|
||||
|
||||
Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(file.getAbsolutePath()).getFileName());
|
||||
try (var out = new FileOutputStream(tmpFileName.toFile())) {
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
|
||||
System.out.println("File:" + tmpFileName);
|
||||
}
|
||||
System.out.println("\n\n");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user