diff --git a/README.md b/README.md index 46b1310..c774701 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,17 @@ The service uses PDFTron to attempt the removal of invisible elements and waterm Extracts all images from the PDF using PDFBox 3. Striped Image Detection and Stitching Detects if images are striped and stitches them together using Ghostscript. -4. Binarization - Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm. +4. Image Processing + - Convert to grayscale + - Upscale to target DPI + - Filter using a Gaussian kernel + - Binarize the resulting images using Leptonica and the Otsu thresholding algorithm. + - Despeckle using various morphological operations 5. OCR Processing Runs Tesseract on the images to extract text. -6. Text Integration +6. Font Style Detection +Detects bold text using stroke width estimation. +7. Text Integration Draws the resulting text onto the original PDF using PDFBox. Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage. 
diff --git a/buildSrc/src/main/kotlin/com.iqser.red.service.java-conventions.gradle.kts b/buildSrc/src/main/kotlin/com.iqser.red.service.java-conventions.gradle.kts index 7257124..3f116e4 100644 --- a/buildSrc/src/main/kotlin/com.iqser.red.service.java-conventions.gradle.kts +++ b/buildSrc/src/main/kotlin/com.iqser.red.service.java-conventions.gradle.kts @@ -25,6 +25,8 @@ tasks.named("test") { reports { junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit")) } + minHeapSize = "512m" + maxHeapSize = "8192m" } tasks.test { diff --git a/ocr-service-v1/ocr-service-processor/build.gradle.kts b/ocr-service-v1/ocr-service-processor/build.gradle.kts index becb36e..8a974dc 100644 --- a/ocr-service-v1/ocr-service-processor/build.gradle.kts +++ b/ocr-service-v1/ocr-service-processor/build.gradle.kts @@ -20,6 +20,7 @@ dependencies { api("org.apache.pdfbox:jbig2-imageio:3.0.4") api("com.github.jai-imageio:jai-imageio-core:1.4.0") api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0") + api("org.apache.commons:commons-math3:3.6.1") api("io.github.karols:hocr4j:0.2.0") api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.google.guava:guava:31.1-jre") diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java index 3c3d804..7873b36 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrImage.java @@ -31,9 +31,19 @@ public interface OcrImage { int getNumberOnPage(); + /** + * Retrieves the height of the original image (not necessarily in pdf coordinates). 
+ * + * @return the height of the image + */ int getHeight(); + /** + * Retrieves the width of the original image (not necessarily in pdf coordinates). + * + * @return the width of the image + */ int getWidth(); @@ -44,7 +54,7 @@ public interface OcrImage { */ default QuadPoint getImageBounds() { - // cannot be solved with a nice rotation matrix, since the after rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF. + // cannot be solved with a nice rotation matrix. After rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF. // So in order to mimic this behavior we need to start with (0,0) coordinates always. if (getRotationDegrees() == 90 || getRotationDegrees() == 270) { return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getWidth()), new Point2D.Double(getHeight(), getWidth()), new Point2D.Double(getHeight(), 0)); @@ -65,13 +75,6 @@ public interface OcrImage { } - @SneakyThrows - default BufferedImage getBufferedImage() { - - return LeptUtils.convertPixToImage(getPix()); - } - - /** * Retrieves the rotation degree of the OCR image. * @@ -86,6 +89,10 @@ public interface OcrImage { * @return The optimal page segmentation mode. */ default int getOptimalPageSegmentationMode() { + + if (getWidth() < 200 || getHeight() < 200) { + return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK; + } return ITessAPI.TessPageSegMode.PSM_AUTO; } // TODO: evaluate if PSM can be dynamically chosen to increase performance @@ -112,17 +119,6 @@ public interface OcrImage { AffineTransform getImageCTM(); - /** - * Retrieves the size (width * height) of the image. - * - * @return The size of the image. 
- */ - default int getImageSize() { - - return getHeight() * getWidth(); - } - - default void destroyPix() { LeptUtils.disposePix(getPix()); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResult.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResult.java index cdab022..908a241 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResult.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResult.java @@ -7,27 +7,17 @@ import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser; import io.github.karols.hocr4j.Word; -public record OcrResult(Image image, String hOcrPageAbsolutePath) { +public record OcrResult(OcrImage image, String tesseractOutputFilePath) { public static OcrResult create(OcrImage image, String tesseractResult) { - return new OcrResult(Image.fromOcrImage(image), tesseractResult); + return new OcrResult(image, tesseractResult); } public List getAllWords() { - return HOcrPageParser.extractHocrPage(hOcrPageAbsolutePath).getAllWords(); - } - - - public record Image(Integer pageNumber, AffineTransform ctm, QuadPoint position) { - - public static Image fromOcrImage(OcrImage image) { - - return new Image(image.getPageNumber(), image.getImageCTM(), image.getImageCoordinatesInInitialUserSpace()); - } - + return HOcrPageParser.extractHocrPage(tesseractOutputFilePath).getAllWords(); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java new file mode 100644 index 0000000..ccbd45a --- /dev/null +++ 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/OcrResultToWrite.java @@ -0,0 +1,35 @@ +package com.knecon.fforesight.service.ocr.processor.model; + +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; + +public record OcrResultToWrite(List textPositionInImage, QuadPoint imageBoundingBox) { + + public static OcrResultToWrite fromFontStyleDetectionModel(FontStyleDetectionModel fontStyleDetectionModel) { + + return new OcrResultToWrite(fontStyleDetectionModel.getTextPositionInImages(), fontStyleDetectionModel.getImageBounds()); + } + + + public static Map> buildOcrResultsToWrite(List ocrResults, FontMetricsFactory fontMetricsFactory) { + + return ocrResults.stream() + .collect(Collectors.groupingBy(ocrResult -> ocrResult.image().getPageNumber())) + .entrySet() + .stream() + .collect(Collectors.toMap(Map.Entry::getKey, + entry -> entry.getValue() + .stream() + .map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords() + .stream() + .filter(word -> !word.isBlank()) + .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR)) + .toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace())) + .toList())); + } +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java index 4935eda..771b53a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java +++ 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java @@ -6,7 +6,7 @@ public record PageInformation(int height, int width, int number, int rotationDeg public static PageInformation fromPDPage(int pageNum, PDPage page) { - return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation()); + return new PageInformation((int) page.getMediaBox().getHeight(), (int) page.getMediaBox().getWidth(), pageNum, page.getRotation()); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java index 2e2d3ab..efdadc7 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/RenderedPageOcrImage.java @@ -60,17 +60,6 @@ public class RenderedPageOcrImage implements OcrImage { } - @Override - public QuadPoint getImageBounds() { - - if (rotationDegrees == 90 || rotationDegrees == 270) { - return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, width), new Point2D.Double(height, width), new Point2D.Double(height, 0)); - } else { - return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, height), new Point2D.Double(width, height), new Point2D.Double(width, 0)); - } - } - - @Override public int getPageNumber() { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/TextPositionInImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/TextPositionInImage.java index 1a96101..b0591e2 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/TextPositionInImage.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/TextPositionInImage.java @@ -7,29 +7,35 @@ import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.util.Matrix; import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; -import io.github.karols.hocr4j.Bounds; import io.github.karols.hocr4j.Word; import lombok.AccessLevel; import lombok.Getter; +import lombok.Setter; import lombok.experimental.FieldDefaults; @Getter -@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) +@FieldDefaults(level = AccessLevel.PRIVATE) public class TextPositionInImage { - QuadPoint position; - String text; - AffineTransform imageCTM; + final QuadPoint position; + final String text; + final AffineTransform imageCTM; + + @Setter FontMetricsFactory fontMetricsFactory; + @Setter + FontStyle fontStyle; - public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory) { + public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory, FontStyle fontStyle) { this.position = QuadPoint.fromBounds(word.getBounds()); this.text = word.getText(); this.imageCTM = imageCTM; this.fontMetricsFactory = fontMetricsFactory; + this.fontStyle = fontStyle; } @@ -90,6 +96,13 @@ public class TextPositionInImage { } + public double getTextHeight() { + + var metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight()); + return fontMetricsFactory.calculateFontSize(text, getTransformedWidth()) * metrics.getHeightScaling(); + } + + public double getHeight() { return position.a().distance(position.b()); diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/FontStyleDetectionModel.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/FontStyleDetectionModel.java new file mode 100644 index 0000000..5c60dc3 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/FontStyleDetectionModel.java @@ -0,0 +1,58 @@ +package com.knecon.fforesight.service.ocr.processor.model.scriptdetection; + +import java.util.List; + +import com.knecon.fforesight.service.ocr.processor.model.OcrImage; +import com.knecon.fforesight.service.ocr.processor.model.OcrResult; +import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; +import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory; +import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; + +import lombok.AccessLevel; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; + +@Getter +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public final class FontStyleDetectionModel { + + QuadPoint imageBounds; + Pix image; + List textPositionsAndWordImages; + + + public static FontStyleDetectionModel fromOcrResult(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory, OcrServiceSettings settings) { + + var image = Leptonica1.pixRead(ocrResult.tesseractOutputFilePath() + ".tiff"); + var wordPixes = ocrResult.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> TextPositionAndWordImage.create(ocrResult.image().getImageCTM(), word, image, settings, 
fontMetricsFactory)).toList(); + + return new FontStyleDetectionModel(ocrResult.image().getImageCoordinatesInInitialUserSpace(), image, wordPixes); + } + + + public List getTextPositionInImages() { + + return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getTextPositionInImage).toList(); + } + + + public List getWordImages() { + + return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getWordImage).toList(); + } + + + public void dispose() { + + LeptUtils.disposePix(image); + getWordImages().forEach(WordImage::dispose); + } + + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/TextPositionAndWordImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/TextPositionAndWordImage.java new file mode 100644 index 0000000..274a997 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/TextPositionAndWordImage.java @@ -0,0 +1,52 @@ +package com.knecon.fforesight.service.ocr.processor.model.scriptdetection; + +import java.awt.geom.AffineTransform; +import java.util.Objects; + +import org.apache.commons.math3.ml.clustering.Clusterable; + +import com.knecon.fforesight.service.ocr.processor.model.OcrResult; +import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; +import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; + +import io.github.karols.hocr4j.Word; +import lombok.Getter; +import net.sourceforge.lept4j.Pix; + +@Getter +public final class TextPositionAndWordImage implements Clusterable { + + private final TextPositionInImage textPositionInImage; + private final WordImage wordImage; + + + 
public TextPositionAndWordImage(TextPositionInImage textPositionInImage, WordImage wordImage) { + + this.textPositionInImage = textPositionInImage; + this.wordImage = wordImage; + } + + + public static TextPositionAndWordImage create(AffineTransform imageCTM, Word word, Pix image, OcrServiceSettings settings, FontMetricsFactory fontMetricsFactory) { + + TextPositionInImage textPositionInImage = new TextPositionInImage(word, imageCTM, fontMetricsFactory, FontStyle.REGULAR); + WordImage wordImage = new WordImage(textPositionInImage.getTextHeight(), word, image, settings); + return new TextPositionAndWordImage(textPositionInImage, wordImage); + } + + + @Override + public double[] getPoint() { + + return wordImage.getPoint(); + } + + + public double getTextHeight() { + + return wordImage.getTextHeight(); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/WordImage.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/WordImage.java new file mode 100644 index 0000000..dd169af --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/scriptdetection/WordImage.java @@ -0,0 +1,71 @@ +package com.knecon.fforesight.service.ocr.processor.model.scriptdetection; + +import org.apache.commons.math3.ml.clustering.Clusterable; + +import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.StrokeWidthCalculator; +import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; + +import io.github.karols.hocr4j.Word; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; +import net.sourceforge.lept4j.Box; +import net.sourceforge.lept4j.Leptonica1; +import 
net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.util.LeptUtils; + +@Getter +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class WordImage implements Clusterable { + + Pix image; + String text; + double textHeight; + OcrServiceSettings settings; + + + public WordImage(double textHeight, Word word, Pix originalImage, OcrServiceSettings settings) { + + Box box = new Box(word.getBounds().getLeft(), word.getBounds().getTop(), word.getBounds().getWidth(), word.getBounds().getHeight(), 1); + this.image = Leptonica1.pixClipRectangle(originalImage, box, null); + box.clear(); + this.text = word.getText(); + this.textHeight = textHeight; + this.settings = settings; + } + + + public boolean hasLargerStrokeWidth(double strokeWidth) { + + int roundedStrokeWidth = (int) Math.round(strokeWidth); + double roundingError = (roundedStrokeWidth - strokeWidth) / strokeWidth; + + // add 1 to open a bit bigger than the estimated regular stroke width + Pix openedPix = Leptonica1.pixOpenBrick(null, image, roundedStrokeWidth + 1, roundedStrokeWidth + 1); + + double openedPixelDensity = ImageProcessingUtils.calculatePixelDensity(openedPix); + + double pixelDensity = ImageProcessingUtils.calculatePixelDensity(image); + + LeptUtils.disposePix(openedPix); + + return (openedPixelDensity * (1 + roundingError)) / pixelDensity > (settings.getBoldThreshold()); + } + + + @Override + public double[] getPoint() { + + return new double[]{textHeight}; + } + + + public void dispose() { + + LeptUtils.disposePix(image); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java index cc2c84e..ac80d15 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/GhostScriptService.java @@ -3,24 +3,18 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.io.InputStream; import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingDeque; -import java.util.concurrent.LinkedTransferQueue; import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; -import com.azure.core.implementation.GeoObjectHelper; -import com.knecon.fforesight.service.ocr.processor.model.OcrImage; -import com.knecon.fforesight.service.ocr.processor.model.PageInformation; import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile; -import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage; import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage; import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller; import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler; @@ -32,7 +26,6 @@ import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; -import net.sourceforge.lept4j.Pix; @Slf4j @Service diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java index 54b8306..ddef5ec 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OCRService.java @@ -9,6 +9,7 @@ import java.io.OutputStream; import java.nio.file.Path; import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.stream.IntStream; @@ -20,8 +21,10 @@ import org.springframework.util.FileSystemUtils; import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService; +import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.OcrImage; import com.knecon.fforesight.service.ocr.processor.model.OcrResult; +import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; @@ -44,6 +47,7 @@ public class OCRService { InvisibleElementRemovalService invisibleElementRemovalService; OcrResultWriter ocrResultWriter; GhostScriptService ghostScriptService; + FontStyleDetector boldDetector; /** @@ -135,9 +139,14 @@ public class OCRService { ocrThread.join(); } - log.info("OCR processing has finished, writing results"); + log.info("Tesseract OCR has finished for file {} and dossier {}", fileId, dossierId); + timestamp = System.currentTimeMillis(); - var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, ocrResults); + Map> imageWithTextPositionsPerPage = boldDetector.detectBold(ocrResults, document); + stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp); + + timestamp = System.currentTimeMillis(); + var dictionariesToUpdate = 
ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage); log.info("Saving document"); document.saveIncremental(out, dictionariesToUpdate); stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index 6cd904b..30e2ee7 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -2,11 +2,11 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.awt.Color; import java.awt.geom.Point2D; +import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.stream.Collectors; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; @@ -20,11 +20,9 @@ import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentPrope import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.ocr.processor.model.OcrResult; +import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; -import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory; -import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import lombok.AccessLevel; @@ -44,19 +42,17 @@ public class OcrResultWriter { 
@SneakyThrows - public Set drawOcrResultsToPdf(PDDocument document, List ocrResults) { + public Set drawOcrResultsToPdf(PDDocument document, Map> imagesWithResultsPerPage) { - FontMetricsFactory fontMetricsFactory = new Type0FontMetricsFactory(document); Set dictionariesToUpdate = new HashSet<>(); - Map> resultsPerPage = ocrResults.stream().collect(Collectors.groupingBy(result -> result.image().pageNumber())); - resultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, resultsPerPage, dictionariesToUpdate, fontMetricsFactory)); + imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate)); dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject()); return dictionariesToUpdate; } @SneakyThrows - private void drawResultsPerPage(PDDocument document, Integer pageNumber, Map> resultsPerPage, Set dictionariesToUpdate, FontMetricsFactory fontMetricsFactory) { + private void drawResultsPerPage(PDDocument document, Integer pageNumber, List ocrResultToWrite, Set dictionariesToUpdate) { var pdPage = document.getPage(pageNumber - 1); @@ -69,7 +65,7 @@ public class OcrResultWriter { escapeContentStreams(document, pdPage); - List words = buildTextPositionsOnPage(pageNumber, resultsPerPage, fontMetricsFactory); + List words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) { // write invisible ocr text inside tagged content @@ -86,7 +82,6 @@ public class OcrResultWriter { // write visible ocr text inside optional group contentStream.beginMarkedContent(COSName.OC, textDebugLayer); contentStream.saveGraphicsState(); - contentStream.setNonStrokingColor(Color.BLUE); words.forEach(word -> drawVisibleWord(word, contentStream)); contentStream.restoreGraphicsState(); 
contentStream.endMarkedContent(); @@ -94,7 +89,9 @@ public class OcrResultWriter { // write word bounding boxes (tesseract output) inside optional group contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer); contentStream.saveGraphicsState(); - resultsPerPage.get(pageNumber).stream().map(OcrResult::image).forEach(image -> drawGrid(contentStream, image.position())); + ocrResultToWrite.stream() + .map(OcrResultToWrite::imageBoundingBox) + .forEach(imagePosition -> drawGrid(contentStream, imagePosition)); words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word)); contentStream.restoreGraphicsState(); contentStream.endMarkedContent(); @@ -105,15 +102,6 @@ public class OcrResultWriter { } - private static List buildTextPositionsOnPage(Integer pageNumber, Map> resultsPerPage, FontMetricsFactory fontMetricsFactory) { - - return resultsPerPage.get(pageNumber) - .stream() - .flatMap(result -> result.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> new TextPositionInImage(word, result.image().ctm(), fontMetricsFactory))) - .toList(); - } - - @SneakyThrows private static void escapeContentStreams(PDDocument document, PDPage pdPage) { // We need to append to the contentstream, otherwise the content could be overlapped by images @@ -196,6 +184,11 @@ public class OcrResultWriter { private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) { try { + contentStream.setNonStrokingColor(switch (position.getFontStyle()) { + case BOLD -> Color.RED; + case ITALIC -> Color.GREEN; + default -> Color.BLUE; + }); contentStream.beginText(); contentStream.setRenderingMode(renderingMode); contentStream.setFont(position.getFont(), (float) position.getFontSize()); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java index 97d44e3..63b5439 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/Statistics.java @@ -16,12 +16,14 @@ public class Statistics { AtomicLong pdf2ImgDuration; AtomicLong writingTextDuration; AtomicLong imageProcessingDuration; + AtomicLong fontStyleDetectionDuration; public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) { this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L))); this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L))); + this.fontStyleDetectionDuration = new AtomicLong(0); this.pdf2ImgDuration = new AtomicLong(0); this.writingTextDuration = new AtomicLong(0); this.imageProcessingDuration = new AtomicLong(0); @@ -57,12 +59,17 @@ public class Statistics { writingTextDuration.addAndGet(duration); } + public void increaseFontStyleDetectionDuration(long duration) { + + fontStyleDetectionDuration.addAndGet(duration); + } + @Override public String toString() { return String.format( - "imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s", + "imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s, FontstyleDetection=%.2f s", ((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000), ((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), @@ -71,7 +78,8 @@ public 
class Statistics { ((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000), (float) imageProcessingDuration.get() / 1000, (float) pdf2ImgDuration.get() / 1000, - (float) writingTextDuration.get() / 1000); + (float) writingTextDuration.get() / 1000, + (float) fontStyleDetectionDuration.get() / 1000); } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java index 9173533..039b217 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontMetricsFactory.java @@ -36,6 +36,7 @@ public interface FontMetricsFactory { PDFont getFont(); + HeightAndDescent calculateHeightAndDescent(String text); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontStyle.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontStyle.java new file mode 100644 index 0000000..23be225 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/FontStyle.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.ocr.processor.service.fonts; + +public enum FontStyle { + REGULAR, BOLD, ITALIC +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java index b7e10ef..9e787ad 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/fonts/Type0FontMetricsFactory.java @@ -1,6 +1,9 @@ package com.knecon.fforesight.service.ocr.processor.service.fonts; import java.io.ByteArrayInputStream; +import java.util.Collections; +import java.util.List; +import java.util.Set; import org.apache.fontbox.ttf.GlyphData; import org.apache.fontbox.ttf.TTFParser; @@ -12,22 +15,41 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font; import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent; +import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; +import software.amazon.awssdk.services.s3.endpoints.internal.Value; @Slf4j +@RequiredArgsConstructor public class Type0FontMetricsFactory implements FontMetricsFactory { private final PDType0Font type0Font; private final TrueTypeFont trueTypeFont; + // for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent. 
+ private static final Set slashGlyphIds = Set.of(18, 63); + + + public static Type0FontMetricsFactory regular(PDDocument document) { + + return createFromResource("fonts/cmu-regular.ttf", document); + } + + + public static Type0FontMetricsFactory bold(PDDocument document) { + + return createFromResource("fonts/cmu-bold.ttf", document); + } + @SneakyThrows - public Type0FontMetricsFactory(PDDocument document) { + private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) { - try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream("fonts/cmu-regular.ttf"); var buffer = new RandomAccessReadBuffer(in)) { - this.trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information - this.type0Font = PDType0Font.load(document, this.trueTypeFont, false); // use Type0Font for unicode support + try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) { + TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information + PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support + return new Type0FontMetricsFactory(type0Font, trueTypeFont); } } @@ -55,8 +77,9 @@ public class Type0FontMetricsFactory implements FontMetricsFactory { if (glyph == null || glyph.getBoundingBox() == null) { continue; } - - descent = Math.min(descent, glyph.getYMinimum()); + if (!slashGlyphIds.contains(glyphId)) { + descent = Math.min(descent, glyph.getYMinimum()); + } height = Math.max(height, glyph.getYMaximum()); } catch (Exception e) { log.warn("descent and height of string {} could not be parsed, using average fallback value!", text); diff --git 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java new file mode 100644 index 0000000..d12af43 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/FontStyleDetector.java @@ -0,0 +1,158 @@ +package com.knecon.fforesight.service.ocr.processor.service.scriptdetection; + +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Stream; + +import org.apache.commons.math3.ml.clustering.Cluster; +import org.apache.commons.math3.ml.clustering.DBSCANClusterer; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.ocr.processor.model.OcrResult; +import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; +import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel; +import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.TextPositionAndWordImage; +import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.WordImage; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory; +import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; +import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory; +import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) 
+public class FontStyleDetector { + + OcrServiceSettings settings; + StrokeWidthCalculator strokeWidthCalculator; + + + /** + * Implementation of the MOBDoB algorithm, refer to the paper here: + * Script Independent Detection of Bold Words in Multi Font-size Documents + *

+ * As a high-level overview: We cluster all text based on its font size. We determine the cluster with the most words. This is assumed to be regular text. + * We then estimate the average stroke width of that cluster by thinning all text to a single pixel and calculating the ratio of the original pixel count to the remaining pixel count. + * (Leptonica Documentation on thinning) + * For each word, we scale this average stroke width based on its font size compared to the most common font size. + * Using the scaled stroke width, we perform an opening operation. + * (Opening (Morphology)). + * We then threshold the ratio of remaining pixels to determine whether a word is bold or not. + *

+ * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size. + * But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height. + * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math. + * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case. + * It seems it scales with the square root of the text height. Or at least this seemed to give the best results. + */ + public Map> detectBold(List ocrResults, PDDocument document) { + + FontMetricsFactory fontMetricsFactory = Type0FontMetricsFactory.regular(document); + if (!settings.isBoldDetection()) { + return OcrResultToWrite.buildOcrResultsToWrite(ocrResults, fontMetricsFactory); + } + + Map> ocrResultToWritePerPage = new HashMap<>(); + + DBSCANClusterer clusterer = new DBSCANClusterer<>(0.5, 1); + + FontMetricsFactory boldFontMetricsFactory = Type0FontMetricsFactory.bold(document); + + for (OcrResult result : ocrResults) { + FontStyleDetectionModel fontStyleDetectionModel = FontStyleDetectionModel.fromOcrResult(result, fontMetricsFactory, settings); + + List> clusters = clusterer.cluster(fontStyleDetectionModel.getTextPositionsAndWordImages()); + Optional> largestCluster = clusters.stream().max(Comparator.comparingInt(cluster -> cluster.getPoints().size())); + + if (largestCluster.isEmpty()) { + insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel); + continue; + } + + List wordsWithMostCommonTextHeight = largestCluster.get().getPoints(); + + double standardTextHeight = calculateStandardTextheight(wordsWithMostCommonTextHeight); + double regularStrokeWidth = calculateRegularStrokeWidth(wordsWithMostCommonTextHeight); 
+ + for (TextPositionAndWordImage textPositionsAndWordImage : fontStyleDetectionModel.getTextPositionsAndWordImages()) { + decideOnFontStyle(textPositionsAndWordImage, regularStrokeWidth, standardTextHeight, boldFontMetricsFactory); + } + + insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel); + fontStyleDetectionModel.dispose(); + } + + log.info("Finished bold detection"); + return ocrResultToWritePerPage; + } + + + private static double calculateStandardTextheight(List wordsWithMostCommonTextHeight) { + + return wordsWithMostCommonTextHeight.stream() + .map(TextPositionAndWordImage::getWordImage) + .mapToDouble(WordImage::getTextHeight) + .filter(Double::isFinite) + .average() + .orElseThrow(); + } + + + private double calculateRegularStrokeWidth(List wordsWithMostCommonTextHeight) { + + return wordsWithMostCommonTextHeight.stream() + .mapToDouble(textPositionAndWordImage -> strokeWidthCalculator.calculate(textPositionAndWordImage.getWordImage().getImage())) + .filter(Double::isFinite) + .average() + .orElseThrow(); + } + + + private static void insertResultIntoMap(int pageNumber, Map> ocrResultToWritePerPage, FontStyleDetectionModel fontStyleDetectionModel) { + + OcrResultToWrite ocrResult = OcrResultToWrite.fromFontStyleDetectionModel(fontStyleDetectionModel); + + ocrResultToWritePerPage.compute(pageNumber, (key, existingList) -> { + if (existingList == null) { + return List.of(ocrResult); + } else { + return Stream.concat(existingList.stream(), Stream.of(ocrResult)).toList(); + } + }); + } + + + private void decideOnFontStyle(TextPositionAndWordImage textPositionsAndWordImage, + double standardStrokeWidth, + double standardTextHeight, + FontMetricsFactory boldFontMetricsFactory) { + + double scaledStrokeWidth = scaleStrokeWidthByFontSize(textPositionsAndWordImage, standardStrokeWidth, standardTextHeight); + + if (textPositionsAndWordImage.getWordImage().hasLargerStrokeWidth(scaledStrokeWidth)) { + 
textPositionsAndWordImage.getTextPositionInImage().setFontMetricsFactory(boldFontMetricsFactory); + textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.BOLD); + } else { + textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.REGULAR); + } + } + + + private static double scaleStrokeWidthByFontSize(TextPositionAndWordImage textPositionsAndWordImage, double standardStrokeWidth, double standardFontSize) { + + double influenceOfFontSize = 1.0; // the paper states that stroke width scales exactly linearly with font size. This did not seem to be true for me. Maybe some of the preprocessing steps are affecting this. + double fontsizeScalingFactor = Math.sqrt(textPositionsAndWordImage.getWordImage().getTextHeight() / standardFontSize); + return standardStrokeWidth + (influenceOfFontSize * (fontsizeScalingFactor - 1) * standardStrokeWidth); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/ItalicDetector.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/ItalicDetector.java new file mode 100644 index 0000000..851a649 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/ItalicDetector.java @@ -0,0 +1,57 @@ +package com.knecon.fforesight.service.ocr.processor.service.scriptdetection; + +import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils; + +import lombok.AccessLevel; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.Sel; +import net.sourceforge.lept4j.util.LeptUtils; + +/** + * This code is a good start for detecting italic text, although it has a few issues especially with glyphs which are naturally slanted. E.g. 
z, 2, 7, / + * If we want this maybe we should exclude these glyphs and then it might have less false positives. But in its current state i don't recommend using it. + */ +@NoArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ItalicDetector { + + + static String italicKernel = "ooxxooxxooxxoxxooXxooxxoxxooxxooxxoo"; + Sel italicSel = Leptonica1.selCreateFromString(italicKernel, 9, 4, "italicKernel"); + Sel brickSel = Leptonica1.selCreateBrick(3, 4, 1, 2, 1); + + + public boolean isItalic(Pix pix) { + + Pix preprocessed = preprocess(pix); + Pix flipped = Leptonica1.pixFlipLR(null, pix); + Pix flippedPreprocessed = preprocess(flipped); + Leptonica1.pixFlipLR(flippedPreprocessed, flippedPreprocessed); + double pixelDensity = ImageProcessingUtils.calculatePixelDensity(preprocessed); + double flippedPixelDensity = ImageProcessingUtils.calculatePixelDensity(flippedPreprocessed); + LeptUtils.disposePix(preprocessed); + LeptUtils.disposePix(flipped); + LeptUtils.disposePix(flippedPreprocessed); + return flippedPixelDensity / pixelDensity < 0.85; + } + + + private Pix preprocess(Pix pix) { + + Pix eroded = Leptonica1.pixErode(null, pix, italicSel.getPointer()); + Pix dilated = Leptonica1.pixDilate(null, eroded, brickSel.getPointer()); + LeptUtils.disposePix(eroded); + return dilated; + } + + + public void dispose() { + + LeptUtils.dispose(italicSel); + LeptUtils.dispose(brickSel); + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/StrokeWidthCalculator.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/StrokeWidthCalculator.java new file mode 100644 index 0000000..bbd8747 --- /dev/null +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/scriptdetection/StrokeWidthCalculator.java @@ -0,0 +1,58 @@ +package 
com.knecon.fforesight.service.ocr.processor.service.scriptdetection; + +import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG; + +import java.nio.IntBuffer; + +import org.springframework.stereotype.Service; + +import lombok.AccessLevel; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; +import net.sourceforge.lept4j.Leptonica1; +import net.sourceforge.lept4j.Pix; +import net.sourceforge.lept4j.Sela; +import net.sourceforge.lept4j.util.LeptUtils; + +@Service +@NoArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class StrokeWidthCalculator { + + Sela thinningSel; + + + /** + * Uses a series of sels to thin all connected lines to a single pixel. Then the pixel ratio is a good estimation of the stroke width in pixels. + * Leptonica Documentation on thinning + * Since the baseline is a strokewidth of exactly one, we need to add 1 to the result. + * + * @param input binarized pix with text on it + * @return estimated stroke width in pixels + */ + public double calculate(Pix input) { + + init(); + + Pix thinned = Leptonica1.pixThinConnectedBySet(input, L_THIN_FG, thinningSel, 0); + + IntBuffer thinnedPixelCount = IntBuffer.allocate(1); + Leptonica1.pixCountPixels(thinned, thinnedPixelCount, null); + + IntBuffer pixelCount = IntBuffer.allocate(1); + Leptonica1.pixCountPixels(input, pixelCount, null); + + LeptUtils.disposePix(thinned); + + return (double) pixelCount.get() / thinnedPixelCount.get() + 1; + } + + + private void init() { + + if (thinningSel == null) { + thinningSel = Leptonica1.selaMakeThinSets(1, 0); + } + } + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java index b46a1fe..e5d4f52 100644 --- 
a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/BlockingQueueFiller.java @@ -17,7 +17,6 @@ import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; import net.sourceforge.tess4j.TessAPI1; - /* This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously */ @@ -38,9 +37,17 @@ public class BlockingQueueFiller extends Thread { public void run() { // Interrupting signals that the image extraction has finished - while (!allImagesQueued) { + try { + while (!allImagesQueued) { final UnprocessedImage image = imageInputQueue.take(); - imageOutputQueue.put(image); + try { + imageOutputQueue.put(image); + } catch (InterruptedException e) { + imageOutputQueue.put(image); + } + } + } catch (InterruptedException e) { + log.info("All images extracted, emptying processing queue and stopping"); } // empty the queue @@ -54,4 +61,5 @@ public class BlockingQueueFiller extends Thread { } } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java index c7a81e9..35dc148 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -4,8 +4,6 @@ import static net.sourceforge.tess4j.ITessAPI.TRUE; import java.nio.FloatBuffer; import java.nio.IntBuffer; -import java.util.ArrayList; -import java.util.List; import java.util.NoSuchElementException; import 
java.util.concurrent.BlockingQueue; @@ -29,6 +27,8 @@ import lombok.Setter; import lombok.SneakyThrows; import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; +import net.sourceforge.lept4j.L_Kernel; +import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.util.LeptUtils; import net.sourceforge.tess4j.ITessAPI; @@ -45,6 +45,7 @@ public class ImageProcessingThread extends Thread { final BlockingQueue imageInputQueue; final BlockingQueue imageOutputQueue; final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); + final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.2f, 1); final Statistics stats; final OcrServiceSettings settings; final PDDocument document; @@ -81,7 +82,9 @@ public class ImageProcessingThread extends Thread { log.debug("No images left in processing queue, stopping."); } + TessAPI1.TessBaseAPIEnd(this.detectionScriptHandle); TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle); + LeptUtils.dispose(gaussianKernel); } @@ -106,7 +109,7 @@ public class ImageProcessingThread extends Thread { private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) { - Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi()); + Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi()); int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix); @@ -129,7 +132,7 @@ public class ImageProcessingThread extends Thread { float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72)); - Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi()); + Pix pix = processPix(extractedImage.asPix(), imageDPI, settings.getDpi()); int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle); Pix rotatedPix = 
ImageProcessingUtils.deRotatePix(orientDegree, pix); @@ -163,7 +166,7 @@ public class ImageProcessingThread extends Thread { orientationDegreeResultBuffer = IntBuffer.allocate(1); orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1); - scriptureNameBuffer = new PointerByReference(); + scriptureNameBuffer = new PointerByReference(); // Is this memory being freed? scriptureConfidenceBuffer = FloatBuffer.allocate(1); int orientationDegree = 0; @@ -183,15 +186,58 @@ public class ImageProcessingThread extends Thread { @SneakyThrows - private Pix binarize(Pix pix, float imageDpi, int targetDpi) { + private Pix processPix(Pix pix, float imageDpi, int targetDpi) { - Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix); - Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale); - return ImageProcessingUtils.despecklePix(scaledUp); + Pix grayScale; + Pix scaledUp; + Pix gaussian; + Pix binarized; + //convert to grayscale + if (pix.d == 8) { + grayScale = pix; + } else if (pix.d == 32) { + grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); + } else if (pix.d == 1) { + grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); + } else { + throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d)); + } + + // scale up + float targetFactor = targetDpi / imageDpi; + if (targetFactor > 2.1) { + scaledUp = Leptonica1.pixScaleGray4xLI(grayScale); + } else if (targetFactor > 1.1) { + scaledUp = Leptonica1.pixScaleGray2xLI(grayScale); + } else { + scaledUp = grayScale; + } + + // remove noise and prep for Otsu + gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1); + + // Threshold to binary + if (pix.w < 100 || pix.h < 100) { + binarized = Leptonica1.pixThresholdToBinary(gaussian, 170); + } else { + binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null); + + if (binarized == null) { // Sometimes Otsu just fails, then we 
binarize directly + binarized = Leptonica1.pixThresholdToBinary(gaussian, 170); + } + } + + LeptUtils.disposePix(pix); + LeptUtils.disposePix(grayScale); + LeptUtils.disposePix(scaledUp); + LeptUtils.disposePix(gaussian); + + return binarized; } + private static ITessAPI.TessBaseAPI initDetectionScriptHandle() { ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate(); diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java index 9c1a0a7..e851204 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/OCRThread.java @@ -116,16 +116,11 @@ public class OCRThread extends Thread { @SneakyThrows public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) { - if (settings.isDebug()) { - String[] a = tesseractOutputFileName.split("/"); - String folder = "/tmp/pixs/" + a[a.length - 3]; - new File(folder).mkdirs(); - Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3); - } - + Leptonica1.pixWrite(tesseractOutputFileName + ".tiff", pix, 5); // write the used image for later bold detection instance.setVariable("user_defined_dpi", String.valueOf(dpi)); instance.setPageSegMode(psm); instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK); + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java index 
227c9c3..312d081 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/settings/OcrServiceSettings.java @@ -19,10 +19,12 @@ public class OcrServiceSettings { int psmOverride = -1; // Overrides the page segmentation mode if > 0 int minImageHeight = 20; // Minimum height for images to be processed int minImageWidth = 20; // Minimum width for images to be processed - float minRotationConfidence = 2; // + float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages. boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes boolean removeWatermark; // If true, watermarks will be removed String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR"); + boolean boldDetection = true; // if true, bold detection will be attempted + double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold. 
} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java index cd8d7f8..3f3fc62 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/ImageProcessingUtils.java @@ -6,14 +6,17 @@ import java.awt.Graphics; import java.awt.Graphics2D; import java.awt.Transparency; import java.awt.image.BufferedImage; +import java.nio.IntBuffer; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage; +import com.sun.jna.ptr.PointerByReference; import lombok.SneakyThrows; import lombok.experimental.UtilityClass; +import net.sourceforge.lept4j.L_Kernel; import net.sourceforge.lept4j.Leptonica1; import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.util.LeptUtils; @@ -37,67 +40,6 @@ public class ImageProcessingUtils { } - public static Pix despecklePix(Pix pix) { - - assert pix.d == 8; - Pix despeckled; - if (pix.w < 100 || pix.h < 100) { - // too small to properly despeckle, just binarize instead. - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } else { - despeckled = LeptUtils.despeckle(pix, - LeptUtils.SEL_STR3, - 3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though... 
- if (despeckled == null) { - despeckled = Leptonica1.pixThresholdToBinary(pix, 180); - } - } - if (pix != despeckled) { - LeptUtils.disposePix(pix); - } - return despeckled; - } - - - public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) { - - float targetFactor = targetDpi / imageDpi; - - if (targetFactor > 3) { - Pix scaledUp; - scaledUp = Leptonica1.pixScaleGray4xLI(grayScale); - LeptUtils.disposePix(grayScale); - return scaledUp; - } else if (targetFactor > 1.9) { - Pix scaledUp; - scaledUp = Leptonica1.pixScaleGray2xLI(grayScale); - LeptUtils.disposePix(grayScale); - return scaledUp; - } else { - return grayScale; - } - } - - - @SneakyThrows - public static Pix convertToGrayScale(Pix pix) { - - if (pix.d == 8) { - return pix; - } else if (pix.d == 32) { - Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); - LeptUtils.disposePix(pix); - return grayScale; - } else if (pix.d == 1) { - Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); - LeptUtils.disposePix(pix); - return grayScale; - } else { - throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d)); - } - } - - public Pix deRotatePix(int orientDegree, Pix pix) { return switch (360 - orientDegree) { @@ -128,4 +70,16 @@ public class ImageProcessingUtils { } } + + public static double calculatePixelDensity(Pix pix) { + + IntBuffer pixelCount = IntBuffer.allocate(1); + int result = Leptonica1.pixCountPixels(pix, pixelCount, null); + if (result == 0) { + return (double) pixelCount.get() / (pix.h * pix.w); + } else { + return -1; + } + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/KernelUtils.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/KernelUtils.java new file mode 100644 index 0000000..bd7982e --- /dev/null +++ 
b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/KernelUtils.java @@ -0,0 +1,73 @@ +package com.knecon.fforesight.service.ocr.processor.utils; + +import lombok.experimental.UtilityClass; +import net.sourceforge.lept4j.L_Kernel; +import net.sourceforge.lept4j.Leptonica1; + +@UtilityClass +public class KernelUtils { + + /* + -1, -1, -1 + -1, 8, -1 + -1, -1, -1 + */ + public L_Kernel createFullLaplacianKernel() { + + L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3); + Leptonica1.kernelSetElement(laplacianKernel, 0, 0, -1); + Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1); + Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 2, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 8); + return laplacianKernel; + } + + /* + 0, 0, -1, 0, 0 + 0, -1, -1, -1, 0 + -1, -1, 12, -1, -1 + 0, -1, -1, -1, 0 + 0, 0, -1, 0, 0 + */ + public L_Kernel createLaplacianKernel5x5() { + + L_Kernel laplacianKernel = Leptonica1.kernelCreate(5, 5); + Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 1, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 3, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 3, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 4, -1); + Leptonica1.kernelSetElement(laplacianKernel, 3, 1, -1); + Leptonica1.kernelSetElement(laplacianKernel, 3, 2, -1); + Leptonica1.kernelSetElement(laplacianKernel, 3, 3, -1); + Leptonica1.kernelSetElement(laplacianKernel, 4, 2, -1); + 
Leptonica1.kernelSetElement(laplacianKernel, 2, 2, 12); + return laplacianKernel; + } + + /* + 0, -1, 0 + -1, 4, -1 + 0, -1, 0 + */ + public L_Kernel createLaplacianKernel() { + + L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3); + Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1); + Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1); + Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 4); + return laplacianKernel; + } + + +} diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java index d85dc46..61870fa 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/utils/Tesseract2.java @@ -138,4 +138,11 @@ public class Tesseract2 extends Tesseract1 { return renderer; } + @Override + protected void dispose() { + + TessBaseAPIEnd(getHandle()); + TessBaseAPIDelete(getHandle()); + } + } diff --git a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/Type0FontMetricsFactoryTest.java b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/Type0FontMetricsFactoryTest.java index 8ecd43a..6881615 100644 --- a/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/Type0FontMetricsFactoryTest.java +++ b/ocr-service-v1/ocr-service-processor/src/test/java/com/knecon/fforesight/service/ocr/processor/service/Type0FontMetricsFactoryTest.java @@ -20,7 +20,7 @@ class Type0FontMetricsFactoryTest { public void testStringWidth() { try (PDDocument 
document = Loader.loadPDF(new File(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) { - Type0FontMetricsFactory metricsFactory = new Type0FontMetricsFactory(document); + Type0FontMetricsFactory metricsFactory = Type0FontMetricsFactory.regular(document); FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50); } diff --git a/ocr-service-v1/ocr-service-server/src/main/resources/application.yml b/ocr-service-v1/ocr-service-server/src/main/resources/application.yml index 6d421a2..1b10f2c 100644 --- a/ocr-service-v1/ocr-service-server/src/main/resources/application.yml +++ b/ocr-service-v1/ocr-service-server/src/main/resources/application.yml @@ -41,8 +41,15 @@ fforesight: ignored-endpoints: [ '/actuator/health', '/actuator/health/**' ] enabled: true +logging.pattern.level: "%5p [${spring.application.name},%X{traceId:-},%X{spanId:-}]" management: + tracing: + sampling: + probability: 1.0 + otlp: + tracing: + endpoint: http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces endpoint: metrics.enabled: ${monitoring.enabled:false} prometheus.enabled: ${monitoring.enabled:false} diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java index 06465ad..6c4d124 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcr() { - String text = testOCR("files/2009-1048395_50pages_tables.pdf"); + String text = testOCR("files/402Study.pdf"); } @@ -162,13 +162,7 @@ public class 
OcrServiceIntegrationTest extends AbstractTest { @SneakyThrows public void testOcrForSpecificFile() { - testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf")); -// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf")); -// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf")); -// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf")); -// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf")); -// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/ITEM 23_A15149W - Dermal absorption of formulated product.pdf")); -// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 16_Toxicidade Cutanea Aguda.pdf")); + testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles2/A16361B - Acute Dermal Toxicity Study in Rats.pdf")); }