Merge branch 'RED-7669-fontstyle' into 'master'
RED-8155: integrate bold-detection into ocr-service Closes RED-7669 See merge request redactmanager/ocr-service!31
This commit is contained in:
commit
bab16ad9b2
12
README.md
12
README.md
@ -15,11 +15,17 @@ The service uses PDFTron to attempt the removal of invisible elements and waterm
|
||||
Extracts all images from the PDF using PDFBox
|
||||
3. Striped Image Detection and Stitching
|
||||
Detects if images are striped and stitches them together using Ghostscript.
|
||||
4. Binarization
|
||||
Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm.
|
||||
4. Image Processing
|
||||
- Convert to grayscale
|
||||
- Upscale to target DPI
|
||||
- Filter using Gauss kernel
|
||||
- Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm.
|
||||
- Despeckle using various morphological operations
|
||||
5. OCR Processing
|
||||
Runs Tesseract on the images to extract text.
|
||||
6. Text Integration
|
||||
6. Font style detection
|
||||
Detection of bold text using stroke width estimation
|
||||
7. Text Integration
|
||||
Draws the resulting text onto the original PDF using PDFBox.
|
||||
|
||||
Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage.
|
||||
|
||||
@ -25,6 +25,8 @@ tasks.named<Test>("test") {
|
||||
reports {
|
||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||
}
|
||||
minHeapSize = "512m"
|
||||
maxHeapSize = "8192m"
|
||||
}
|
||||
|
||||
tasks.test {
|
||||
|
||||
@ -20,6 +20,7 @@ dependencies {
|
||||
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
||||
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
||||
api("org.apache.commons:commons-math3:3.6.1")
|
||||
api("io.github.karols:hocr4j:0.2.0")
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
|
||||
@ -31,9 +31,19 @@ public interface OcrImage {
|
||||
int getNumberOnPage();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the height of the original image (not necessarily in pdf coordinates).
|
||||
*
|
||||
* @return the height of the image
|
||||
*/
|
||||
int getHeight();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the width of the original image (not necessarily in pdf coordinates).
|
||||
*
|
||||
* @return the width of the image
|
||||
*/
|
||||
int getWidth();
|
||||
|
||||
|
||||
@ -44,7 +54,7 @@ public interface OcrImage {
|
||||
*/
|
||||
default QuadPoint getImageBounds() {
|
||||
|
||||
// cannot be solved with a nice rotation matrix, since the after rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
|
||||
// cannot be solved with a nice rotation matrix. After rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
|
||||
// So in order to mimic this behavior we need to start with (0,0) coordinates always.
|
||||
if (getRotationDegrees() == 90 || getRotationDegrees() == 270) {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getWidth()), new Point2D.Double(getHeight(), getWidth()), new Point2D.Double(getHeight(), 0));
|
||||
@ -65,13 +75,6 @@ public interface OcrImage {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default BufferedImage getBufferedImage() {
|
||||
|
||||
return LeptUtils.convertPixToImage(getPix());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotation degree of the OCR image.
|
||||
*
|
||||
@ -86,6 +89,10 @@ public interface OcrImage {
|
||||
* @return The optimal page segmentation mode.
|
||||
*/
|
||||
default int getOptimalPageSegmentationMode() {
|
||||
|
||||
if (getWidth() < 200 || getHeight() < 200) {
|
||||
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
|
||||
}
|
||||
return ITessAPI.TessPageSegMode.PSM_AUTO;
|
||||
} // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
||||
|
||||
@ -112,17 +119,6 @@ public interface OcrImage {
|
||||
AffineTransform getImageCTM();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the size (width * height) of the image.
|
||||
*
|
||||
* @return The size of the image.
|
||||
*/
|
||||
default int getImageSize() {
|
||||
|
||||
return getHeight() * getWidth();
|
||||
}
|
||||
|
||||
|
||||
default void destroyPix() {
|
||||
|
||||
LeptUtils.disposePix(getPix());
|
||||
|
||||
@ -7,27 +7,17 @@ import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
|
||||
public record OcrResult(Image image, String hOcrPageAbsolutePath) {
|
||||
public record OcrResult(OcrImage image, String tesseractOutputFilePath) {
|
||||
|
||||
public static OcrResult create(OcrImage image, String tesseractResult) {
|
||||
|
||||
return new OcrResult(Image.fromOcrImage(image), tesseractResult);
|
||||
return new OcrResult(image, tesseractResult);
|
||||
}
|
||||
|
||||
|
||||
public List<Word> getAllWords() {
|
||||
|
||||
return HOcrPageParser.extractHocrPage(hOcrPageAbsolutePath).getAllWords();
|
||||
}
|
||||
|
||||
|
||||
public record Image(Integer pageNumber, AffineTransform ctm, QuadPoint position) {
|
||||
|
||||
public static Image fromOcrImage(OcrImage image) {
|
||||
|
||||
return new Image(image.getPageNumber(), image.getImageCTM(), image.getImageCoordinatesInInitialUserSpace());
|
||||
}
|
||||
|
||||
return HOcrPageParser.extractHocrPage(tesseractOutputFilePath).getAllWords();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,35 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
|
||||
public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, QuadPoint imageBoundingBox) {
|
||||
|
||||
public static OcrResultToWrite fromFontStyleDetectionModel(FontStyleDetectionModel fontStyleDetectionModel) {
|
||||
|
||||
return new OcrResultToWrite(fontStyleDetectionModel.getTextPositionInImages(), fontStyleDetectionModel.getImageBounds());
|
||||
}
|
||||
|
||||
|
||||
public static Map<Integer, List<OcrResultToWrite>> buildOcrResultsToWrite(List<OcrResult> ocrResults, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
return ocrResults.stream()
|
||||
.collect(Collectors.groupingBy(ocrResult -> ocrResult.image().getPageNumber()))
|
||||
.entrySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(Map.Entry::getKey,
|
||||
entry -> entry.getValue()
|
||||
.stream()
|
||||
.map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords()
|
||||
.stream()
|
||||
.filter(word -> !word.isBlank())
|
||||
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
|
||||
.toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
|
||||
.toList()));
|
||||
}
|
||||
}
|
||||
@ -6,7 +6,7 @@ public record PageInformation(int height, int width, int number, int rotationDeg
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
|
||||
return new PageInformation((int) page.getMediaBox().getHeight(), (int) page.getMediaBox().getWidth(), pageNum, page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -60,17 +60,6 @@ public class RenderedPageOcrImage implements OcrImage {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public QuadPoint getImageBounds() {
|
||||
|
||||
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, width), new Point2D.Double(height, width), new Point2D.Double(height, 0));
|
||||
} else {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, height), new Point2D.Double(width, height), new Point2D.Double(width, 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPageNumber() {
|
||||
|
||||
|
||||
@ -7,29 +7,35 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
|
||||
import io.github.karols.hocr4j.Bounds;
|
||||
import io.github.karols.hocr4j.Word;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TextPositionInImage {
|
||||
|
||||
QuadPoint position;
|
||||
String text;
|
||||
AffineTransform imageCTM;
|
||||
final QuadPoint position;
|
||||
final String text;
|
||||
final AffineTransform imageCTM;
|
||||
|
||||
@Setter
|
||||
FontMetricsFactory fontMetricsFactory;
|
||||
@Setter
|
||||
FontStyle fontStyle;
|
||||
|
||||
|
||||
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory) {
|
||||
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory, FontStyle fontStyle) {
|
||||
|
||||
this.position = QuadPoint.fromBounds(word.getBounds());
|
||||
this.text = word.getText();
|
||||
this.imageCTM = imageCTM;
|
||||
this.fontMetricsFactory = fontMetricsFactory;
|
||||
this.fontStyle = fontStyle;
|
||||
}
|
||||
|
||||
|
||||
@ -90,6 +96,13 @@ public class TextPositionInImage {
|
||||
}
|
||||
|
||||
|
||||
public double getTextHeight() {
|
||||
|
||||
var metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
|
||||
return fontMetricsFactory.calculateFontSize(text, getTransformedWidth()) * metrics.getHeightScaling();
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return position.a().distance(position.b());
|
||||
|
||||
@ -0,0 +1,58 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public final class FontStyleDetectionModel {
|
||||
|
||||
QuadPoint imageBounds;
|
||||
Pix image;
|
||||
List<TextPositionAndWordImage> textPositionsAndWordImages;
|
||||
|
||||
|
||||
public static FontStyleDetectionModel fromOcrResult(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory, OcrServiceSettings settings) {
|
||||
|
||||
var image = Leptonica1.pixRead(ocrResult.tesseractOutputFilePath() + ".tiff");
|
||||
var wordPixes = ocrResult.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> TextPositionAndWordImage.create(ocrResult.image().getImageCTM(), word, image, settings, fontMetricsFactory)).toList();
|
||||
|
||||
return new FontStyleDetectionModel(ocrResult.image().getImageCoordinatesInInitialUserSpace(), image, wordPixes);
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionInImage> getTextPositionInImages() {
|
||||
|
||||
return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getTextPositionInImage).toList();
|
||||
}
|
||||
|
||||
|
||||
public List<WordImage> getWordImages() {
|
||||
|
||||
return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getWordImage).toList();
|
||||
}
|
||||
|
||||
|
||||
public void dispose() {
|
||||
|
||||
LeptUtils.disposePix(image);
|
||||
getWordImages().forEach(WordImage::dispose);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,52 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
import lombok.Getter;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Getter
|
||||
public final class TextPositionAndWordImage implements Clusterable {
|
||||
|
||||
private final TextPositionInImage textPositionInImage;
|
||||
private final WordImage wordImage;
|
||||
|
||||
|
||||
public TextPositionAndWordImage(TextPositionInImage textPositionInImage, WordImage wordImage) {
|
||||
|
||||
this.textPositionInImage = textPositionInImage;
|
||||
this.wordImage = wordImage;
|
||||
}
|
||||
|
||||
|
||||
public static TextPositionAndWordImage create(AffineTransform imageCTM, Word word, Pix image, OcrServiceSettings settings, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
TextPositionInImage textPositionInImage = new TextPositionInImage(word, imageCTM, fontMetricsFactory, FontStyle.REGULAR);
|
||||
WordImage wordImage = new WordImage(textPositionInImage.getTextHeight(), word, image, settings);
|
||||
return new TextPositionAndWordImage(textPositionInImage, wordImage);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double[] getPoint() {
|
||||
|
||||
return wordImage.getPoint();
|
||||
}
|
||||
|
||||
|
||||
public double getTextHeight() {
|
||||
|
||||
return wordImage.getTextHeight();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,71 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.StrokeWidthCalculator;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Box;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class WordImage implements Clusterable {
|
||||
|
||||
Pix image;
|
||||
String text;
|
||||
double textHeight;
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
public WordImage(double textHeight, Word word, Pix originalImage, OcrServiceSettings settings) {
|
||||
|
||||
Box box = new Box(word.getBounds().getLeft(), word.getBounds().getTop(), word.getBounds().getWidth(), word.getBounds().getHeight(), 1);
|
||||
this.image = Leptonica1.pixClipRectangle(originalImage, box, null);
|
||||
box.clear();
|
||||
this.text = word.getText();
|
||||
this.textHeight = textHeight;
|
||||
this.settings = settings;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasLargerStrokeWidth(double strokeWidth) {
|
||||
|
||||
int roundedStrokeWidth = (int) Math.round(strokeWidth);
|
||||
double roundingError = (roundedStrokeWidth - strokeWidth) / strokeWidth;
|
||||
|
||||
// add 1 to open a bit bigger than the estimated regular stroke width
|
||||
Pix openedPix = Leptonica1.pixOpenBrick(null, image, roundedStrokeWidth + 1, roundedStrokeWidth + 1);
|
||||
|
||||
double openedPixelDensity = ImageProcessingUtils.calculatePixelDensity(openedPix);
|
||||
|
||||
double pixelDensity = ImageProcessingUtils.calculatePixelDensity(image);
|
||||
|
||||
LeptUtils.disposePix(openedPix);
|
||||
|
||||
return (openedPixelDensity * (1 + roundingError)) / pixelDensity > (settings.getBoldThreshold());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double[] getPoint() {
|
||||
|
||||
return new double[]{textHeight};
|
||||
}
|
||||
|
||||
|
||||
public void dispose() {
|
||||
|
||||
LeptUtils.disposePix(image);
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,24 +3,18 @@ package com.knecon.fforesight.service.ocr.processor.service;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.LinkedTransferQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.azure.core.implementation.GeoObjectHelper;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||
@ -32,7 +26,6 @@ import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
|
||||
@ -9,6 +9,7 @@ import java.io.OutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.IntStream;
|
||||
@ -20,8 +21,10 @@ import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
@ -44,6 +47,7 @@ public class OCRService {
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
OcrResultWriter ocrResultWriter;
|
||||
GhostScriptService ghostScriptService;
|
||||
FontStyleDetector boldDetector;
|
||||
|
||||
|
||||
/**
|
||||
@ -135,9 +139,14 @@ public class OCRService {
|
||||
ocrThread.join();
|
||||
}
|
||||
|
||||
log.info("OCR processing has finished, writing results");
|
||||
log.info("Tesseract OCR has finished for file {} and dossier {}", fileId, dossierId);
|
||||
|
||||
timestamp = System.currentTimeMillis();
|
||||
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, ocrResults);
|
||||
Map<Integer, List<OcrResultToWrite>> imageWithTextPositionsPerPage = boldDetector.detectBold(ocrResults, document);
|
||||
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
timestamp = System.currentTimeMillis();
|
||||
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage);
|
||||
log.info("Saving document");
|
||||
document.saveIncremental(out, dictionariesToUpdate);
|
||||
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
@ -2,11 +2,11 @@ package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
@ -20,11 +20,9 @@ import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentPrope
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -44,19 +42,17 @@ public class OcrResultWriter {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, List<OcrResult> ocrResults) {
|
||||
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
FontMetricsFactory fontMetricsFactory = new Type0FontMetricsFactory(document);
|
||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||
Map<Integer, List<OcrResult>> resultsPerPage = ocrResults.stream().collect(Collectors.groupingBy(result -> result.image().pageNumber()));
|
||||
resultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, resultsPerPage, dictionariesToUpdate, fontMetricsFactory));
|
||||
imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate));
|
||||
dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
|
||||
return dictionariesToUpdate;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawResultsPerPage(PDDocument document, Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, Set<COSDictionary> dictionariesToUpdate, FontMetricsFactory fontMetricsFactory) {
|
||||
private void drawResultsPerPage(PDDocument document, Integer pageNumber, List<OcrResultToWrite> ocrResultToWrite, Set<COSDictionary> dictionariesToUpdate) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
|
||||
@ -69,7 +65,7 @@ public class OcrResultWriter {
|
||||
|
||||
escapeContentStreams(document, pdPage);
|
||||
|
||||
List<TextPositionInImage> words = buildTextPositionsOnPage(pageNumber, resultsPerPage, fontMetricsFactory);
|
||||
List<TextPositionInImage> words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
// write invisible ocr text inside tagged content
|
||||
@ -86,7 +82,6 @@ public class OcrResultWriter {
|
||||
// write visible ocr text inside optional group
|
||||
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setNonStrokingColor(Color.BLUE);
|
||||
words.forEach(word -> drawVisibleWord(word, contentStream));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
@ -94,7 +89,9 @@ public class OcrResultWriter {
|
||||
// write word bounding boxes (tesseract output) inside optional group
|
||||
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
resultsPerPage.get(pageNumber).stream().map(OcrResult::image).forEach(image -> drawGrid(contentStream, image.position()));
|
||||
ocrResultToWrite.stream()
|
||||
.map(OcrResultToWrite::imageBoundingBox)
|
||||
.forEach(imagePosition -> drawGrid(contentStream, imagePosition));
|
||||
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
@ -105,15 +102,6 @@ public class OcrResultWriter {
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionInImage> buildTextPositionsOnPage(Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
return resultsPerPage.get(pageNumber)
|
||||
.stream()
|
||||
.flatMap(result -> result.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> new TextPositionInImage(word, result.image().ctm(), fontMetricsFactory)))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
|
||||
// We need to append to the contentstream, otherwise the content could be overlapped by images
|
||||
@ -196,6 +184,11 @@ public class OcrResultWriter {
|
||||
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
|
||||
|
||||
try {
|
||||
contentStream.setNonStrokingColor(switch (position.getFontStyle()) {
|
||||
case BOLD -> Color.RED;
|
||||
case ITALIC -> Color.GREEN;
|
||||
default -> Color.BLUE;
|
||||
});
|
||||
contentStream.beginText();
|
||||
contentStream.setRenderingMode(renderingMode);
|
||||
contentStream.setFont(position.getFont(), (float) position.getFontSize());
|
||||
|
||||
@ -16,12 +16,14 @@ public class Statistics {
|
||||
AtomicLong pdf2ImgDuration;
|
||||
AtomicLong writingTextDuration;
|
||||
AtomicLong imageProcessingDuration;
|
||||
AtomicLong fontStyleDetectionDuration;
|
||||
|
||||
|
||||
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
||||
|
||||
this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L)));
|
||||
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
||||
this.fontStyleDetectionDuration = new AtomicLong(0);
|
||||
this.pdf2ImgDuration = new AtomicLong(0);
|
||||
this.writingTextDuration = new AtomicLong(0);
|
||||
this.imageProcessingDuration = new AtomicLong(0);
|
||||
@ -57,12 +59,17 @@ public class Statistics {
|
||||
writingTextDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
public void increaseFontStyleDetectionDuration(long duration) {
|
||||
|
||||
fontStyleDetectionDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format(
|
||||
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
|
||||
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s, FontstyleDetection=%.2f s",
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
@ -71,7 +78,8 @@ public class Statistics {
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
(float) imageProcessingDuration.get() / 1000,
|
||||
(float) pdf2ImgDuration.get() / 1000,
|
||||
(float) writingTextDuration.get() / 1000);
|
||||
(float) writingTextDuration.get() / 1000,
|
||||
(float) fontStyleDetectionDuration.get() / 1000);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -36,6 +36,7 @@ public interface FontMetricsFactory {
|
||||
|
||||
PDFont getFont();
|
||||
|
||||
|
||||
HeightAndDescent calculateHeightAndDescent(String text);
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
/** Font styles the OCR font-style detection can assign to a recognized word. */
public enum FontStyle {
    /** Default style for all words before detection runs. */
    REGULAR,
    BOLD,
    ITALIC
}
|
||||
@ -1,6 +1,9 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.fontbox.ttf.GlyphData;
|
||||
import org.apache.fontbox.ttf.TTFParser;
|
||||
@ -12,22 +15,41 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import software.amazon.awssdk.services.s3.endpoints.internal.Value;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||
|
||||
private final PDType0Font type0Font;
|
||||
private final TrueTypeFont trueTypeFont;
|
||||
|
||||
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
|
||||
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
|
||||
|
||||
|
||||
public static Type0FontMetricsFactory regular(PDDocument document) {
|
||||
|
||||
return createFromResource("fonts/cmu-regular.ttf", document);
|
||||
}
|
||||
|
||||
|
||||
public static Type0FontMetricsFactory bold(PDDocument document) {
|
||||
|
||||
return createFromResource("fonts/cmu-bold.ttf", document);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Type0FontMetricsFactory(PDDocument document) {
|
||||
private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
|
||||
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream("fonts/cmu-regular.ttf"); var buffer = new RandomAccessReadBuffer(in)) {
|
||||
this.trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||
this.type0Font = PDType0Font.load(document, this.trueTypeFont, false); // use Type0Font for unicode support
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
|
||||
TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||
PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
|
||||
return new Type0FontMetricsFactory(type0Font, trueTypeFont);
|
||||
}
|
||||
}
|
||||
|
||||
@ -55,8 +77,9 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||
if (glyph == null || glyph.getBoundingBox() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
descent = Math.min(descent, glyph.getYMinimum());
|
||||
if (!slashGlyphIds.contains(glyphId)) {
|
||||
descent = Math.min(descent, glyph.getYMinimum());
|
||||
}
|
||||
height = Math.max(height, glyph.getYMaximum());
|
||||
} catch (Exception e) {
|
||||
log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);
|
||||
|
||||
@ -0,0 +1,158 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.TextPositionAndWordImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.WordImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class FontStyleDetector {
|
||||
|
||||
OcrServiceSettings settings;
|
||||
StrokeWidthCalculator strokeWidthCalculator;
|
||||
|
||||
|
||||
/**
|
||||
* Implementation of the MOBDoB algorithm, refer to the paper here:
|
||||
* <a href="http://mile.ee.iisc.ac.in/publications/softCopy/DocumentAnalysis/Sai_NCVPRIPG2013.pdf">Script Independent Detection of Bold Words in Multi Font-size Documents</a>
|
||||
* <p>
|
||||
* As a high level overview: We cluster all text based on its font size. We determine the cluster with the most words. This is assumed to be regular text.
|
||||
* We then estimate the average stroke width of that cluster by thinning all text to a single pixel and calculating the ratio of remaining pixels.
|
||||
* (<a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>)
|
||||
* For each word we scale this average strokewidth based on its fontsize compared to the most common fontsize.
|
||||
* Using the scaled strokewidth we do an opening operation.
|
||||
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
|
||||
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
|
||||
* <p>
|
||||
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
|
||||
* But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
|
||||
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
|
||||
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
|
||||
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
|
||||
*/
|
||||
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {
|
||||
|
||||
FontMetricsFactory fontMetricsFactory = Type0FontMetricsFactory.regular(document);
|
||||
if (!settings.isBoldDetection()) {
|
||||
return OcrResultToWrite.buildOcrResultsToWrite(ocrResults, fontMetricsFactory);
|
||||
}
|
||||
|
||||
Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage = new HashMap<>();
|
||||
|
||||
DBSCANClusterer<TextPositionAndWordImage> clusterer = new DBSCANClusterer<>(0.5, 1);
|
||||
|
||||
FontMetricsFactory boldFontMetricsFactory = Type0FontMetricsFactory.bold(document);
|
||||
|
||||
for (OcrResult result : ocrResults) {
|
||||
FontStyleDetectionModel fontStyleDetectionModel = FontStyleDetectionModel.fromOcrResult(result, fontMetricsFactory, settings);
|
||||
|
||||
List<Cluster<TextPositionAndWordImage>> clusters = clusterer.cluster(fontStyleDetectionModel.getTextPositionsAndWordImages());
|
||||
Optional<Cluster<TextPositionAndWordImage>> largestCluster = clusters.stream().max(Comparator.comparingInt(cluster -> cluster.getPoints().size()));
|
||||
|
||||
if (largestCluster.isEmpty()) {
|
||||
insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
|
||||
continue;
|
||||
}
|
||||
|
||||
List<TextPositionAndWordImage> wordsWithMostCommonTextHeight = largestCluster.get().getPoints();
|
||||
|
||||
double standardTextHeight = calculateStandardTextheight(wordsWithMostCommonTextHeight);
|
||||
double regularStrokeWidth = calculateRegularStrokeWidth(wordsWithMostCommonTextHeight);
|
||||
|
||||
for (TextPositionAndWordImage textPositionsAndWordImage : fontStyleDetectionModel.getTextPositionsAndWordImages()) {
|
||||
decideOnFontStyle(textPositionsAndWordImage, regularStrokeWidth, standardTextHeight, boldFontMetricsFactory);
|
||||
}
|
||||
|
||||
insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
|
||||
fontStyleDetectionModel.dispose();
|
||||
}
|
||||
|
||||
log.info("Finished bold detection");
|
||||
return ocrResultToWritePerPage;
|
||||
}
|
||||
|
||||
|
||||
private static double calculateStandardTextheight(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
|
||||
|
||||
return wordsWithMostCommonTextHeight.stream()
|
||||
.map(TextPositionAndWordImage::getWordImage)
|
||||
.mapToDouble(WordImage::getTextHeight)
|
||||
.filter(Double::isFinite)
|
||||
.average()
|
||||
.orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
private double calculateRegularStrokeWidth(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
|
||||
|
||||
return wordsWithMostCommonTextHeight.stream()
|
||||
.mapToDouble(textPositionAndWordImage -> strokeWidthCalculator.calculate(textPositionAndWordImage.getWordImage().getImage()))
|
||||
.filter(Double::isFinite)
|
||||
.average()
|
||||
.orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
private static void insertResultIntoMap(int pageNumber, Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage, FontStyleDetectionModel fontStyleDetectionModel) {
|
||||
|
||||
OcrResultToWrite ocrResult = OcrResultToWrite.fromFontStyleDetectionModel(fontStyleDetectionModel);
|
||||
|
||||
ocrResultToWritePerPage.compute(pageNumber, (key, existingList) -> {
|
||||
if (existingList == null) {
|
||||
return List.of(ocrResult);
|
||||
} else {
|
||||
return Stream.concat(existingList.stream(), Stream.of(ocrResult)).toList();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private void decideOnFontStyle(TextPositionAndWordImage textPositionsAndWordImage,
|
||||
double standardStrokeWidth,
|
||||
double standardTextHeight,
|
||||
FontMetricsFactory boldFontMetricsFactory) {
|
||||
|
||||
double scaledStrokeWidth = scaleStrokeWidthByFontSize(textPositionsAndWordImage, standardStrokeWidth, standardTextHeight);
|
||||
|
||||
if (textPositionsAndWordImage.getWordImage().hasLargerStrokeWidth(scaledStrokeWidth)) {
|
||||
textPositionsAndWordImage.getTextPositionInImage().setFontMetricsFactory(boldFontMetricsFactory);
|
||||
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.BOLD);
|
||||
} else {
|
||||
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.REGULAR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static double scaleStrokeWidthByFontSize(TextPositionAndWordImage textPositionsAndWordImage, double standardStrokeWidth, double standardFontSize) {
|
||||
|
||||
double influenceOfFontSize = 1.0; // the paper states that stroke width scales exactly linearly with font size. This did not seem to be true for me. Maybe some of the preprocessing steps are affecting this.
|
||||
double fontsizeScalingFactor = Math.sqrt(textPositionsAndWordImage.getWordImage().getTextHeight() / standardFontSize);
|
||||
return standardStrokeWidth + (influenceOfFontSize * (fontsizeScalingFactor - 1) * standardStrokeWidth);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,57 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.Sel;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
/**
|
||||
* This code is a good start for detecting italic text, although it has a few issues especially with glyphs which are naturally slanted. E.g. z, 2, 7, /
|
||||
* If we want this maybe we should exclude these glyphs and then it might have less false positives. But in its current state i don't recommend using it.
|
||||
*/
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ItalicDetector {
|
||||
|
||||
|
||||
static String italicKernel = "ooxxooxxooxxoxxooXxooxxoxxooxxooxxoo";
|
||||
Sel italicSel = Leptonica1.selCreateFromString(italicKernel, 9, 4, "italicKernel");
|
||||
Sel brickSel = Leptonica1.selCreateBrick(3, 4, 1, 2, 1);
|
||||
|
||||
|
||||
public boolean isItalic(Pix pix) {
|
||||
|
||||
Pix preprocessed = preprocess(pix);
|
||||
Pix flipped = Leptonica1.pixFlipLR(null, pix);
|
||||
Pix flippedPreprocessed = preprocess(flipped);
|
||||
Leptonica1.pixFlipLR(flippedPreprocessed, flippedPreprocessed);
|
||||
double pixelDensity = ImageProcessingUtils.calculatePixelDensity(preprocessed);
|
||||
double flippedPixelDensity = ImageProcessingUtils.calculatePixelDensity(flippedPreprocessed);
|
||||
LeptUtils.disposePix(preprocessed);
|
||||
LeptUtils.disposePix(flipped);
|
||||
LeptUtils.disposePix(flippedPreprocessed);
|
||||
return flippedPixelDensity / pixelDensity < 0.85;
|
||||
}
|
||||
|
||||
|
||||
private Pix preprocess(Pix pix) {
|
||||
|
||||
Pix eroded = Leptonica1.pixErode(null, pix, italicSel.getPointer());
|
||||
Pix dilated = Leptonica1.pixDilate(null, eroded, brickSel.getPointer());
|
||||
LeptUtils.disposePix(eroded);
|
||||
return dilated;
|
||||
}
|
||||
|
||||
|
||||
public void dispose() {
|
||||
|
||||
LeptUtils.dispose(italicSel);
|
||||
LeptUtils.dispose(brickSel);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,58 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||
|
||||
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
|
||||
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.Sela;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Service
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class StrokeWidthCalculator {
|
||||
|
||||
Sela thinningSel;
|
||||
|
||||
|
||||
/**
|
||||
* Uses a series of sels to thin all connected lines to a single pixel. Then the pixel ratio is a good estimation of the stroke width in pixels.
|
||||
* <a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>
|
||||
* Since the baseline is a strokewidth of exactly one, we need to add 1 to the result.
|
||||
*
|
||||
* @param input binarized pix with text on it
|
||||
* @return estimated stroke width in pixels
|
||||
*/
|
||||
public double calculate(Pix input) {
|
||||
|
||||
init();
|
||||
|
||||
Pix thinned = Leptonica1.pixThinConnectedBySet(input, L_THIN_FG, thinningSel, 0);
|
||||
|
||||
IntBuffer thinnedPixelCount = IntBuffer.allocate(1);
|
||||
Leptonica1.pixCountPixels(thinned, thinnedPixelCount, null);
|
||||
|
||||
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||
Leptonica1.pixCountPixels(input, pixelCount, null);
|
||||
|
||||
LeptUtils.disposePix(thinned);
|
||||
|
||||
return (double) pixelCount.get() / thinnedPixelCount.get() + 1;
|
||||
}
|
||||
|
||||
|
||||
private void init() {
|
||||
|
||||
if (thinningSel == null) {
|
||||
thinningSel = Leptonica1.selaMakeThinSets(1, 0);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -17,7 +17,6 @@ import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
|
||||
/*
|
||||
This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously
|
||||
*/
|
||||
@ -38,9 +37,17 @@ public class BlockingQueueFiller extends Thread {
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (!allImagesQueued) {
|
||||
try {
|
||||
while (!allImagesQueued) {
|
||||
final UnprocessedImage image = imageInputQueue.take();
|
||||
imageOutputQueue.put(image);
|
||||
try {
|
||||
imageOutputQueue.put(image);
|
||||
} catch (InterruptedException e) {
|
||||
imageOutputQueue.put(image);
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
log.info("All images extracted, emptying processing queue and stopping");
|
||||
}
|
||||
|
||||
// empty the queue
|
||||
@ -54,4 +61,5 @@ public class BlockingQueueFiller extends Thread {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,8 +4,6 @@ import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
@ -29,6 +27,8 @@ import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.L_Kernel;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
@ -45,6 +45,7 @@ public class ImageProcessingThread extends Thread {
|
||||
final BlockingQueue<UnprocessedImage> imageInputQueue;
|
||||
final BlockingQueue<OcrImage> imageOutputQueue;
|
||||
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.2f, 1);
|
||||
final Statistics stats;
|
||||
final OcrServiceSettings settings;
|
||||
final PDDocument document;
|
||||
@ -81,7 +82,9 @@ public class ImageProcessingThread extends Thread {
|
||||
log.debug("No images left in processing queue, stopping.");
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIEnd(this.detectionScriptHandle);
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
LeptUtils.dispose(gaussianKernel);
|
||||
}
|
||||
|
||||
|
||||
@ -106,7 +109,7 @@ public class ImageProcessingThread extends Thread {
|
||||
|
||||
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
||||
|
||||
Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
||||
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||
@ -129,7 +132,7 @@ public class ImageProcessingThread extends Thread {
|
||||
|
||||
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
|
||||
|
||||
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||
Pix pix = processPix(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||
@ -163,7 +166,7 @@ public class ImageProcessingThread extends Thread {
|
||||
|
||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
scriptureNameBuffer = new PointerByReference();
|
||||
scriptureNameBuffer = new PointerByReference(); // Is this memory being freed?
|
||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
|
||||
int orientationDegree = 0;
|
||||
@ -183,15 +186,58 @@ public class ImageProcessingThread extends Thread {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {
|
||||
private Pix processPix(Pix pix, float imageDpi, int targetDpi) {
|
||||
|
||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix);
|
||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
||||
Pix grayScale;
|
||||
Pix scaledUp;
|
||||
Pix gaussian;
|
||||
Pix binarized;
|
||||
|
||||
//convert to grayscale
|
||||
if (pix.d == 8) {
|
||||
grayScale = pix;
|
||||
} else if (pix.d == 32) {
|
||||
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
} else if (pix.d == 1) {
|
||||
grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
||||
}
|
||||
|
||||
// scale up
|
||||
float targetFactor = targetDpi / imageDpi;
|
||||
if (targetFactor > 2.1) {
|
||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||
} else if (targetFactor > 1.1) {
|
||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||
} else {
|
||||
scaledUp = grayScale;
|
||||
}
|
||||
|
||||
// remove noise and prep for Otsu
|
||||
gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
|
||||
|
||||
// Threshold to binary
|
||||
if (pix.w < 100 || pix.h < 100) {
|
||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||
} else {
|
||||
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null);
|
||||
|
||||
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
|
||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||
}
|
||||
}
|
||||
|
||||
LeptUtils.disposePix(pix);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
LeptUtils.disposePix(scaledUp);
|
||||
LeptUtils.disposePix(gaussian);
|
||||
|
||||
return binarized;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
|
||||
@ -116,16 +116,11 @@ public class OCRThread extends Thread {
|
||||
@SneakyThrows
|
||||
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
||||
|
||||
if (settings.isDebug()) {
|
||||
String[] a = tesseractOutputFileName.split("/");
|
||||
String folder = "/tmp/pixs/" + a[a.length - 3];
|
||||
new File(folder).mkdirs();
|
||||
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
|
||||
}
|
||||
|
||||
Leptonica1.pixWrite(tesseractOutputFileName + ".tiff", pix, 5); // write the used image for later bold detection
|
||||
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
||||
instance.setPageSegMode(psm);
|
||||
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -19,10 +19,12 @@ public class OcrServiceSettings {
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
int minImageWidth = 20; // Minimum width for images to be processed
|
||||
float minRotationConfidence = 2; //
|
||||
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
|
||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean removeWatermark; // If true, watermarks will be removed
|
||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||
boolean boldDetection = true; // if true, bold detection will be attempted
|
||||
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
|
||||
|
||||
}
|
||||
|
||||
@ -6,14 +6,17 @@ import java.awt.Graphics;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import net.sourceforge.lept4j.L_Kernel;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@ -37,67 +40,6 @@ public class ImageProcessingUtils {
|
||||
}
|
||||
|
||||
|
||||
public static Pix despecklePix(Pix pix) {
|
||||
|
||||
assert pix.d == 8;
|
||||
Pix despeckled;
|
||||
if (pix.w < 100 || pix.h < 100) {
|
||||
// too small to properly despeckle, just binarize instead.
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
} else {
|
||||
despeckled = LeptUtils.despeckle(pix,
|
||||
LeptUtils.SEL_STR3,
|
||||
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
||||
if (despeckled == null) {
|
||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
||||
}
|
||||
}
|
||||
if (pix != despeckled) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
return despeckled;
|
||||
}
|
||||
|
||||
|
||||
public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
|
||||
|
||||
float targetFactor = targetDpi / imageDpi;
|
||||
|
||||
if (targetFactor > 3) {
|
||||
Pix scaledUp;
|
||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
return scaledUp;
|
||||
} else if (targetFactor > 1.9) {
|
||||
Pix scaledUp;
|
||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
return scaledUp;
|
||||
} else {
|
||||
return grayScale;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Pix convertToGrayScale(Pix pix) {
|
||||
|
||||
if (pix.d == 8) {
|
||||
return pix;
|
||||
} else if (pix.d == 32) {
|
||||
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
} else if (pix.d == 1) {
|
||||
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||
LeptUtils.disposePix(pix);
|
||||
return grayScale;
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Pix deRotatePix(int orientDegree, Pix pix) {
|
||||
|
||||
return switch (360 - orientDegree) {
|
||||
@ -128,4 +70,16 @@ public class ImageProcessingUtils {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static double calculatePixelDensity(Pix pix) {
|
||||
|
||||
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||
int result = Leptonica1.pixCountPixels(pix, pixelCount, null);
|
||||
if (result == 0) {
|
||||
return (double) pixelCount.get() / (pix.h * pix.w);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import net.sourceforge.lept4j.L_Kernel;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
|
||||
@UtilityClass
|
||||
public class KernelUtils {
|
||||
|
||||
/*
|
||||
-1, -1, -1
|
||||
-1, 8, -1
|
||||
-1, -1, -1
|
||||
*/
|
||||
public L_Kernel createFullLaplacianKernel() {
|
||||
|
||||
L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 8);
|
||||
return laplacianKernel;
|
||||
}
|
||||
|
||||
/*
|
||||
0, 0, -1, 0, 0
|
||||
0, -1, -1, -1, 0
|
||||
-1, -1, 12, -1, -1
|
||||
0, -1, -1, -1, 0
|
||||
0, 0, -1, 0, 0
|
||||
*/
|
||||
public L_Kernel createLaplacianKernel5x5() {
|
||||
|
||||
L_Kernel laplacianKernel = Leptonica1.kernelCreate(5, 5);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 3, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 3, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 4, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 3, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 3, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 3, 3, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 4, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 2, 12);
|
||||
return laplacianKernel;
|
||||
}
|
||||
|
||||
/*
|
||||
0, -1, 0
|
||||
-1, 4, -1
|
||||
0, -1, 0
|
||||
*/
|
||||
public L_Kernel createLaplacianKernel() {
|
||||
|
||||
L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 4);
|
||||
return laplacianKernel;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -138,4 +138,11 @@ public class Tesseract2 extends Tesseract1 {
|
||||
return renderer;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void dispose() {
|
||||
|
||||
TessBaseAPIEnd(getHandle());
|
||||
TessBaseAPIDelete(getHandle());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -20,7 +20,7 @@ class Type0FontMetricsFactoryTest {
|
||||
public void testStringWidth() {
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
|
||||
Type0FontMetricsFactory metricsFactory = new Type0FontMetricsFactory(document);
|
||||
Type0FontMetricsFactory metricsFactory = Type0FontMetricsFactory.regular(document);
|
||||
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
|
||||
}
|
||||
|
||||
|
||||
@ -41,8 +41,15 @@ fforesight:
|
||||
ignored-endpoints: [ '/actuator/health', '/actuator/health/**' ]
|
||||
enabled: true
|
||||
|
||||
logging.pattern.level: "%5p [${spring.application.name},%X{traceId:-},%X{spanId:-}]"
|
||||
|
||||
management:
|
||||
tracing:
|
||||
sampling:
|
||||
probability: 1.0
|
||||
otlp:
|
||||
tracing:
|
||||
endpoint: http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces
|
||||
endpoint:
|
||||
metrics.enabled: ${monitoring.enabled:false}
|
||||
prometheus.enabled: ${monitoring.enabled:false}
|
||||
|
||||
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcr() {
|
||||
|
||||
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
||||
String text = testOCR("files/402Study.pdf");
|
||||
}
|
||||
|
||||
|
||||
@ -162,13 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcrForSpecificFile() {
|
||||
|
||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/ITEM 23_A15149W - Dermal absorption of formulated product.pdf"));
|
||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 16_Toxicidade Cutanea Aguda.pdf"));
|
||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles2/A16361B - Acute Dermal Toxicity Study in Rats.pdf"));
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user