Merge branch 'RED-7669-fontstyle' into 'master'

RED-8155: integrate bold-detection into ocr-service

Closes RED-7669

See merge request redactmanager/ocr-service!31
This commit is contained in:
Kilian Schüttler 2024-01-05 16:05:53 +01:00
commit bab16ad9b2
32 changed files with 785 additions and 181 deletions

View File

@ -15,11 +15,17 @@ The service uses PDFTron to attempt the removal of invisible elements and waterm
Extracts all images from the PDF using PDFBox
3. Striped Image Detection and Stitching
Detects if images are striped and stitches them together using Ghostscript.
4. Binarization
Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm.
4. Image Processing
- Convert to grayscale
- Upscale to target DPI
- Filter using Gauss kernel
- Binarize the resulting images using Leptonica and the Otsu thresholding algorithm
- Despeckle using various morphological operations
5. OCR Processing
Runs Tesseract on the images to extract text.
6. Text Integration
6. Font Style Detection
Detects bold text using stroke width estimation.
7. Text Integration
Draws the resulting text onto the original PDF using PDFBox.
Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage.

View File

@ -25,6 +25,8 @@ tasks.named<Test>("test") {
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
minHeapSize = "512m"
maxHeapSize = "8192m"
}
tasks.test {

View File

@ -20,6 +20,7 @@ dependencies {
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
api("org.apache.commons:commons-math3:3.6.1")
api("io.github.karols:hocr4j:0.2.0")
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")

View File

@ -31,9 +31,19 @@ public interface OcrImage {
int getNumberOnPage();
/**
* Retrieves the height of the original image (not necessarily in pdf coordinates).
*
* @return the height of the image
*/
int getHeight();
/**
* Retrieves the width of the original image (not necessarily in pdf coordinates).
*
* @return the width of the image
*/
int getWidth();
@ -44,7 +54,7 @@ public interface OcrImage {
*/
default QuadPoint getImageBounds() {
// cannot be solved with a nice rotation matrix, since the after rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
// cannot be solved with a nice rotation matrix. After rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
// So in order to mimic this behavior we need to start with (0,0) coordinates always.
if (getRotationDegrees() == 90 || getRotationDegrees() == 270) {
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getWidth()), new Point2D.Double(getHeight(), getWidth()), new Point2D.Double(getHeight(), 0));
@ -65,13 +75,6 @@ public interface OcrImage {
}
@SneakyThrows
default BufferedImage getBufferedImage() {
return LeptUtils.convertPixToImage(getPix());
}
/**
* Retrieves the rotation degree of the OCR image.
*
@ -86,6 +89,10 @@ public interface OcrImage {
* @return The optimal page segmentation mode.
*/
default int getOptimalPageSegmentationMode() {
if (getWidth() < 200 || getHeight() < 200) {
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
}
return ITessAPI.TessPageSegMode.PSM_AUTO;
} // TODO: evaluate if PSM can be dynamically chosen to increase performance
@ -112,17 +119,6 @@ public interface OcrImage {
AffineTransform getImageCTM();
/**
* Retrieves the size (width * height) of the image.
*
* @return The size of the image.
*/
default int getImageSize() {
return getHeight() * getWidth();
}
default void destroyPix() {
LeptUtils.disposePix(getPix());

View File

@ -7,27 +7,17 @@ import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser;
import io.github.karols.hocr4j.Word;
public record OcrResult(Image image, String hOcrPageAbsolutePath) {
public record OcrResult(OcrImage image, String tesseractOutputFilePath) {
public static OcrResult create(OcrImage image, String tesseractResult) {
return new OcrResult(Image.fromOcrImage(image), tesseractResult);
return new OcrResult(image, tesseractResult);
}
public List<Word> getAllWords() {
return HOcrPageParser.extractHocrPage(hOcrPageAbsolutePath).getAllWords();
}
public record Image(Integer pageNumber, AffineTransform ctm, QuadPoint position) {
public static Image fromOcrImage(OcrImage image) {
return new Image(image.getPageNumber(), image.getImageCTM(), image.getImageCoordinatesInInitialUserSpace());
}
return HOcrPageParser.extractHocrPage(tesseractOutputFilePath).getAllWords();
}
}

View File

@ -0,0 +1,35 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
/**
 * The per-page payload handed to the result writer: the positioned (and possibly
 * style-annotated) words of one OCR'd image plus that image's bounding box in PDF space.
 */
public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, QuadPoint imageBoundingBox) {

    /**
     * Wraps a finished font-style detection run, keeping its text positions and image bounds.
     *
     * @param fontStyleDetectionModel model holding the detected text positions and the image bounds
     * @return a new {@code OcrResultToWrite} over the model's data
     */
    public static OcrResultToWrite fromFontStyleDetectionModel(FontStyleDetectionModel fontStyleDetectionModel) {
        return new OcrResultToWrite(fontStyleDetectionModel.getTextPositionInImages(), fontStyleDetectionModel.getImageBounds());
    }

    /**
     * Groups the given OCR results by page number and converts each into an
     * {@code OcrResultToWrite} whose words are all tagged {@link FontStyle#REGULAR}
     * (the path used when font-style detection is disabled). Blank words are dropped.
     *
     * @param ocrResults         raw OCR results to convert
     * @param fontMetricsFactory factory supplying font metrics for each word
     * @return results to write, keyed by page number
     */
    public static Map<Integer, List<OcrResultToWrite>> buildOcrResultsToWrite(List<OcrResult> ocrResults, FontMetricsFactory fontMetricsFactory) {
        // Single-pass groupingBy with a mapping downstream collector; equivalent to the
        // previous group-then-re-collect pipeline but without rebuilding the map.
        return ocrResults.stream()
                .collect(Collectors.groupingBy(
                        ocrResult -> ocrResult.image().getPageNumber(),
                        Collectors.mapping(ocrResult -> toResultToWrite(ocrResult, fontMetricsFactory), Collectors.toList())));
    }

    // Converts one OcrResult into an OcrResultToWrite with every non-blank word marked REGULAR.
    private static OcrResultToWrite toResultToWrite(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
        List<TextPositionInImage> words = ocrResult.getAllWords()
                .stream()
                .filter(word -> !word.isBlank())
                .map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
                .toList();
        return new OcrResultToWrite(words, ocrResult.image().getImageCoordinatesInInitialUserSpace());
    }
}

View File

@ -6,7 +6,7 @@ public record PageInformation(int height, int width, int number, int rotationDeg
public static PageInformation fromPDPage(int pageNum, PDPage page) {
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
return new PageInformation((int) page.getMediaBox().getHeight(), (int) page.getMediaBox().getWidth(), pageNum, page.getRotation());
}
}

View File

@ -60,17 +60,6 @@ public class RenderedPageOcrImage implements OcrImage {
}
@Override
public QuadPoint getImageBounds() {
if (rotationDegrees == 90 || rotationDegrees == 270) {
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, width), new Point2D.Double(height, width), new Point2D.Double(height, 0));
} else {
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, height), new Point2D.Double(width, height), new Point2D.Double(width, 0));
}
}
@Override
public int getPageNumber() {

View File

@ -7,29 +7,35 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import io.github.karols.hocr4j.Bounds;
import io.github.karols.hocr4j.Word;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
@Getter
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TextPositionInImage {
QuadPoint position;
String text;
AffineTransform imageCTM;
final QuadPoint position;
final String text;
final AffineTransform imageCTM;
@Setter
FontMetricsFactory fontMetricsFactory;
@Setter
FontStyle fontStyle;
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory) {
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory, FontStyle fontStyle) {
this.position = QuadPoint.fromBounds(word.getBounds());
this.text = word.getText();
this.imageCTM = imageCTM;
this.fontMetricsFactory = fontMetricsFactory;
this.fontStyle = fontStyle;
}
@ -90,6 +96,13 @@ public class TextPositionInImage {
}
public double getTextHeight() {
var metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
return fontMetricsFactory.calculateFontSize(text, getTransformedWidth()) * metrics.getHeightScaling();
}
public double getHeight() {
return position.a().distance(position.b());

View File

@ -0,0 +1,58 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import java.util.List;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Holds everything the font-style detector needs for one OCR'd image: the image bounds
 * in PDF space, the full page image Tesseract worked on, and one entry per recognized
 * word pairing its text position with a clipped word image.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public final class FontStyleDetectionModel {

    QuadPoint imageBounds;
    Pix image;
    List<TextPositionAndWordImage> textPositionsAndWordImages;

    /**
     * Builds a detection model for one OCR result: loads the TIFF written alongside the
     * Tesseract output and clips a sub-image for every non-blank recognized word.
     * NOTE(review): Leptonica1.pixRead returns null when the TIFF is missing or unreadable;
     * this assumes the file always exists — confirm the upstream pipeline guarantees it.
     */
    public static FontStyleDetectionModel fromOcrResult(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory, OcrServiceSettings settings) {
        var pageImage = Leptonica1.pixRead(ocrResult.tesseractOutputFilePath() + ".tiff");
        var wordEntries = ocrResult.getAllWords()
                .stream()
                .filter(word -> !word.isBlank())
                .map(word -> TextPositionAndWordImage.create(ocrResult.image().getImageCTM(), word, pageImage, settings, fontMetricsFactory))
                .toList();
        return new FontStyleDetectionModel(ocrResult.image().getImageCoordinatesInInitialUserSpace(), pageImage, wordEntries);
    }

    /** @return the text positions of all words, in recognition order. */
    public List<TextPositionInImage> getTextPositionInImages() {
        return textPositionsAndWordImages.stream()
                .map(entry -> entry.getTextPositionInImage())
                .toList();
    }

    /** @return the clipped word images of all words, in recognition order. */
    public List<WordImage> getWordImages() {
        return textPositionsAndWordImages.stream()
                .map(entry -> entry.getWordImage())
                .toList();
    }

    /** Frees the native page image and every clipped word image. */
    public void dispose() {
        LeptUtils.disposePix(image);
        getWordImages().forEach(WordImage::dispose);
    }
}

View File

@ -0,0 +1,52 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import java.awt.geom.AffineTransform;
import java.util.Objects;
import org.apache.commons.math3.ml.clustering.Clusterable;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import io.github.karols.hocr4j.Word;
import lombok.Getter;
import net.sourceforge.lept4j.Pix;
/**
 * Pairs a recognized word's text position with its clipped word image.
 * Implements {@link Clusterable} so words can be clustered by text height;
 * the clustering coordinate is delegated to the word image.
 */
@Getter
public final class TextPositionAndWordImage implements Clusterable {

    private final TextPositionInImage textPositionInImage;
    private final WordImage wordImage;

    public TextPositionAndWordImage(TextPositionInImage textPositionInImage, WordImage wordImage) {
        this.textPositionInImage = textPositionInImage;
        this.wordImage = wordImage;
    }

    /**
     * Creates the pair for one recognized word. The word starts out as
     * {@link FontStyle#REGULAR}; the detector may reassign the style later.
     */
    public static TextPositionAndWordImage create(AffineTransform imageCTM, Word word, Pix image, OcrServiceSettings settings, FontMetricsFactory fontMetricsFactory) {
        var position = new TextPositionInImage(word, imageCTM, fontMetricsFactory, FontStyle.REGULAR);
        var clippedWord = new WordImage(position.getTextHeight(), word, image, settings);
        return new TextPositionAndWordImage(position, clippedWord);
    }

    /** Clustering coordinate: the word's text height. */
    @Override
    public double[] getPoint() {
        return wordImage.getPoint();
    }

    /** @return the text height of the clipped word image. */
    public double getTextHeight() {
        return wordImage.getTextHeight();
    }
}

View File

@ -0,0 +1,71 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import org.apache.commons.math3.ml.clustering.Clusterable;
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.StrokeWidthCalculator;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import io.github.karols.hocr4j.Word;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
// Clipped image of a single recognized word, used for stroke-width based bold detection.
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class WordImage implements Clusterable {

    // Native Leptonica Pix holding just this word's pixels.
    Pix image;
    // The recognized text of the word.
    String text;
    // Estimated text height; also serves as the clustering coordinate (see getPoint()).
    double textHeight;
    OcrServiceSettings settings;

    /**
     * Clips the word's bounding box out of the full page image.
     *
     * @param textHeight    estimated text height of the word
     * @param word          the recognized word with its hOCR bounds
     * @param originalImage the full page image to clip from
     * @param settings      service settings (supplies the bold threshold)
     */
    public WordImage(double textHeight, Word word, Pix originalImage, OcrServiceSettings settings) {
        Box box = new Box(word.getBounds().getLeft(), word.getBounds().getTop(), word.getBounds().getWidth(), word.getBounds().getHeight(), 1);
        this.image = Leptonica1.pixClipRectangle(originalImage, box, null);
        // NOTE(review): Box.clear() zeroes the JNA structure's memory; it is not Leptonica's
        // boxDestroy — confirm this is the intended cleanup for the temporary box.
        box.clear();
        this.text = word.getText();
        this.textHeight = textHeight;
        this.settings = settings;
    }

    /**
     * Decides whether this word's strokes are thicker than the given estimated regular
     * stroke width: performs a morphological opening one pixel larger than the rounded
     * stroke width and thresholds the ratio of pixel mass that survives the opening.
     *
     * @param strokeWidth estimated regular stroke width in pixels (scaled to this word's size)
     * @return true if the surviving pixel-density ratio exceeds the configured bold threshold
     */
    public boolean hasLargerStrokeWidth(double strokeWidth) {
        int roundedStrokeWidth = (int) Math.round(strokeWidth);
        // Relative error introduced by rounding the kernel size; used to compensate the ratio below.
        // NOTE(review): assumes strokeWidth > 0 — a zero value divides by zero; confirm callers guarantee this.
        double roundingError = (roundedStrokeWidth - strokeWidth) / strokeWidth;
        // add 1 to open a bit bigger than the estimated regular stroke width
        Pix openedPix = Leptonica1.pixOpenBrick(null, image, roundedStrokeWidth + 1, roundedStrokeWidth + 1);
        double openedPixelDensity = ImageProcessingUtils.calculatePixelDensity(openedPix);
        double pixelDensity = ImageProcessingUtils.calculatePixelDensity(image);
        LeptUtils.disposePix(openedPix);
        return (openedPixelDensity * (1 + roundingError)) / pixelDensity > (settings.getBoldThreshold());
    }

    // Clustering coordinate: one-dimensional, the text height.
    @Override
    public double[] getPoint() {
        return new double[]{textHeight};
    }

    // Frees the native Pix backing this word image; must be called exactly once.
    public void dispose() {
        LeptUtils.disposePix(image);
    }
}

View File

@ -3,24 +3,18 @@ package com.knecon.fforesight.service.ocr.processor.service;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedTransferQueue;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.azure.core.implementation.GeoObjectHelper;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
@ -32,7 +26,6 @@ import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
@Slf4j
@Service

View File

@ -9,6 +9,7 @@ import java.io.OutputStream;
import java.nio.file.Path;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.stream.IntStream;
@ -20,8 +21,10 @@ import org.springframework.util.FileSystemUtils;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
@ -44,6 +47,7 @@ public class OCRService {
InvisibleElementRemovalService invisibleElementRemovalService;
OcrResultWriter ocrResultWriter;
GhostScriptService ghostScriptService;
FontStyleDetector boldDetector;
/**
@ -135,9 +139,14 @@ public class OCRService {
ocrThread.join();
}
log.info("OCR processing has finished, writing results");
log.info("Tesseract OCR has finished for file {} and dossier {}", fileId, dossierId);
timestamp = System.currentTimeMillis();
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, ocrResults);
Map<Integer, List<OcrResultToWrite>> imageWithTextPositionsPerPage = boldDetector.detectBold(ocrResults, document);
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
timestamp = System.currentTimeMillis();
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage);
log.info("Saving document");
document.saveIncremental(out, dictionariesToUpdate);
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);

View File

@ -2,11 +2,11 @@ package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Color;
import java.awt.geom.Point2D;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
@ -20,11 +20,9 @@ import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentPrope
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
@ -44,19 +42,17 @@ public class OcrResultWriter {
@SneakyThrows
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, List<OcrResult> ocrResults) {
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
FontMetricsFactory fontMetricsFactory = new Type0FontMetricsFactory(document);
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
Map<Integer, List<OcrResult>> resultsPerPage = ocrResults.stream().collect(Collectors.groupingBy(result -> result.image().pageNumber()));
resultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, resultsPerPage, dictionariesToUpdate, fontMetricsFactory));
imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate));
dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
return dictionariesToUpdate;
}
@SneakyThrows
private void drawResultsPerPage(PDDocument document, Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, Set<COSDictionary> dictionariesToUpdate, FontMetricsFactory fontMetricsFactory) {
private void drawResultsPerPage(PDDocument document, Integer pageNumber, List<OcrResultToWrite> ocrResultToWrite, Set<COSDictionary> dictionariesToUpdate) {
var pdPage = document.getPage(pageNumber - 1);
@ -69,7 +65,7 @@ public class OcrResultWriter {
escapeContentStreams(document, pdPage);
List<TextPositionInImage> words = buildTextPositionsOnPage(pageNumber, resultsPerPage, fontMetricsFactory);
List<TextPositionInImage> words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
// write invisible ocr text inside tagged content
@ -86,7 +82,6 @@ public class OcrResultWriter {
// write visible ocr text inside optional group
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
contentStream.saveGraphicsState();
contentStream.setNonStrokingColor(Color.BLUE);
words.forEach(word -> drawVisibleWord(word, contentStream));
contentStream.restoreGraphicsState();
contentStream.endMarkedContent();
@ -94,7 +89,9 @@ public class OcrResultWriter {
// write word bounding boxes (tesseract output) inside optional group
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
contentStream.saveGraphicsState();
resultsPerPage.get(pageNumber).stream().map(OcrResult::image).forEach(image -> drawGrid(contentStream, image.position()));
ocrResultToWrite.stream()
.map(OcrResultToWrite::imageBoundingBox)
.forEach(imagePosition -> drawGrid(contentStream, imagePosition));
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
contentStream.restoreGraphicsState();
contentStream.endMarkedContent();
@ -105,15 +102,6 @@ public class OcrResultWriter {
}
private static List<TextPositionInImage> buildTextPositionsOnPage(Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, FontMetricsFactory fontMetricsFactory) {
return resultsPerPage.get(pageNumber)
.stream()
.flatMap(result -> result.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> new TextPositionInImage(word, result.image().ctm(), fontMetricsFactory)))
.toList();
}
@SneakyThrows
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
// We need to append to the contentstream, otherwise the content could be overlapped by images
@ -196,6 +184,11 @@ public class OcrResultWriter {
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
try {
contentStream.setNonStrokingColor(switch (position.getFontStyle()) {
case BOLD -> Color.RED;
case ITALIC -> Color.GREEN;
default -> Color.BLUE;
});
contentStream.beginText();
contentStream.setRenderingMode(renderingMode);
contentStream.setFont(position.getFont(), (float) position.getFontSize());

View File

@ -16,12 +16,14 @@ public class Statistics {
AtomicLong pdf2ImgDuration;
AtomicLong writingTextDuration;
AtomicLong imageProcessingDuration;
AtomicLong fontStyleDetectionDuration;
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L)));
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
this.fontStyleDetectionDuration = new AtomicLong(0);
this.pdf2ImgDuration = new AtomicLong(0);
this.writingTextDuration = new AtomicLong(0);
this.imageProcessingDuration = new AtomicLong(0);
@ -57,12 +59,17 @@ public class Statistics {
writingTextDuration.addAndGet(duration);
}
public void increaseFontStyleDetectionDuration(long duration) {
fontStyleDetectionDuration.addAndGet(duration);
}
@Override
public String toString() {
return String.format(
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s, FontstyleDetection=%.2f s",
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
@ -71,7 +78,8 @@ public class Statistics {
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
(float) imageProcessingDuration.get() / 1000,
(float) pdf2ImgDuration.get() / 1000,
(float) writingTextDuration.get() / 1000);
(float) writingTextDuration.get() / 1000,
(float) fontStyleDetectionDuration.get() / 1000);
}
}

View File

@ -36,6 +36,7 @@ public interface FontMetricsFactory {
PDFont getFont();
HeightAndDescent calculateHeightAndDescent(String text);
}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
/**
 * Font styles the OCR font-style detection can assign to a recognized word.
 * REGULAR is the default; BOLD is assigned by stroke-width based detection.
 * NOTE(review): no ITALIC detection is visible in this change — presumably reserved for future use.
 */
public enum FontStyle {
REGULAR, BOLD, ITALIC
}

View File

@ -1,6 +1,9 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
import java.io.ByteArrayInputStream;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import org.apache.fontbox.ttf.GlyphData;
import org.apache.fontbox.ttf.TTFParser;
@ -12,22 +15,41 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import software.amazon.awssdk.services.s3.endpoints.internal.Value;
@Slf4j
@RequiredArgsConstructor
public class Type0FontMetricsFactory implements FontMetricsFactory {
private final PDType0Font type0Font;
private final TrueTypeFont trueTypeFont;
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
public static Type0FontMetricsFactory regular(PDDocument document) {
return createFromResource("fonts/cmu-regular.ttf", document);
}
public static Type0FontMetricsFactory bold(PDDocument document) {
return createFromResource("fonts/cmu-bold.ttf", document);
}
@SneakyThrows
public Type0FontMetricsFactory(PDDocument document) {
private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream("fonts/cmu-regular.ttf"); var buffer = new RandomAccessReadBuffer(in)) {
this.trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
this.type0Font = PDType0Font.load(document, this.trueTypeFont, false); // use Type0Font for unicode support
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
return new Type0FontMetricsFactory(type0Font, trueTypeFont);
}
}
@ -55,8 +77,9 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
if (glyph == null || glyph.getBoundingBox() == null) {
continue;
}
descent = Math.min(descent, glyph.getYMinimum());
if (!slashGlyphIds.contains(glyphId)) {
descent = Math.min(descent, glyph.getYMinimum());
}
height = Math.max(height, glyph.getYMaximum());
} catch (Exception e) {
log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);

View File

@ -0,0 +1,158 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Stream;
import org.apache.commons.math3.ml.clustering.Cluster;
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.TextPositionAndWordImage;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.WordImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FontStyleDetector {
OcrServiceSettings settings;
StrokeWidthCalculator strokeWidthCalculator;
/**
 * Implementation of the MOBDoB algorithm, refer to the paper here:
 * <a href="http://mile.ee.iisc.ac.in/publications/softCopy/DocumentAnalysis/Sai_NCVPRIPG2013.pdf">Script Independent Detection of Bold Words in Multi Font-size Documents</a>
 * <p>
 * As a high level overview: We cluster all text based on its font size. We determine the cluster with the most words. This is assumed to be regular text.
 * We then estimate the average stroke width of that cluster by thinning all text to a single pixel and calculating the ratio of remaining pixels.
 * (<a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>)
 * For each word we scale this average stroke width based on its font size compared to the most common font size.
 * Using the scaled stroke width we do an opening operation.
 * (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
 * We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
 * <p>
 * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
 * But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
 * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
 * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
 * It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
 *
 * @param ocrResults the raw OCR results to annotate with font styles
 * @param document   the PDF document, used to load the regular and bold fonts
 * @return results to write, keyed by page number
 */
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {
    FontMetricsFactory fontMetricsFactory = Type0FontMetricsFactory.regular(document);
    // With detection disabled, every word stays REGULAR and no native images are loaded.
    if (!settings.isBoldDetection()) {
        return OcrResultToWrite.buildOcrResultsToWrite(ocrResults, fontMetricsFactory);
    }
    Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage = new HashMap<>();
    DBSCANClusterer<TextPositionAndWordImage> clusterer = new DBSCANClusterer<>(0.5, 1);
    FontMetricsFactory boldFontMetricsFactory = Type0FontMetricsFactory.bold(document);
    for (OcrResult result : ocrResults) {
        FontStyleDetectionModel fontStyleDetectionModel = FontStyleDetectionModel.fromOcrResult(result, fontMetricsFactory, settings);
        // Cluster words by text height; the biggest cluster is assumed to be regular-sized text.
        List<Cluster<TextPositionAndWordImage>> clusters = clusterer.cluster(fontStyleDetectionModel.getTextPositionsAndWordImages());
        Optional<Cluster<TextPositionAndWordImage>> largestCluster = clusters.stream().max(Comparator.comparingInt(cluster -> cluster.getPoints().size()));
        if (largestCluster.isEmpty()) {
            // No clusters (e.g. empty page): keep all words REGULAR.
            insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
            // Bug fix: dispose the native page/word images on this path too — previously
            // the early continue skipped dispose() and leaked the Leptonica Pix objects.
            fontStyleDetectionModel.dispose();
            continue;
        }
        List<TextPositionAndWordImage> wordsWithMostCommonTextHeight = largestCluster.get().getPoints();
        double standardTextHeight = calculateStandardTextheight(wordsWithMostCommonTextHeight);
        double regularStrokeWidth = calculateRegularStrokeWidth(wordsWithMostCommonTextHeight);
        for (TextPositionAndWordImage textPositionsAndWordImage : fontStyleDetectionModel.getTextPositionsAndWordImages()) {
            decideOnFontStyle(textPositionsAndWordImage, regularStrokeWidth, standardTextHeight, boldFontMetricsFactory);
        }
        insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
        fontStyleDetectionModel.dispose();
    }
    log.info("Finished bold detection");
    return ocrResultToWritePerPage;
}
private static double calculateStandardTextheight(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
return wordsWithMostCommonTextHeight.stream()
.map(TextPositionAndWordImage::getWordImage)
.mapToDouble(WordImage::getTextHeight)
.filter(Double::isFinite)
.average()
.orElseThrow();
}
private double calculateRegularStrokeWidth(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
return wordsWithMostCommonTextHeight.stream()
.mapToDouble(textPositionAndWordImage -> strokeWidthCalculator.calculate(textPositionAndWordImage.getWordImage().getImage()))
.filter(Double::isFinite)
.average()
.orElseThrow();
}
private static void insertResultIntoMap(int pageNumber, Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage, FontStyleDetectionModel fontStyleDetectionModel) {
OcrResultToWrite ocrResult = OcrResultToWrite.fromFontStyleDetectionModel(fontStyleDetectionModel);
ocrResultToWritePerPage.compute(pageNumber, (key, existingList) -> {
if (existingList == null) {
return List.of(ocrResult);
} else {
return Stream.concat(existingList.stream(), Stream.of(ocrResult)).toList();
}
});
}
private void decideOnFontStyle(TextPositionAndWordImage textPositionsAndWordImage,
double standardStrokeWidth,
double standardTextHeight,
FontMetricsFactory boldFontMetricsFactory) {
double scaledStrokeWidth = scaleStrokeWidthByFontSize(textPositionsAndWordImage, standardStrokeWidth, standardTextHeight);
if (textPositionsAndWordImage.getWordImage().hasLargerStrokeWidth(scaledStrokeWidth)) {
textPositionsAndWordImage.getTextPositionInImage().setFontMetricsFactory(boldFontMetricsFactory);
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.BOLD);
} else {
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.REGULAR);
}
}
private static double scaleStrokeWidthByFontSize(TextPositionAndWordImage textPositionsAndWordImage, double standardStrokeWidth, double standardFontSize) {
double influenceOfFontSize = 1.0; // the paper states that stroke width scales exactly linearly with font size. This did not seem to be true for me. Maybe some of the preprocessing steps are affecting this.
double fontsizeScalingFactor = Math.sqrt(textPositionsAndWordImage.getWordImage().getTextHeight() / standardFontSize);
return standardStrokeWidth + (influenceOfFontSize * (fontsizeScalingFactor - 1) * standardStrokeWidth);
}
}

View File

@ -0,0 +1,57 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.Sel;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Experimental detector for italic text based on erosion with a slanted structuring element.
 * <p>
 * This code is a good start for detecting italic text, although it has a few issues especially with glyphs which are naturally slanted, e.g. z, 2, 7, /.
 * If we want this, maybe we should exclude these glyphs and then it might have fewer false positives. But in its current state I don't recommend using it.
 */
@NoArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ItalicDetector {

    // 9x4 slanted structuring element, encoded for Leptonica's selCreateFromString.
    // NOTE(review): presumably 'x' = hit, 'o' = miss, and the single uppercase 'X' marks the sel origin —
    // confirm against the Leptonica sel documentation.
    static String italicKernel = "ooxxooxxooxxoxxooXxooxxoxxooxxooxxoo";
    Sel italicSel = Leptonica1.selCreateFromString(italicKernel, 9, 4, "italicKernel");
    // Small brick sel used to dilate the erosion result before measuring pixel density.
    Sel brickSel = Leptonica1.selCreateBrick(3, 4, 1, 2, 1);

    /**
     * Decides whether the text on the given pix is italic.
     * <p>
     * The pix is preprocessed (erode with the slanted sel, then dilate), and the same is done to a
     * horizontally mirrored copy. The idea is that slanted strokes survive the erosion in one orientation
     * but not in the mirrored one, so the ratio of mirrored to original pixel density drops for italic text.
     *
     * @param pix binarized word image; ownership stays with the caller (this method does not dispose it)
     * @return true if the mirrored-to-original pixel-density ratio is below 0.85
     */
    public boolean isItalic(Pix pix) {
        Pix preprocessed = preprocess(pix);
        Pix flipped = Leptonica1.pixFlipLR(null, pix);
        Pix flippedPreprocessed = preprocess(flipped);
        // In-place flip back to the original orientation; this does not change the pixel count used below.
        Leptonica1.pixFlipLR(flippedPreprocessed, flippedPreprocessed);
        double pixelDensity = ImageProcessingUtils.calculatePixelDensity(preprocessed);
        double flippedPixelDensity = ImageProcessingUtils.calculatePixelDensity(flippedPreprocessed);
        LeptUtils.disposePix(preprocessed);
        LeptUtils.disposePix(flipped);
        LeptUtils.disposePix(flippedPreprocessed);
        return flippedPixelDensity / pixelDensity < 0.85;
    }

    /**
     * Erodes with the slanted sel and dilates with the brick sel.
     * Returns a newly allocated pix the caller must dispose; the intermediate eroded pix is disposed here.
     */
    private Pix preprocess(Pix pix) {
        Pix eroded = Leptonica1.pixErode(null, pix, italicSel.getPointer());
        Pix dilated = Leptonica1.pixDilate(null, eroded, brickSel.getPointer());
        LeptUtils.disposePix(eroded);
        return dilated;
    }

    /** Frees the native structuring elements. The detector must not be used after calling this. */
    public void dispose() {
        LeptUtils.dispose(italicSel);
        LeptUtils.dispose(brickSel);
    }
}

View File

@ -0,0 +1,58 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
import java.nio.IntBuffer;
import org.springframework.stereotype.Service;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.Sela;
import net.sourceforge.lept4j.util.LeptUtils;
@Service
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class StrokeWidthCalculator {

    // Native thinning sel set, created lazily on first use; only accessed through the synchronized init().
    Sela thinningSel;

    /**
     * Uses a series of sels to thin all connected lines to a single pixel. Then the pixel ratio is a good
     * estimation of the stroke width in pixels.
     * <a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>
     * Since the baseline is a stroke width of exactly one, we need to add 1 to the result.
     * <p>
     * If the input contains no foreground pixels, the division yields a non-finite value; callers are
     * expected to filter non-finite results (see FontStyleDetector).
     *
     * @param input binarized pix with text on it
     * @return estimated stroke width in pixels
     */
    public double calculate(Pix input) {
        init();
        Pix thinned = Leptonica1.pixThinConnectedBySet(input, L_THIN_FG, thinningSel, 0);
        IntBuffer thinnedPixelCount = IntBuffer.allocate(1);
        Leptonica1.pixCountPixels(thinned, thinnedPixelCount, null);
        IntBuffer pixelCount = IntBuffer.allocate(1);
        Leptonica1.pixCountPixels(input, pixelCount, null);
        LeptUtils.disposePix(thinned);
        return (double) pixelCount.get() / thinnedPixelCount.get() + 1;
    }

    /**
     * Creates the thinning sel set on first use.
     * BUGFIX: synchronized, because this is a singleton Spring service that can be called from several OCR
     * threads concurrently; the previous unsynchronized check-then-act could create (and leak) multiple
     * native sel sets. The monitor also guarantees that every caller sees the initialized field.
     */
    private synchronized void init() {
        if (thinningSel == null) {
            thinningSel = Leptonica1.selaMakeThinSets(1, 0);
        }
    }
}

View File

@ -17,7 +17,6 @@ import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.TessAPI1;
/*
This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously
*/
@ -38,9 +37,17 @@ public class BlockingQueueFiller extends Thread {
public void run() {
// Interrupting signals that the image extraction has finished
while (!allImagesQueued) {
try {
while (!allImagesQueued) {
final UnprocessedImage image = imageInputQueue.take();
imageOutputQueue.put(image);
try {
imageOutputQueue.put(image);
} catch (InterruptedException e) {
imageOutputQueue.put(image);
}
}
} catch (InterruptedException e) {
log.info("All images extracted, emptying processing queue and stopping");
}
// empty the queue
@ -54,4 +61,5 @@ public class BlockingQueueFiller extends Thread {
}
}
}

View File

@ -4,8 +4,6 @@ import static net.sourceforge.tess4j.ITessAPI.TRUE;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
@ -29,6 +27,8 @@ import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
@ -45,6 +45,7 @@ public class ImageProcessingThread extends Thread {
final BlockingQueue<UnprocessedImage> imageInputQueue;
final BlockingQueue<OcrImage> imageOutputQueue;
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.2f, 1);
final Statistics stats;
final OcrServiceSettings settings;
final PDDocument document;
@ -81,7 +82,9 @@ public class ImageProcessingThread extends Thread {
log.debug("No images left in processing queue, stopping.");
}
TessAPI1.TessBaseAPIEnd(this.detectionScriptHandle);
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
LeptUtils.dispose(gaussianKernel);
}
@ -106,7 +109,7 @@ public class ImageProcessingThread extends Thread {
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
@ -129,7 +132,7 @@ public class ImageProcessingThread extends Thread {
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
Pix pix = processPix(extractedImage.asPix(), imageDPI, settings.getDpi());
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
@ -163,7 +166,7 @@ public class ImageProcessingThread extends Thread {
orientationDegreeResultBuffer = IntBuffer.allocate(1);
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
scriptureNameBuffer = new PointerByReference();
scriptureNameBuffer = new PointerByReference(); // Is this memory being freed?
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
int orientationDegree = 0;
@ -183,15 +186,58 @@ public class ImageProcessingThread extends Thread {
@SneakyThrows
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {
private Pix processPix(Pix pix, float imageDpi, int targetDpi) {
Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix);
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
return ImageProcessingUtils.despecklePix(scaledUp);
Pix grayScale;
Pix scaledUp;
Pix gaussian;
Pix binarized;
//convert to grayscale
if (pix.d == 8) {
grayScale = pix;
} else if (pix.d == 32) {
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
} else if (pix.d == 1) {
grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
} else {
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
}
// scale up
float targetFactor = targetDpi / imageDpi;
if (targetFactor > 2.1) {
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
} else if (targetFactor > 1.1) {
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
} else {
scaledUp = grayScale;
}
// remove noise and prep for Otsu
gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
// Threshold to binary
if (pix.w < 100 || pix.h < 100) {
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
} else {
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null);
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
}
}
LeptUtils.disposePix(pix);
LeptUtils.disposePix(grayScale);
LeptUtils.disposePix(scaledUp);
LeptUtils.disposePix(gaussian);
return binarized;
}
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();

View File

@ -116,16 +116,11 @@ public class OCRThread extends Thread {
@SneakyThrows
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
if (settings.isDebug()) {
String[] a = tesseractOutputFileName.split("/");
String folder = "/tmp/pixs/" + a[a.length - 3];
new File(folder).mkdirs();
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
}
Leptonica1.pixWrite(tesseractOutputFileName + ".tiff", pix, 5); // write the used image for later bold detection
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
instance.setPageSegMode(psm);
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
}

View File

@ -19,10 +19,12 @@ public class OcrServiceSettings {
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
float minRotationConfidence = 2; //
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean removeWatermark; // If true, watermarks will be removed
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
boolean boldDetection = true; // if true, bold detection will be attempted
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
}

View File

@ -6,14 +6,17 @@ import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.image.BufferedImage;
import java.nio.IntBuffer;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.sun.jna.ptr.PointerByReference;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@ -37,67 +40,6 @@ public class ImageProcessingUtils {
}
public static Pix despecklePix(Pix pix) {
assert pix.d == 8;
Pix despeckled;
if (pix.w < 100 || pix.h < 100) {
// too small to properly despeckle, just binarize instead.
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
} else {
despeckled = LeptUtils.despeckle(pix,
LeptUtils.SEL_STR3,
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
if (despeckled == null) {
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
}
}
if (pix != despeckled) {
LeptUtils.disposePix(pix);
}
return despeckled;
}
public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
float targetFactor = targetDpi / imageDpi;
if (targetFactor > 3) {
Pix scaledUp;
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
LeptUtils.disposePix(grayScale);
return scaledUp;
} else if (targetFactor > 1.9) {
Pix scaledUp;
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
LeptUtils.disposePix(grayScale);
return scaledUp;
} else {
return grayScale;
}
}
@SneakyThrows
public static Pix convertToGrayScale(Pix pix) {
if (pix.d == 8) {
return pix;
} else if (pix.d == 32) {
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
LeptUtils.disposePix(pix);
return grayScale;
} else if (pix.d == 1) {
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
LeptUtils.disposePix(pix);
return grayScale;
} else {
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
}
}
public Pix deRotatePix(int orientDegree, Pix pix) {
return switch (360 - orientDegree) {
@ -128,4 +70,16 @@ public class ImageProcessingUtils {
}
}
public static double calculatePixelDensity(Pix pix) {
IntBuffer pixelCount = IntBuffer.allocate(1);
int result = Leptonica1.pixCountPixels(pix, pixelCount, null);
if (result == 0) {
return (double) pixelCount.get() / (pix.h * pix.w);
} else {
return -1;
}
}
}

View File

@ -0,0 +1,73 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
@UtilityClass
public class KernelUtils {

    /**
     * Builds the 8-connected 3x3 Laplacian kernel:
     * <pre>
     * -1, -1, -1
     * -1,  8, -1
     * -1, -1, -1
     * </pre>
     */
    public L_Kernel createFullLaplacianKernel() {
        L_Kernel kernel = Leptonica1.kernelCreate(3, 3);
        for (int row = 0; row < 3; row++) {
            for (int col = 0; col < 3; col++) {
                // Every neighbour gets -1, the centre gets +8 so the weights sum to zero.
                float weight = (row == 1 && col == 1) ? 8 : -1;
                Leptonica1.kernelSetElement(kernel, row, col, weight);
            }
        }
        return kernel;
    }

    /**
     * Builds the diamond-shaped 5x5 Laplacian kernel:
     * <pre>
     *  0,  0, -1,  0,  0
     *  0, -1, -1, -1,  0
     * -1, -1, 12, -1, -1
     *  0, -1, -1, -1,  0
     *  0,  0, -1,  0,  0
     * </pre>
     */
    public L_Kernel createLaplacianKernel5x5() {
        L_Kernel kernel = Leptonica1.kernelCreate(5, 5);
        for (int row = 0; row < 5; row++) {
            for (int col = 0; col < 5; col++) {
                // Cells within Manhattan distance 2 of the centre form the diamond; all others stay 0.
                int distance = Math.abs(row - 2) + Math.abs(col - 2);
                if (distance == 0) {
                    Leptonica1.kernelSetElement(kernel, row, col, 12);
                } else if (distance <= 2) {
                    Leptonica1.kernelSetElement(kernel, row, col, -1);
                }
            }
        }
        return kernel;
    }

    /**
     * Builds the 4-connected 3x3 Laplacian kernel:
     * <pre>
     *  0, -1,  0
     * -1,  4, -1
     *  0, -1,  0
     * </pre>
     */
    public L_Kernel createLaplacianKernel() {
        L_Kernel kernel = Leptonica1.kernelCreate(3, 3);
        for (int row = 0; row < 3; row++) {
            for (int col = 0; col < 3; col++) {
                // Only the four edge-adjacent neighbours (Manhattan distance 1) are weighted.
                int distance = Math.abs(row - 1) + Math.abs(col - 1);
                if (distance == 0) {
                    Leptonica1.kernelSetElement(kernel, row, col, 4);
                } else if (distance == 1) {
                    Leptonica1.kernelSetElement(kernel, row, col, -1);
                }
            }
        }
        return kernel;
    }
}

View File

@ -138,4 +138,11 @@ public class Tesseract2 extends Tesseract1 {
return renderer;
}
@Override
protected void dispose() {
TessBaseAPIEnd(getHandle());
TessBaseAPIDelete(getHandle());
}
}

View File

@ -20,7 +20,7 @@ class Type0FontMetricsFactoryTest {
public void testStringWidth() {
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
Type0FontMetricsFactory metricsFactory = new Type0FontMetricsFactory(document);
Type0FontMetricsFactory metricsFactory = Type0FontMetricsFactory.regular(document);
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
}

View File

@ -41,8 +41,15 @@ fforesight:
ignored-endpoints: [ '/actuator/health', '/actuator/health/**' ]
enabled: true
logging.pattern.level: "%5p [${spring.application.name},%X{traceId:-},%X{spanId:-}]"
management:
tracing:
sampling:
probability: 1.0
otlp:
tracing:
endpoint: http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces
endpoint:
metrics.enabled: ${monitoring.enabled:false}
prometheus.enabled: ${monitoring.enabled:false}

View File

@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcr() {
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
String text = testOCR("files/402Study.pdf");
}
@ -162,13 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrForSpecificFile() {
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/ITEM 23_A15149W - Dermal absorption of formulated product.pdf"));
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 16_Toxicidade Cutanea Aguda.pdf"));
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles2/A16361B - Acute Dermal Toxicity Study in Rats.pdf"));
}