Merge branch 'RED-7669-fontstyle' into 'master'
RED-8155: integrate bold-detection into ocr-service Closes RED-7669 See merge request redactmanager/ocr-service!31
This commit is contained in:
commit
bab16ad9b2
12
README.md
12
README.md
@ -15,11 +15,17 @@ The service uses PDFTron to attempt the removal of invisible elements and waterm
|
|||||||
Extracts all images from the PDF using PDFBox
|
Extracts all images from the PDF using PDFBox
|
||||||
3. Striped Image Detection and Stitching
|
3. Striped Image Detection and Stitching
|
||||||
Detects if images are striped and stitches them together using Ghostscript.
|
Detects if images are striped and stitches them together using Ghostscript.
|
||||||
4. Binarization
|
4. Image Processing
|
||||||
Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm.
|
- Convert to grayscale
|
||||||
|
- Upscale to target DPI
|
||||||
|
- Filter using Gauss kernel
|
||||||
|
- Binarize the resulting images using Leptonica and the Otsu thresholding algorithm
|
||||||
|
- Despeckle using various morphological operations
|
||||||
5. OCR Processing
|
5. OCR Processing
|
||||||
Runs Tesseract on the images to extract text.
|
Runs Tesseract on the images to extract text.
|
||||||
6. Text Integration
|
6. Font Style Detection
|
||||||
|
Detects bold text using stroke width estimation.
|
||||||
|
7. Text Integration
|
||||||
Draws the resulting text onto the original PDF using PDFBox.
|
Draws the resulting text onto the original PDF using PDFBox.
|
||||||
|
|
||||||
Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage.
|
Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage.
|
||||||
|
|||||||
@ -25,6 +25,8 @@ tasks.named<Test>("test") {
|
|||||||
reports {
|
reports {
|
||||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||||
}
|
}
|
||||||
|
minHeapSize = "512m"
|
||||||
|
maxHeapSize = "8192m"
|
||||||
}
|
}
|
||||||
|
|
||||||
tasks.test {
|
tasks.test {
|
||||||
|
|||||||
@ -20,6 +20,7 @@ dependencies {
|
|||||||
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||||
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
||||||
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
||||||
|
api("org.apache.commons:commons-math3:3.6.1")
|
||||||
api("io.github.karols:hocr4j:0.2.0")
|
api("io.github.karols:hocr4j:0.2.0")
|
||||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||||
api("com.google.guava:guava:31.1-jre")
|
api("com.google.guava:guava:31.1-jre")
|
||||||
|
|||||||
@ -31,9 +31,19 @@ public interface OcrImage {
|
|||||||
int getNumberOnPage();
|
int getNumberOnPage();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the height of the original image (not necessarily in pdf coordinates).
|
||||||
|
*
|
||||||
|
* @return the height of the image
|
||||||
|
*/
|
||||||
int getHeight();
|
int getHeight();
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieves the width of the original image (not necessarily in pdf coordinates).
|
||||||
|
*
|
||||||
|
* @return the width of the image
|
||||||
|
*/
|
||||||
int getWidth();
|
int getWidth();
|
||||||
|
|
||||||
|
|
||||||
@ -44,7 +54,7 @@ public interface OcrImage {
|
|||||||
*/
|
*/
|
||||||
default QuadPoint getImageBounds() {
|
default QuadPoint getImageBounds() {
|
||||||
|
|
||||||
// cannot be solved with a nice rotation matrix, since the after rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
|
// cannot be solved with a nice rotation matrix. After rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
|
||||||
// So in order to mimic this behavior we need to start with (0,0) coordinates always.
|
// So in order to mimic this behavior we need to start with (0,0) coordinates always.
|
||||||
if (getRotationDegrees() == 90 || getRotationDegrees() == 270) {
|
if (getRotationDegrees() == 90 || getRotationDegrees() == 270) {
|
||||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getWidth()), new Point2D.Double(getHeight(), getWidth()), new Point2D.Double(getHeight(), 0));
|
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getWidth()), new Point2D.Double(getHeight(), getWidth()), new Point2D.Double(getHeight(), 0));
|
||||||
@ -65,13 +75,6 @@ public interface OcrImage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
default BufferedImage getBufferedImage() {
|
|
||||||
|
|
||||||
return LeptUtils.convertPixToImage(getPix());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves the rotation degree of the OCR image.
|
* Retrieves the rotation degree of the OCR image.
|
||||||
*
|
*
|
||||||
@ -86,6 +89,10 @@ public interface OcrImage {
|
|||||||
* @return The optimal page segmentation mode.
|
* @return The optimal page segmentation mode.
|
||||||
*/
|
*/
|
||||||
default int getOptimalPageSegmentationMode() {
|
default int getOptimalPageSegmentationMode() {
|
||||||
|
|
||||||
|
if (getWidth() < 200 || getHeight() < 200) {
|
||||||
|
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
|
||||||
|
}
|
||||||
return ITessAPI.TessPageSegMode.PSM_AUTO;
|
return ITessAPI.TessPageSegMode.PSM_AUTO;
|
||||||
} // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
} // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
||||||
|
|
||||||
@ -112,17 +119,6 @@ public interface OcrImage {
|
|||||||
AffineTransform getImageCTM();
|
AffineTransform getImageCTM();
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Retrieves the size (width * height) of the image.
|
|
||||||
*
|
|
||||||
* @return The size of the image.
|
|
||||||
*/
|
|
||||||
default int getImageSize() {
|
|
||||||
|
|
||||||
return getHeight() * getWidth();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
default void destroyPix() {
|
default void destroyPix() {
|
||||||
|
|
||||||
LeptUtils.disposePix(getPix());
|
LeptUtils.disposePix(getPix());
|
||||||
|
|||||||
@ -7,27 +7,17 @@ import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser;
|
|||||||
|
|
||||||
import io.github.karols.hocr4j.Word;
|
import io.github.karols.hocr4j.Word;
|
||||||
|
|
||||||
public record OcrResult(Image image, String hOcrPageAbsolutePath) {
|
public record OcrResult(OcrImage image, String tesseractOutputFilePath) {
|
||||||
|
|
||||||
public static OcrResult create(OcrImage image, String tesseractResult) {
|
public static OcrResult create(OcrImage image, String tesseractResult) {
|
||||||
|
|
||||||
return new OcrResult(Image.fromOcrImage(image), tesseractResult);
|
return new OcrResult(image, tesseractResult);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Word> getAllWords() {
|
public List<Word> getAllWords() {
|
||||||
|
|
||||||
return HOcrPageParser.extractHocrPage(hOcrPageAbsolutePath).getAllWords();
|
return HOcrPageParser.extractHocrPage(tesseractOutputFilePath).getAllWords();
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public record Image(Integer pageNumber, AffineTransform ctm, QuadPoint position) {
|
|
||||||
|
|
||||||
public static Image fromOcrImage(OcrImage image) {
|
|
||||||
|
|
||||||
return new Image(image.getPageNumber(), image.getImageCTM(), image.getImageCoordinatesInInitialUserSpace());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,35 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.model;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||||
|
|
||||||
|
public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, QuadPoint imageBoundingBox) {
|
||||||
|
|
||||||
|
public static OcrResultToWrite fromFontStyleDetectionModel(FontStyleDetectionModel fontStyleDetectionModel) {
|
||||||
|
|
||||||
|
return new OcrResultToWrite(fontStyleDetectionModel.getTextPositionInImages(), fontStyleDetectionModel.getImageBounds());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Map<Integer, List<OcrResultToWrite>> buildOcrResultsToWrite(List<OcrResult> ocrResults, FontMetricsFactory fontMetricsFactory) {
|
||||||
|
|
||||||
|
return ocrResults.stream()
|
||||||
|
.collect(Collectors.groupingBy(ocrResult -> ocrResult.image().getPageNumber()))
|
||||||
|
.entrySet()
|
||||||
|
.stream()
|
||||||
|
.collect(Collectors.toMap(Map.Entry::getKey,
|
||||||
|
entry -> entry.getValue()
|
||||||
|
.stream()
|
||||||
|
.map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords()
|
||||||
|
.stream()
|
||||||
|
.filter(word -> !word.isBlank())
|
||||||
|
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
|
||||||
|
.toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
|
||||||
|
.toList()));
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -6,7 +6,7 @@ public record PageInformation(int height, int width, int number, int rotationDeg
|
|||||||
|
|
||||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||||
|
|
||||||
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
|
return new PageInformation((int) page.getMediaBox().getHeight(), (int) page.getMediaBox().getWidth(), pageNum, page.getRotation());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -60,17 +60,6 @@ public class RenderedPageOcrImage implements OcrImage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public QuadPoint getImageBounds() {
|
|
||||||
|
|
||||||
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
|
||||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, width), new Point2D.Double(height, width), new Point2D.Double(height, 0));
|
|
||||||
} else {
|
|
||||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, height), new Point2D.Double(width, height), new Point2D.Double(width, 0));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getPageNumber() {
|
public int getPageNumber() {
|
||||||
|
|
||||||
|
|||||||
@ -7,29 +7,35 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
|
|||||||
import org.apache.pdfbox.util.Matrix;
|
import org.apache.pdfbox.util.Matrix;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||||
|
|
||||||
import io.github.karols.hocr4j.Bounds;
|
|
||||||
import io.github.karols.hocr4j.Word;
|
import io.github.karols.hocr4j.Word;
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
public class TextPositionInImage {
|
public class TextPositionInImage {
|
||||||
|
|
||||||
QuadPoint position;
|
final QuadPoint position;
|
||||||
String text;
|
final String text;
|
||||||
AffineTransform imageCTM;
|
final AffineTransform imageCTM;
|
||||||
|
|
||||||
|
@Setter
|
||||||
FontMetricsFactory fontMetricsFactory;
|
FontMetricsFactory fontMetricsFactory;
|
||||||
|
@Setter
|
||||||
|
FontStyle fontStyle;
|
||||||
|
|
||||||
|
|
||||||
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory) {
|
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory, FontStyle fontStyle) {
|
||||||
|
|
||||||
this.position = QuadPoint.fromBounds(word.getBounds());
|
this.position = QuadPoint.fromBounds(word.getBounds());
|
||||||
this.text = word.getText();
|
this.text = word.getText();
|
||||||
this.imageCTM = imageCTM;
|
this.imageCTM = imageCTM;
|
||||||
this.fontMetricsFactory = fontMetricsFactory;
|
this.fontMetricsFactory = fontMetricsFactory;
|
||||||
|
this.fontStyle = fontStyle;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -90,6 +96,13 @@ public class TextPositionInImage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getTextHeight() {
|
||||||
|
|
||||||
|
var metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
|
||||||
|
return fontMetricsFactory.calculateFontSize(text, getTransformedWidth()) * metrics.getHeightScaling();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public double getHeight() {
|
public double getHeight() {
|
||||||
|
|
||||||
return position.a().distance(position.b());
|
return position.a().distance(position.b());
|
||||||
|
|||||||
@ -0,0 +1,58 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public final class FontStyleDetectionModel {
|
||||||
|
|
||||||
|
QuadPoint imageBounds;
|
||||||
|
Pix image;
|
||||||
|
List<TextPositionAndWordImage> textPositionsAndWordImages;
|
||||||
|
|
||||||
|
|
||||||
|
public static FontStyleDetectionModel fromOcrResult(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory, OcrServiceSettings settings) {
|
||||||
|
|
||||||
|
var image = Leptonica1.pixRead(ocrResult.tesseractOutputFilePath() + ".tiff");
|
||||||
|
var wordPixes = ocrResult.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> TextPositionAndWordImage.create(ocrResult.image().getImageCTM(), word, image, settings, fontMetricsFactory)).toList();
|
||||||
|
|
||||||
|
return new FontStyleDetectionModel(ocrResult.image().getImageCoordinatesInInitialUserSpace(), image, wordPixes);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<TextPositionInImage> getTextPositionInImages() {
|
||||||
|
|
||||||
|
return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getTextPositionInImage).toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<WordImage> getWordImages() {
|
||||||
|
|
||||||
|
return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getWordImage).toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void dispose() {
|
||||||
|
|
||||||
|
LeptUtils.disposePix(image);
|
||||||
|
getWordImages().forEach(WordImage::dispose);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,52 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
|
||||||
|
import io.github.karols.hocr4j.Word;
|
||||||
|
import lombok.Getter;
|
||||||
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
public final class TextPositionAndWordImage implements Clusterable {
|
||||||
|
|
||||||
|
private final TextPositionInImage textPositionInImage;
|
||||||
|
private final WordImage wordImage;
|
||||||
|
|
||||||
|
|
||||||
|
public TextPositionAndWordImage(TextPositionInImage textPositionInImage, WordImage wordImage) {
|
||||||
|
|
||||||
|
this.textPositionInImage = textPositionInImage;
|
||||||
|
this.wordImage = wordImage;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static TextPositionAndWordImage create(AffineTransform imageCTM, Word word, Pix image, OcrServiceSettings settings, FontMetricsFactory fontMetricsFactory) {
|
||||||
|
|
||||||
|
TextPositionInImage textPositionInImage = new TextPositionInImage(word, imageCTM, fontMetricsFactory, FontStyle.REGULAR);
|
||||||
|
WordImage wordImage = new WordImage(textPositionInImage.getTextHeight(), word, image, settings);
|
||||||
|
return new TextPositionAndWordImage(textPositionInImage, wordImage);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double[] getPoint() {
|
||||||
|
|
||||||
|
return wordImage.getPoint();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getTextHeight() {
|
||||||
|
|
||||||
|
return wordImage.getTextHeight();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,71 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||||
|
|
||||||
|
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.StrokeWidthCalculator;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||||
|
|
||||||
|
import io.github.karols.hocr4j.Word;
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import net.sourceforge.lept4j.Box;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class WordImage implements Clusterable {
|
||||||
|
|
||||||
|
Pix image;
|
||||||
|
String text;
|
||||||
|
double textHeight;
|
||||||
|
OcrServiceSettings settings;
|
||||||
|
|
||||||
|
|
||||||
|
public WordImage(double textHeight, Word word, Pix originalImage, OcrServiceSettings settings) {
|
||||||
|
|
||||||
|
Box box = new Box(word.getBounds().getLeft(), word.getBounds().getTop(), word.getBounds().getWidth(), word.getBounds().getHeight(), 1);
|
||||||
|
this.image = Leptonica1.pixClipRectangle(originalImage, box, null);
|
||||||
|
box.clear();
|
||||||
|
this.text = word.getText();
|
||||||
|
this.textHeight = textHeight;
|
||||||
|
this.settings = settings;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean hasLargerStrokeWidth(double strokeWidth) {
|
||||||
|
|
||||||
|
int roundedStrokeWidth = (int) Math.round(strokeWidth);
|
||||||
|
double roundingError = (roundedStrokeWidth - strokeWidth) / strokeWidth;
|
||||||
|
|
||||||
|
// add 1 to open a bit bigger than the estimated regular stroke width
|
||||||
|
Pix openedPix = Leptonica1.pixOpenBrick(null, image, roundedStrokeWidth + 1, roundedStrokeWidth + 1);
|
||||||
|
|
||||||
|
double openedPixelDensity = ImageProcessingUtils.calculatePixelDensity(openedPix);
|
||||||
|
|
||||||
|
double pixelDensity = ImageProcessingUtils.calculatePixelDensity(image);
|
||||||
|
|
||||||
|
LeptUtils.disposePix(openedPix);
|
||||||
|
|
||||||
|
return (openedPixelDensity * (1 + roundingError)) / pixelDensity > (settings.getBoldThreshold());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double[] getPoint() {
|
||||||
|
|
||||||
|
return new double[]{textHeight};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void dispose() {
|
||||||
|
|
||||||
|
LeptUtils.disposePix(image);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -3,24 +3,18 @@ package com.knecon.fforesight.service.ocr.processor.service;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
import java.util.concurrent.LinkedBlockingDeque;
|
import java.util.concurrent.LinkedBlockingDeque;
|
||||||
import java.util.concurrent.LinkedTransferQueue;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.azure.core.implementation.GeoObjectHelper;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
|
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||||
@ -32,7 +26,6 @@ import lombok.RequiredArgsConstructor;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import net.sourceforge.lept4j.Pix;
|
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import java.io.OutputStream;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.concurrent.ArrayBlockingQueue;
|
import java.util.concurrent.ArrayBlockingQueue;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
@ -20,8 +21,10 @@ import org.springframework.util.FileSystemUtils;
|
|||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
|
||||||
@ -44,6 +47,7 @@ public class OCRService {
|
|||||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||||
OcrResultWriter ocrResultWriter;
|
OcrResultWriter ocrResultWriter;
|
||||||
GhostScriptService ghostScriptService;
|
GhostScriptService ghostScriptService;
|
||||||
|
FontStyleDetector boldDetector;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -135,9 +139,14 @@ public class OCRService {
|
|||||||
ocrThread.join();
|
ocrThread.join();
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("OCR processing has finished, writing results");
|
log.info("Tesseract OCR has finished for file {} and dossier {}", fileId, dossierId);
|
||||||
|
|
||||||
timestamp = System.currentTimeMillis();
|
timestamp = System.currentTimeMillis();
|
||||||
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, ocrResults);
|
Map<Integer, List<OcrResultToWrite>> imageWithTextPositionsPerPage = boldDetector.detectBold(ocrResults, document);
|
||||||
|
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
|
||||||
|
|
||||||
|
timestamp = System.currentTimeMillis();
|
||||||
|
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage);
|
||||||
log.info("Saving document");
|
log.info("Saving document");
|
||||||
document.saveIncremental(out, dictionariesToUpdate);
|
document.saveIncremental(out, dictionariesToUpdate);
|
||||||
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
||||||
|
|||||||
@ -2,11 +2,11 @@ package com.knecon.fforesight.service.ocr.processor.service;
|
|||||||
|
|
||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
@ -20,11 +20,9 @@ import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentPrope
|
|||||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
@ -44,19 +42,17 @@ public class OcrResultWriter {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, List<OcrResult> ocrResults) {
|
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||||
|
|
||||||
FontMetricsFactory fontMetricsFactory = new Type0FontMetricsFactory(document);
|
|
||||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||||
Map<Integer, List<OcrResult>> resultsPerPage = ocrResults.stream().collect(Collectors.groupingBy(result -> result.image().pageNumber()));
|
imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate));
|
||||||
resultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, resultsPerPage, dictionariesToUpdate, fontMetricsFactory));
|
|
||||||
dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
|
dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
|
||||||
return dictionariesToUpdate;
|
return dictionariesToUpdate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void drawResultsPerPage(PDDocument document, Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, Set<COSDictionary> dictionariesToUpdate, FontMetricsFactory fontMetricsFactory) {
|
private void drawResultsPerPage(PDDocument document, Integer pageNumber, List<OcrResultToWrite> ocrResultToWrite, Set<COSDictionary> dictionariesToUpdate) {
|
||||||
|
|
||||||
var pdPage = document.getPage(pageNumber - 1);
|
var pdPage = document.getPage(pageNumber - 1);
|
||||||
|
|
||||||
@ -69,7 +65,7 @@ public class OcrResultWriter {
|
|||||||
|
|
||||||
escapeContentStreams(document, pdPage);
|
escapeContentStreams(document, pdPage);
|
||||||
|
|
||||||
List<TextPositionInImage> words = buildTextPositionsOnPage(pageNumber, resultsPerPage, fontMetricsFactory);
|
List<TextPositionInImage> words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||||
|
|
||||||
// write invisible ocr text inside tagged content
|
// write invisible ocr text inside tagged content
|
||||||
@ -86,7 +82,6 @@ public class OcrResultWriter {
|
|||||||
// write visible ocr text inside optional group
|
// write visible ocr text inside optional group
|
||||||
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
|
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
|
||||||
contentStream.saveGraphicsState();
|
contentStream.saveGraphicsState();
|
||||||
contentStream.setNonStrokingColor(Color.BLUE);
|
|
||||||
words.forEach(word -> drawVisibleWord(word, contentStream));
|
words.forEach(word -> drawVisibleWord(word, contentStream));
|
||||||
contentStream.restoreGraphicsState();
|
contentStream.restoreGraphicsState();
|
||||||
contentStream.endMarkedContent();
|
contentStream.endMarkedContent();
|
||||||
@ -94,7 +89,9 @@ public class OcrResultWriter {
|
|||||||
// write word bounding boxes (tesseract output) inside optional group
|
// write word bounding boxes (tesseract output) inside optional group
|
||||||
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
|
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
|
||||||
contentStream.saveGraphicsState();
|
contentStream.saveGraphicsState();
|
||||||
resultsPerPage.get(pageNumber).stream().map(OcrResult::image).forEach(image -> drawGrid(contentStream, image.position()));
|
ocrResultToWrite.stream()
|
||||||
|
.map(OcrResultToWrite::imageBoundingBox)
|
||||||
|
.forEach(imagePosition -> drawGrid(contentStream, imagePosition));
|
||||||
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
|
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
|
||||||
contentStream.restoreGraphicsState();
|
contentStream.restoreGraphicsState();
|
||||||
contentStream.endMarkedContent();
|
contentStream.endMarkedContent();
|
||||||
@ -105,15 +102,6 @@ public class OcrResultWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<TextPositionInImage> buildTextPositionsOnPage(Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, FontMetricsFactory fontMetricsFactory) {
|
|
||||||
|
|
||||||
return resultsPerPage.get(pageNumber)
|
|
||||||
.stream()
|
|
||||||
.flatMap(result -> result.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> new TextPositionInImage(word, result.image().ctm(), fontMetricsFactory)))
|
|
||||||
.toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
|
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
|
||||||
// We need to append to the contentstream, otherwise the content could be overlapped by images
|
// We need to append to the contentstream, otherwise the content could be overlapped by images
|
||||||
@ -196,6 +184,11 @@ public class OcrResultWriter {
|
|||||||
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
|
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
contentStream.setNonStrokingColor(switch (position.getFontStyle()) {
|
||||||
|
case BOLD -> Color.RED;
|
||||||
|
case ITALIC -> Color.GREEN;
|
||||||
|
default -> Color.BLUE;
|
||||||
|
});
|
||||||
contentStream.beginText();
|
contentStream.beginText();
|
||||||
contentStream.setRenderingMode(renderingMode);
|
contentStream.setRenderingMode(renderingMode);
|
||||||
contentStream.setFont(position.getFont(), (float) position.getFontSize());
|
contentStream.setFont(position.getFont(), (float) position.getFontSize());
|
||||||
|
|||||||
@ -16,12 +16,14 @@ public class Statistics {
|
|||||||
AtomicLong pdf2ImgDuration;
|
AtomicLong pdf2ImgDuration;
|
||||||
AtomicLong writingTextDuration;
|
AtomicLong writingTextDuration;
|
||||||
AtomicLong imageProcessingDuration;
|
AtomicLong imageProcessingDuration;
|
||||||
|
AtomicLong fontStyleDetectionDuration;
|
||||||
|
|
||||||
|
|
||||||
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
||||||
|
|
||||||
this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L)));
|
this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L)));
|
||||||
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
||||||
|
this.fontStyleDetectionDuration = new AtomicLong(0);
|
||||||
this.pdf2ImgDuration = new AtomicLong(0);
|
this.pdf2ImgDuration = new AtomicLong(0);
|
||||||
this.writingTextDuration = new AtomicLong(0);
|
this.writingTextDuration = new AtomicLong(0);
|
||||||
this.imageProcessingDuration = new AtomicLong(0);
|
this.imageProcessingDuration = new AtomicLong(0);
|
||||||
@ -57,12 +59,17 @@ public class Statistics {
|
|||||||
writingTextDuration.addAndGet(duration);
|
writingTextDuration.addAndGet(duration);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void increaseFontStyleDetectionDuration(long duration) {
|
||||||
|
|
||||||
|
fontStyleDetectionDuration.addAndGet(duration);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
return String.format(
|
return String.format(
|
||||||
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s",
|
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s, FontstyleDetection=%.2f s",
|
||||||
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||||
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||||
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||||
@ -71,7 +78,8 @@ public class Statistics {
|
|||||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||||
(float) imageProcessingDuration.get() / 1000,
|
(float) imageProcessingDuration.get() / 1000,
|
||||||
(float) pdf2ImgDuration.get() / 1000,
|
(float) pdf2ImgDuration.get() / 1000,
|
||||||
(float) writingTextDuration.get() / 1000);
|
(float) writingTextDuration.get() / 1000,
|
||||||
|
(float) fontStyleDetectionDuration.get() / 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -36,6 +36,7 @@ public interface FontMetricsFactory {
|
|||||||
|
|
||||||
PDFont getFont();
|
PDFont getFont();
|
||||||
|
|
||||||
|
|
||||||
HeightAndDescent calculateHeightAndDescent(String text);
|
HeightAndDescent calculateHeightAndDescent(String text);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,5 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||||
|
|
||||||
|
/**
 * Font styles that can be assigned to a recognised word.
 */
public enum FontStyle {

    /** Plain text without emphasis. */
    REGULAR,

    /** Text with a heavier stroke weight. */
    BOLD,

    /** Slanted text. */
    ITALIC

}
|
||||||
@ -1,6 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.fontbox.ttf.GlyphData;
|
import org.apache.fontbox.ttf.GlyphData;
|
||||||
import org.apache.fontbox.ttf.TTFParser;
|
import org.apache.fontbox.ttf.TTFParser;
|
||||||
@ -12,22 +15,41 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.services.s3.endpoints.internal.Value;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||||
|
|
||||||
private final PDType0Font type0Font;
|
private final PDType0Font type0Font;
|
||||||
private final TrueTypeFont trueTypeFont;
|
private final TrueTypeFont trueTypeFont;
|
||||||
|
|
||||||
|
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
|
||||||
|
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
|
||||||
|
|
||||||
|
|
||||||
|
public static Type0FontMetricsFactory regular(PDDocument document) {
|
||||||
|
|
||||||
|
return createFromResource("fonts/cmu-regular.ttf", document);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Type0FontMetricsFactory bold(PDDocument document) {
|
||||||
|
|
||||||
|
return createFromResource("fonts/cmu-bold.ttf", document);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public Type0FontMetricsFactory(PDDocument document) {
|
private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
|
||||||
|
|
||||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream("fonts/cmu-regular.ttf"); var buffer = new RandomAccessReadBuffer(in)) {
|
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
|
||||||
this.trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||||
this.type0Font = PDType0Font.load(document, this.trueTypeFont, false); // use Type0Font for unicode support
|
PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
|
||||||
|
return new Type0FontMetricsFactory(type0Font, trueTypeFont);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -55,8 +77,9 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
|
|||||||
if (glyph == null || glyph.getBoundingBox() == null) {
|
if (glyph == null || glyph.getBoundingBox() == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (!slashGlyphIds.contains(glyphId)) {
|
||||||
descent = Math.min(descent, glyph.getYMinimum());
|
descent = Math.min(descent, glyph.getYMinimum());
|
||||||
|
}
|
||||||
height = Math.max(height, glyph.getYMaximum());
|
height = Math.max(height, glyph.getYMaximum());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);
|
log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);
|
||||||
|
|||||||
@ -0,0 +1,158 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||||
|
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.TextPositionAndWordImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.WordImage;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class FontStyleDetector {
|
||||||
|
|
||||||
|
OcrServiceSettings settings;
|
||||||
|
StrokeWidthCalculator strokeWidthCalculator;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implementation of the MOBDoB algorithm, refer to the paper here:
|
||||||
|
* <a href="http://mile.ee.iisc.ac.in/publications/softCopy/DocumentAnalysis/Sai_NCVPRIPG2013.pdf">Script Independent Detection of Bold Words in Multi Font-size Documents</a>
|
||||||
|
* <p>
|
||||||
|
* As a high level overview: We cluster all text based on its font size. We determine the cluster with the most words. This is assumed to be regular text.
|
||||||
|
* We then estimate the average stroke width of that cluster by thinning all text to a single pixel and calculating the ratio of remaining pixels.
|
||||||
|
* (<a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>)
|
||||||
|
* For each word we scale this average strokewidth based on its fontsize compared to the most common fontsize.
|
||||||
|
* Using the scaled strokewidth we do an opening operation.
|
||||||
|
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
|
||||||
|
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
|
||||||
|
* <p>
|
||||||
|
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
|
||||||
|
* But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
|
||||||
|
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
|
||||||
|
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
|
||||||
|
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
|
||||||
|
*/
|
||||||
|
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {
|
||||||
|
|
||||||
|
FontMetricsFactory fontMetricsFactory = Type0FontMetricsFactory.regular(document);
|
||||||
|
if (!settings.isBoldDetection()) {
|
||||||
|
return OcrResultToWrite.buildOcrResultsToWrite(ocrResults, fontMetricsFactory);
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage = new HashMap<>();
|
||||||
|
|
||||||
|
DBSCANClusterer<TextPositionAndWordImage> clusterer = new DBSCANClusterer<>(0.5, 1);
|
||||||
|
|
||||||
|
FontMetricsFactory boldFontMetricsFactory = Type0FontMetricsFactory.bold(document);
|
||||||
|
|
||||||
|
for (OcrResult result : ocrResults) {
|
||||||
|
FontStyleDetectionModel fontStyleDetectionModel = FontStyleDetectionModel.fromOcrResult(result, fontMetricsFactory, settings);
|
||||||
|
|
||||||
|
List<Cluster<TextPositionAndWordImage>> clusters = clusterer.cluster(fontStyleDetectionModel.getTextPositionsAndWordImages());
|
||||||
|
Optional<Cluster<TextPositionAndWordImage>> largestCluster = clusters.stream().max(Comparator.comparingInt(cluster -> cluster.getPoints().size()));
|
||||||
|
|
||||||
|
if (largestCluster.isEmpty()) {
|
||||||
|
insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TextPositionAndWordImage> wordsWithMostCommonTextHeight = largestCluster.get().getPoints();
|
||||||
|
|
||||||
|
double standardTextHeight = calculateStandardTextheight(wordsWithMostCommonTextHeight);
|
||||||
|
double regularStrokeWidth = calculateRegularStrokeWidth(wordsWithMostCommonTextHeight);
|
||||||
|
|
||||||
|
for (TextPositionAndWordImage textPositionsAndWordImage : fontStyleDetectionModel.getTextPositionsAndWordImages()) {
|
||||||
|
decideOnFontStyle(textPositionsAndWordImage, regularStrokeWidth, standardTextHeight, boldFontMetricsFactory);
|
||||||
|
}
|
||||||
|
|
||||||
|
insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
|
||||||
|
fontStyleDetectionModel.dispose();
|
||||||
|
}
|
||||||
|
|
||||||
|
log.info("Finished bold detection");
|
||||||
|
return ocrResultToWritePerPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static double calculateStandardTextheight(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
|
||||||
|
|
||||||
|
return wordsWithMostCommonTextHeight.stream()
|
||||||
|
.map(TextPositionAndWordImage::getWordImage)
|
||||||
|
.mapToDouble(WordImage::getTextHeight)
|
||||||
|
.filter(Double::isFinite)
|
||||||
|
.average()
|
||||||
|
.orElseThrow();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calculateRegularStrokeWidth(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
|
||||||
|
|
||||||
|
return wordsWithMostCommonTextHeight.stream()
|
||||||
|
.mapToDouble(textPositionAndWordImage -> strokeWidthCalculator.calculate(textPositionAndWordImage.getWordImage().getImage()))
|
||||||
|
.filter(Double::isFinite)
|
||||||
|
.average()
|
||||||
|
.orElseThrow();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void insertResultIntoMap(int pageNumber, Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage, FontStyleDetectionModel fontStyleDetectionModel) {
|
||||||
|
|
||||||
|
OcrResultToWrite ocrResult = OcrResultToWrite.fromFontStyleDetectionModel(fontStyleDetectionModel);
|
||||||
|
|
||||||
|
ocrResultToWritePerPage.compute(pageNumber, (key, existingList) -> {
|
||||||
|
if (existingList == null) {
|
||||||
|
return List.of(ocrResult);
|
||||||
|
} else {
|
||||||
|
return Stream.concat(existingList.stream(), Stream.of(ocrResult)).toList();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void decideOnFontStyle(TextPositionAndWordImage textPositionsAndWordImage,
|
||||||
|
double standardStrokeWidth,
|
||||||
|
double standardTextHeight,
|
||||||
|
FontMetricsFactory boldFontMetricsFactory) {
|
||||||
|
|
||||||
|
double scaledStrokeWidth = scaleStrokeWidthByFontSize(textPositionsAndWordImage, standardStrokeWidth, standardTextHeight);
|
||||||
|
|
||||||
|
if (textPositionsAndWordImage.getWordImage().hasLargerStrokeWidth(scaledStrokeWidth)) {
|
||||||
|
textPositionsAndWordImage.getTextPositionInImage().setFontMetricsFactory(boldFontMetricsFactory);
|
||||||
|
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.BOLD);
|
||||||
|
} else {
|
||||||
|
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.REGULAR);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static double scaleStrokeWidthByFontSize(TextPositionAndWordImage textPositionsAndWordImage, double standardStrokeWidth, double standardFontSize) {
|
||||||
|
|
||||||
|
double influenceOfFontSize = 1.0; // the paper states that stroke width scales exactly linearly with font size. This did not seem to be true for me. Maybe some of the preprocessing steps are affecting this.
|
||||||
|
double fontsizeScalingFactor = Math.sqrt(textPositionsAndWordImage.getWordImage().getTextHeight() / standardFontSize);
|
||||||
|
return standardStrokeWidth + (influenceOfFontSize * (fontsizeScalingFactor - 1) * standardStrokeWidth);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,57 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.lept4j.Sel;
|
||||||
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This code is a good start for detecting italic text, although it has a few issues especially with glyphs which are naturally slanted. E.g. z, 2, 7, /
|
||||||
|
* If we want this maybe we should exclude these glyphs and then it might have less false positives. But in its current state i don't recommend using it.
|
||||||
|
*/
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
|
public class ItalicDetector {
|
||||||
|
|
||||||
|
|
||||||
|
static String italicKernel = "ooxxooxxooxxoxxooXxooxxoxxooxxooxxoo";
|
||||||
|
Sel italicSel = Leptonica1.selCreateFromString(italicKernel, 9, 4, "italicKernel");
|
||||||
|
Sel brickSel = Leptonica1.selCreateBrick(3, 4, 1, 2, 1);
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isItalic(Pix pix) {
|
||||||
|
|
||||||
|
Pix preprocessed = preprocess(pix);
|
||||||
|
Pix flipped = Leptonica1.pixFlipLR(null, pix);
|
||||||
|
Pix flippedPreprocessed = preprocess(flipped);
|
||||||
|
Leptonica1.pixFlipLR(flippedPreprocessed, flippedPreprocessed);
|
||||||
|
double pixelDensity = ImageProcessingUtils.calculatePixelDensity(preprocessed);
|
||||||
|
double flippedPixelDensity = ImageProcessingUtils.calculatePixelDensity(flippedPreprocessed);
|
||||||
|
LeptUtils.disposePix(preprocessed);
|
||||||
|
LeptUtils.disposePix(flipped);
|
||||||
|
LeptUtils.disposePix(flippedPreprocessed);
|
||||||
|
return flippedPixelDensity / pixelDensity < 0.85;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Pix preprocess(Pix pix) {
|
||||||
|
|
||||||
|
Pix eroded = Leptonica1.pixErode(null, pix, italicSel.getPointer());
|
||||||
|
Pix dilated = Leptonica1.pixDilate(null, eroded, brickSel.getPointer());
|
||||||
|
LeptUtils.disposePix(eroded);
|
||||||
|
return dilated;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void dispose() {
|
||||||
|
|
||||||
|
LeptUtils.dispose(italicSel);
|
||||||
|
LeptUtils.dispose(brickSel);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,58 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||||
|
|
||||||
|
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
|
||||||
|
|
||||||
|
import java.nio.IntBuffer;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.lept4j.Sela;
|
||||||
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class StrokeWidthCalculator {
|
||||||
|
|
||||||
|
Sela thinningSel;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses a series of sels to thin all connected lines to a single pixel. Then the pixel ratio is a good estimation of the stroke width in pixels.
|
||||||
|
* <a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>
|
||||||
|
* Since the baseline is a strokewidth of exactly one, we need to add 1 to the result.
|
||||||
|
*
|
||||||
|
* @param input binarized pix with text on it
|
||||||
|
* @return estimated stroke width in pixels
|
||||||
|
*/
|
||||||
|
public double calculate(Pix input) {
|
||||||
|
|
||||||
|
init();
|
||||||
|
|
||||||
|
Pix thinned = Leptonica1.pixThinConnectedBySet(input, L_THIN_FG, thinningSel, 0);
|
||||||
|
|
||||||
|
IntBuffer thinnedPixelCount = IntBuffer.allocate(1);
|
||||||
|
Leptonica1.pixCountPixels(thinned, thinnedPixelCount, null);
|
||||||
|
|
||||||
|
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||||
|
Leptonica1.pixCountPixels(input, pixelCount, null);
|
||||||
|
|
||||||
|
LeptUtils.disposePix(thinned);
|
||||||
|
|
||||||
|
return (double) pixelCount.get() / thinnedPixelCount.get() + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void init() {
|
||||||
|
|
||||||
|
if (thinningSel == null) {
|
||||||
|
thinningSel = Leptonica1.selaMakeThinSets(1, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -17,7 +17,6 @@ import lombok.experimental.FieldDefaults;
|
|||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import net.sourceforge.tess4j.TessAPI1;
|
import net.sourceforge.tess4j.TessAPI1;
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously
|
This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously
|
||||||
*/
|
*/
|
||||||
@ -38,9 +37,17 @@ public class BlockingQueueFiller extends Thread {
|
|||||||
public void run() {
|
public void run() {
|
||||||
|
|
||||||
// Interrupting signals that the image extraction has finished
|
// Interrupting signals that the image extraction has finished
|
||||||
while (!allImagesQueued) {
|
try {
|
||||||
|
while (!allImagesQueued) {
|
||||||
final UnprocessedImage image = imageInputQueue.take();
|
final UnprocessedImage image = imageInputQueue.take();
|
||||||
imageOutputQueue.put(image);
|
try {
|
||||||
|
imageOutputQueue.put(image);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
imageOutputQueue.put(image);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.info("All images extracted, emptying processing queue and stopping");
|
||||||
}
|
}
|
||||||
|
|
||||||
// empty the queue
|
// empty the queue
|
||||||
@ -54,4 +61,5 @@ public class BlockingQueueFiller extends Thread {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,8 +4,6 @@ import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
|||||||
|
|
||||||
import java.nio.FloatBuffer;
|
import java.nio.FloatBuffer;
|
||||||
import java.nio.IntBuffer;
|
import java.nio.IntBuffer;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
|
|
||||||
@ -29,6 +27,8 @@ import lombok.Setter;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import net.sourceforge.lept4j.L_Kernel;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
import net.sourceforge.lept4j.Pix;
|
import net.sourceforge.lept4j.Pix;
|
||||||
import net.sourceforge.lept4j.util.LeptUtils;
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
import net.sourceforge.tess4j.ITessAPI;
|
import net.sourceforge.tess4j.ITessAPI;
|
||||||
@ -45,6 +45,7 @@ public class ImageProcessingThread extends Thread {
|
|||||||
final BlockingQueue<UnprocessedImage> imageInputQueue;
|
final BlockingQueue<UnprocessedImage> imageInputQueue;
|
||||||
final BlockingQueue<OcrImage> imageOutputQueue;
|
final BlockingQueue<OcrImage> imageOutputQueue;
|
||||||
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||||
|
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.2f, 1);
|
||||||
final Statistics stats;
|
final Statistics stats;
|
||||||
final OcrServiceSettings settings;
|
final OcrServiceSettings settings;
|
||||||
final PDDocument document;
|
final PDDocument document;
|
||||||
@ -81,7 +82,9 @@ public class ImageProcessingThread extends Thread {
|
|||||||
log.debug("No images left in processing queue, stopping.");
|
log.debug("No images left in processing queue, stopping.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TessAPI1.TessBaseAPIEnd(this.detectionScriptHandle);
|
||||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||||
|
LeptUtils.dispose(gaussianKernel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -106,7 +109,7 @@ public class ImageProcessingThread extends Thread {
|
|||||||
|
|
||||||
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
||||||
|
|
||||||
Pix pix = binarize(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
||||||
|
|
||||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||||
@ -129,7 +132,7 @@ public class ImageProcessingThread extends Thread {
|
|||||||
|
|
||||||
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
|
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
|
||||||
|
|
||||||
Pix pix = binarize(extractedImage.asPix(), imageDPI, settings.getDpi());
|
Pix pix = processPix(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||||
|
|
||||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||||
@ -163,7 +166,7 @@ public class ImageProcessingThread extends Thread {
|
|||||||
|
|
||||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||||
scriptureNameBuffer = new PointerByReference();
|
scriptureNameBuffer = new PointerByReference(); // Is this memory being freed?
|
||||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||||
|
|
||||||
int orientationDegree = 0;
|
int orientationDegree = 0;
|
||||||
@ -183,15 +186,58 @@ public class ImageProcessingThread extends Thread {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private Pix binarize(Pix pix, float imageDpi, int targetDpi) {
|
private Pix processPix(Pix pix, float imageDpi, int targetDpi) {
|
||||||
|
|
||||||
Pix grayScale = ImageProcessingUtils.convertToGrayScale(pix);
|
Pix grayScale;
|
||||||
Pix scaledUp = ImageProcessingUtils.scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
Pix scaledUp;
|
||||||
return ImageProcessingUtils.despecklePix(scaledUp);
|
Pix gaussian;
|
||||||
|
Pix binarized;
|
||||||
|
|
||||||
|
//convert to grayscale
|
||||||
|
if (pix.d == 8) {
|
||||||
|
grayScale = pix;
|
||||||
|
} else if (pix.d == 32) {
|
||||||
|
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||||
|
} else if (pix.d == 1) {
|
||||||
|
grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||||
|
} else {
|
||||||
|
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
||||||
|
}
|
||||||
|
|
||||||
|
// scale up
|
||||||
|
float targetFactor = targetDpi / imageDpi;
|
||||||
|
if (targetFactor > 2.1) {
|
||||||
|
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||||
|
} else if (targetFactor > 1.1) {
|
||||||
|
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||||
|
} else {
|
||||||
|
scaledUp = grayScale;
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove noise and prep for Otsu
|
||||||
|
gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
|
||||||
|
|
||||||
|
// Threshold to binary
|
||||||
|
if (pix.w < 100 || pix.h < 100) {
|
||||||
|
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||||
|
} else {
|
||||||
|
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null);
|
||||||
|
|
||||||
|
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
|
||||||
|
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LeptUtils.disposePix(pix);
|
||||||
|
LeptUtils.disposePix(grayScale);
|
||||||
|
LeptUtils.disposePix(scaledUp);
|
||||||
|
LeptUtils.disposePix(gaussian);
|
||||||
|
|
||||||
|
return binarized;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||||
|
|
||||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||||
|
|||||||
@ -116,16 +116,11 @@ public class OCRThread extends Thread {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
||||||
|
|
||||||
if (settings.isDebug()) {
|
Leptonica1.pixWrite(tesseractOutputFileName + ".tiff", pix, 5); // write the used image for later bold detection
|
||||||
String[] a = tesseractOutputFileName.split("/");
|
|
||||||
String folder = "/tmp/pixs/" + a[a.length - 3];
|
|
||||||
new File(folder).mkdirs();
|
|
||||||
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
||||||
instance.setPageSegMode(psm);
|
instance.setPageSegMode(psm);
|
||||||
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -19,10 +19,12 @@ public class OcrServiceSettings {
|
|||||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||||
int minImageHeight = 20; // Minimum height for images to be processed
|
int minImageHeight = 20; // Minimum height for images to be processed
|
||||||
int minImageWidth = 20; // Minimum width for images to be processed
|
int minImageWidth = 20; // Minimum width for images to be processed
|
||||||
float minRotationConfidence = 2; //
|
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
|
||||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||||
boolean removeWatermark; // If true, watermarks will be removed
|
boolean removeWatermark; // If true, watermarks will be removed
|
||||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||||
|
boolean boldDetection = true; // if true, bold detection will be attempted
|
||||||
|
double boldThreshold = 0.5; // Each word is opened with a brick of the average stroke width; if the ratio of remaining pixels exceeds this threshold, the word is classified as bold.
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -6,14 +6,17 @@ import java.awt.Graphics;
|
|||||||
import java.awt.Graphics2D;
|
import java.awt.Graphics2D;
|
||||||
import java.awt.Transparency;
|
import java.awt.Transparency;
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.nio.IntBuffer;
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||||
|
import com.sun.jna.ptr.PointerByReference;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
import net.sourceforge.lept4j.L_Kernel;
|
||||||
import net.sourceforge.lept4j.Leptonica1;
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
import net.sourceforge.lept4j.Pix;
|
import net.sourceforge.lept4j.Pix;
|
||||||
import net.sourceforge.lept4j.util.LeptUtils;
|
import net.sourceforge.lept4j.util.LeptUtils;
|
||||||
@ -37,67 +40,6 @@ public class ImageProcessingUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Pix despecklePix(Pix pix) {
|
|
||||||
|
|
||||||
assert pix.d == 8;
|
|
||||||
Pix despeckled;
|
|
||||||
if (pix.w < 100 || pix.h < 100) {
|
|
||||||
// too small to properly despeckle, just binarize instead.
|
|
||||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
|
||||||
} else {
|
|
||||||
despeckled = LeptUtils.despeckle(pix,
|
|
||||||
LeptUtils.SEL_STR3,
|
|
||||||
3); // sometimes this fails and I can't figure out why. Then we skip the despeckling and just simply convert to binary. Might have something to do with Imagesize, not sure though...
|
|
||||||
if (despeckled == null) {
|
|
||||||
despeckled = Leptonica1.pixThresholdToBinary(pix, 180);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (pix != despeckled) {
|
|
||||||
LeptUtils.disposePix(pix);
|
|
||||||
}
|
|
||||||
return despeckled;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
|
|
||||||
|
|
||||||
float targetFactor = targetDpi / imageDpi;
|
|
||||||
|
|
||||||
if (targetFactor > 3) {
|
|
||||||
Pix scaledUp;
|
|
||||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
|
||||||
LeptUtils.disposePix(grayScale);
|
|
||||||
return scaledUp;
|
|
||||||
} else if (targetFactor > 1.9) {
|
|
||||||
Pix scaledUp;
|
|
||||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
|
||||||
LeptUtils.disposePix(grayScale);
|
|
||||||
return scaledUp;
|
|
||||||
} else {
|
|
||||||
return grayScale;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public static Pix convertToGrayScale(Pix pix) {
|
|
||||||
|
|
||||||
if (pix.d == 8) {
|
|
||||||
return pix;
|
|
||||||
} else if (pix.d == 32) {
|
|
||||||
Pix grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
|
||||||
LeptUtils.disposePix(pix);
|
|
||||||
return grayScale;
|
|
||||||
} else if (pix.d == 1) {
|
|
||||||
Pix grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
|
||||||
LeptUtils.disposePix(pix);
|
|
||||||
return grayScale;
|
|
||||||
} else {
|
|
||||||
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Pix deRotatePix(int orientDegree, Pix pix) {
|
public Pix deRotatePix(int orientDegree, Pix pix) {
|
||||||
|
|
||||||
return switch (360 - orientDegree) {
|
return switch (360 - orientDegree) {
|
||||||
@ -128,4 +70,16 @@ public class ImageProcessingUtils {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static double calculatePixelDensity(Pix pix) {
|
||||||
|
|
||||||
|
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||||
|
int result = Leptonica1.pixCountPixels(pix, pixelCount, null);
|
||||||
|
if (result == 0) {
|
||||||
|
return (double) pixelCount.get() / (pix.h * pix.w);
|
||||||
|
} else {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,73 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
import net.sourceforge.lept4j.L_Kernel;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class KernelUtils {
|
||||||
|
|
||||||
|
/*
|
||||||
|
-1, -1, -1
|
||||||
|
-1, 8, -1
|
||||||
|
-1, -1, -1
|
||||||
|
*/
|
||||||
|
public L_Kernel createFullLaplacianKernel() {
|
||||||
|
|
||||||
|
L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 0, 0, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 8);
|
||||||
|
return laplacianKernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
0, 0, -1, 0, 0
|
||||||
|
0, -1, -1, -1, 0
|
||||||
|
-1, -1, 12, -1, -1
|
||||||
|
0, -1, -1, -1, 0
|
||||||
|
0, 0, -1, 0, 0
|
||||||
|
*/
|
||||||
|
public L_Kernel createLaplacianKernel5x5() {
|
||||||
|
|
||||||
|
L_Kernel laplacianKernel = Leptonica1.kernelCreate(5, 5);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 3, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 3, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 4, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 3, 1, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 3, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 3, 3, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 4, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 2, 12);
|
||||||
|
return laplacianKernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
0, -1, 0
|
||||||
|
-1, 4, -1
|
||||||
|
0, -1, 0
|
||||||
|
*/
|
||||||
|
public L_Kernel createLaplacianKernel() {
|
||||||
|
|
||||||
|
L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||||
|
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 4);
|
||||||
|
return laplacianKernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
@ -138,4 +138,11 @@ public class Tesseract2 extends Tesseract1 {
|
|||||||
return renderer;
|
return renderer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void dispose() {
|
||||||
|
|
||||||
|
TessBaseAPIEnd(getHandle());
|
||||||
|
TessBaseAPIDelete(getHandle());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -20,7 +20,7 @@ class Type0FontMetricsFactoryTest {
|
|||||||
public void testStringWidth() {
|
public void testStringWidth() {
|
||||||
|
|
||||||
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
|
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
|
||||||
Type0FontMetricsFactory metricsFactory = new Type0FontMetricsFactory(document);
|
Type0FontMetricsFactory metricsFactory = Type0FontMetricsFactory.regular(document);
|
||||||
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
|
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -41,8 +41,15 @@ fforesight:
|
|||||||
ignored-endpoints: [ '/actuator/health', '/actuator/health/**' ]
|
ignored-endpoints: [ '/actuator/health', '/actuator/health/**' ]
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
||||||
|
logging.pattern.level: "%5p [${spring.application.name},%X{traceId:-},%X{spanId:-}]"
|
||||||
|
|
||||||
management:
|
management:
|
||||||
|
tracing:
|
||||||
|
sampling:
|
||||||
|
probability: 1.0
|
||||||
|
otlp:
|
||||||
|
tracing:
|
||||||
|
endpoint: http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces
|
||||||
endpoint:
|
endpoint:
|
||||||
metrics.enabled: ${monitoring.enabled:false}
|
metrics.enabled: ${monitoring.enabled:false}
|
||||||
prometheus.enabled: ${monitoring.enabled:false}
|
prometheus.enabled: ${monitoring.enabled:false}
|
||||||
|
|||||||
@ -64,7 +64,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testOcr() {
|
public void testOcr() {
|
||||||
|
|
||||||
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
String text = testOCR("files/402Study.pdf");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -162,13 +162,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testOcrForSpecificFile() {
|
public void testOcrForSpecificFile() {
|
||||||
|
|
||||||
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/F.2. A16003E - Acute Inhalation Study.pdf"));
|
testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles2/A16361B - Acute Dermal Toxicity Study in Rats.pdf"));
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/A23220A - 404 - Skin Irritation in vivo.pdf"));
|
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/G.1.2 - 1768300_MMNA_A13617AV_report.pdf"));
|
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 17_Toxicidade Inalatoria Aguda.pdf"));
|
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf"));
|
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/ITEM 23_A15149W - Dermal absorption of formulated product.pdf"));
|
|
||||||
// testOCRForFile(new File("/home/kschuettler/Dokumente/TestFiles/OcrMassTest/SOLICITA_VICTRATO-GOLD-II_Item 16_Toxicidade Cutanea Aguda.pdf"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user