Compare commits
30 Commits
certificat
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5fca39728b | ||
|
|
cd6390fde1 | ||
|
|
bc459ee966 | ||
|
|
47e7f8b297 | ||
|
|
22392e083d | ||
|
|
52a1fb4a05 | ||
|
|
378436cb2f | ||
|
|
f1204acc60 | ||
|
|
998755c3e3 | ||
|
|
c598f62633 | ||
|
|
2e25ee2155 | ||
|
|
7f04fb3c6f | ||
|
|
ff32f016eb | ||
|
|
821ef265fe | ||
|
|
7fcb6652ef | ||
|
|
61b1010e24 | ||
|
|
7b5a175440 | ||
|
|
18ba1daaef | ||
|
|
c61f71871e | ||
|
|
cc2937d0d2 | ||
|
|
71255d9fc9 | ||
|
|
1f9dac17e3 | ||
|
|
5712292698 | ||
|
|
1395318e18 | ||
|
|
842b794153 | ||
|
|
4b3ccc28e2 | ||
|
|
b469ea4174 | ||
|
|
253bb70519 | ||
|
|
d55f245c5e | ||
|
|
7ed1632c6f |
@ -1,3 +1,7 @@
|
||||
variables:
|
||||
# SONAR_PROJECT_KEY: 'ocr-service:ocr-service-server'
|
||||
GIT_SUBMODULE_STRATEGY: recursive
|
||||
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
||||
include:
|
||||
- project: 'gitlab/gitlab'
|
||||
ref: 'main'
|
||||
|
||||
8
.gitmodules
vendored
Normal file
8
.gitmodules
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta"]
|
||||
path = ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta
|
||||
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
|
||||
update = merge
|
||||
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/basf"]
|
||||
path = ocr-service-v1/ocr-service-server/src/test/resources/files/basf
|
||||
url = https://gitlab.knecon.com/fforesight/documents/basf.git
|
||||
update = merge
|
||||
@ -15,6 +15,7 @@ dependencies {
|
||||
api("com.iqser.red.commons:metric-commons:2.1.0")
|
||||
api("com.iqser.red.commons:storage-commons:2.45.0")
|
||||
api("com.knecon.fforesight:tenant-commons:0.21.0")
|
||||
api("com.knecon.fforesight:lifecycle-commons:0.6.0")
|
||||
api("com.pdftron:PDFNet:10.5.0")
|
||||
api("org.apache.pdfbox:pdfbox:3.0.0")
|
||||
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||
@ -24,7 +25,7 @@ dependencies {
|
||||
api("io.github.karols:hocr4j:0.2.0")
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.89.0")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.125.0")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
}
|
||||
|
||||
@ -1,17 +1,20 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.initializer;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.sun.jna.NativeLibrary;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class PDFNetInitializer {
|
||||
public class NativeLibrariesInitializer {
|
||||
|
||||
@Value("${pdftron.license:}")
|
||||
private String pdftronLicense;
|
||||
@ -22,8 +25,25 @@ public class PDFNetInitializer {
|
||||
// Do not change this back to an ApplicationRunner: with an ApplicationRunner, messages are taken from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
|
||||
public void init() {
|
||||
|
||||
log.info("Initializing Native Libraries");
|
||||
log.info("Setting pdftron license: {}", pdftronLicense);
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
|
||||
log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
|
||||
log.info("Asserting Native Libraries loaded");
|
||||
|
||||
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
|
||||
assert leptonicaLib != null;
|
||||
log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
|
||||
}
|
||||
|
||||
try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) {
|
||||
assert tesseractLib != null;
|
||||
log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -34,6 +35,16 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
||||
new Point2D.Double(bounds.getRight(), bounds.getBottom()));
|
||||
}
|
||||
|
||||
public Rectangle2D getBounds2D() {
|
||||
|
||||
double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX());
|
||||
double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY());
|
||||
double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX());
|
||||
double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY());
|
||||
|
||||
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getTransformed(AffineTransform at) {
|
||||
|
||||
|
||||
@ -24,7 +24,7 @@ public class RenderedPageOcrImage implements OcrImage {
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
double scalingFactor = calculateScalingFactor();
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
|
||||
@ -3,19 +3,23 @@ package com.knecon.fforesight.service.ocr.processor.service;
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
@ -26,6 +30,9 @@ import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -39,96 +46,161 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrResultWriter {
|
||||
|
||||
public static final Color REGULAR_TEXT_COLOR = Color.BLUE;
|
||||
public static final Color BOLD_TEXT_COLOR = Color.CYAN;
|
||||
|
||||
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED;
|
||||
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED;
|
||||
|
||||
ViewerDocumentService viewerDocumentService;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
|
||||
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
|
||||
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
|
||||
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = new HashMap<>();
|
||||
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = new HashMap<>();
|
||||
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = new HashMap<>();
|
||||
|
||||
try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) {
|
||||
|
||||
for (Integer pageNumber : imagesWithResultsPerPage.keySet()) {
|
||||
|
||||
List<Rectangle2D> textBBoxes = getTextBBoxes(doc.getPage(pageNumber));
|
||||
|
||||
ocrVisualizationsOnPages.put(pageNumber - 1, createVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
|
||||
ocrTextDebugVisualizationsOnPages.put(pageNumber - 1, createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
|
||||
ocrBBoxDebugVisualizationsOnPages.put(pageNumber - 1, createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)));
|
||||
}
|
||||
}
|
||||
|
||||
Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
||||
|
||||
List<Visualizations> debugVisualizations = List.of(visualizations,
|
||||
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
|
||||
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
|
||||
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
|
||||
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
|
||||
|
||||
viewerDocumentService.addVisualizationsOnPage(document, document, visualizations);
|
||||
viewerDocumentService.addVisualizationsOnPage(document, document, List.of(visualizations));
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
|
||||
}
|
||||
|
||||
|
||||
private Map<Integer, VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
@SuppressWarnings("PMD")
|
||||
private List<Rectangle2D> getTextBBoxes(Page page) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber))));
|
||||
List<Rectangle2D> textBBoxes = new ArrayList<>();
|
||||
try (var textExtractor = new TextExtractor()) {
|
||||
textExtractor.begin(page);
|
||||
try {
|
||||
|
||||
for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = getNextLine(line)) {
|
||||
for (TextExtractor.Word word = line.getFirstWord(); word.isValid(); word = getNextWord(word)) {
|
||||
textBBoxes.add(Converter.toRectangle2D(word.getBBox()));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("Could not get word dimension, {}", e.getMessage());
|
||||
}
|
||||
return textBBoxes;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Function<Integer, Integer> pageNumber1IdxTo0IdxMapper() {
|
||||
// PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index
|
||||
return p -> p - 1;
|
||||
private static TextExtractor.Word getNextWord(TextExtractor.Word word) {
|
||||
|
||||
TextExtractor.Word nextWord = word.getNextWord();
|
||||
word.close();
|
||||
return nextWord;
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
private static TextExtractor.Line getNextLine(TextExtractor.Line line) {
|
||||
|
||||
TextExtractor.Line newLine = line.getNextLine();
|
||||
line.close();
|
||||
return newLine;
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> ignoreZones) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream()
|
||||
.map(OcrResultToWrite::textPositionInImage)
|
||||
.flatMap(Collection::stream)
|
||||
.filter(word -> ignoreZones.stream()
|
||||
.noneMatch(ignoreZone -> word.getTransformedTextBBox().getBounds2D().intersects(ignoreZone)))
|
||||
.toList();
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<PlacedText> placedTexts = words.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
Color.BLACK,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.NEITHER)))
|
||||
null,
|
||||
Color.BLACK,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.NEITHER)))
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
|
||||
}
|
||||
|
||||
|
||||
private Map<Integer, VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber))));
|
||||
}
|
||||
List<TextPositionInImage> wordsToDraw = new ArrayList<>();
|
||||
List<TextPositionInImage> ignoredWords = new ArrayList<>();
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<PlacedText> placedTexts = words.stream()
|
||||
for (OcrResultToWrite ocrResultToWrite : ocrResultsToWrite) {
|
||||
for (TextPositionInImage textPositionInImage : ocrResultToWrite.textPositionInImage()) {
|
||||
if (textBBoxes.stream()
|
||||
.anyMatch(ignoreZone -> textPositionInImage.getTransformedTextBBox().getBounds2D().intersects(ignoreZone))) {
|
||||
ignoredWords.add(textPositionInImage);
|
||||
} else {
|
||||
wordsToDraw.add(textPositionInImage);
|
||||
}
|
||||
}
|
||||
}
|
||||
Stream<PlacedText> placedTexts = wordsToDraw.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)))
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
|
||||
}
|
||||
null,
|
||||
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_COLOR : BOLD_TEXT_COLOR,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)));
|
||||
|
||||
Stream<PlacedText> placedTexts2 = ignoredWords.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_IN_IGNORE_ZONE : BOLD_TEXT_IN_IGNORE_ZONE,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)));
|
||||
|
||||
private Map<Integer, VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber))));
|
||||
return VisualizationsOnPage.builder()
|
||||
.placedTexts(Stream.of(placedTexts, placedTexts2)
|
||||
.flatMap(Function.identity())
|
||||
.toList())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream()
|
||||
.map(OcrResultToWrite::textPositionInImage)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
List<ColoredLine> coloredLines = Stream.concat(//
|
||||
words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),//
|
||||
ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)//
|
||||
).flatMap(Collection::stream).toList();
|
||||
words.stream()
|
||||
.map(TextPositionInImage::getTransformedTextBBox)
|
||||
.map(this::quadPointAsLines),//
|
||||
ocrResultsToWrite.stream()
|
||||
.map(OcrResultToWrite::imageBoundingBox)
|
||||
.map(this::createGrid)//
|
||||
)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
|
||||
}
|
||||
|
||||
@ -136,9 +208,9 @@ public class OcrResultWriter {
|
||||
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -227,7 +227,7 @@ public class ImageProcessingThread extends Thread {
|
||||
if (pix.w < 100 || pix.h < 100) {
|
||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||
} else {
|
||||
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null);
|
||||
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null);
|
||||
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
|
||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||
}
|
||||
|
||||
@ -3,7 +3,7 @@ import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
|
||||
plugins {
|
||||
application
|
||||
id("com.iqser.red.service.java-conventions")
|
||||
id("org.springframework.boot") version "3.1.5"
|
||||
id("org.springframework.boot") version "3.2.3"
|
||||
id("io.spring.dependency-management") version "1.1.3"
|
||||
id("org.sonarqube") version "4.3.0.3225"
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
@ -17,14 +17,14 @@ configurations {
|
||||
}
|
||||
}
|
||||
|
||||
val springBootStarterVersion = "3.1.5"
|
||||
val springBootStarterVersion = "3.2.3"
|
||||
|
||||
dependencies {
|
||||
implementation(project(":ocr-service-processor"))
|
||||
implementation(project(":ocr-service-api"))
|
||||
|
||||
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
||||
implementation("com.knecon.fforesight:tracing-commons:0.7.0")
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1")
|
||||
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
||||
|
||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||
@ -39,7 +39,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
|
||||
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
|
||||
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
|
||||
environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.26/Resource/Init/") // set ghostscript lib path
|
||||
environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.55.0/Resource/Init/") // set ghostscript lib path, version in path must match version in Aptfile
|
||||
environment.put("BPE_FONTCONFIG_PATH", "/layers/fagiani_apt/apt/etc/fonts/") // set ghostscript fontconfig path
|
||||
|
||||
var aptfile = layout.projectDirectory.file("src/main/resources/Aptfile").toString()
|
||||
@ -53,7 +53,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
|
||||
buildpacks.set(
|
||||
listOf(
|
||||
"ghcr.io/fagiani/buildpacks/fagiani_apt@sha256:6471c8c70f32b749e29f65ae562ac0339fecad26aa9217628c00a6c31f197dae",
|
||||
"ghcr.io/knsita/buildpacks/fagiani_apt@sha256:9771d4d27d8050aee62769490b8882fffc794745c129fb98e1f33196e2c93504",
|
||||
"ghcr.io/kschuettler/knecon-vcpkg@sha256:ba5e967b124de4865ff7e8f565684f752dd6e97b302e2dcf651283f6a19b98b9",
|
||||
"ghcr.io/kschuettler/knecon-tessdata@sha256:9062f728aa0340ac963bcdd6f5e740d683823a81d3f480db894da15bff72691a",
|
||||
"urn:cnb:builder:paketo-buildpacks/java"
|
||||
|
||||
@ -6,23 +6,27 @@ import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.EnableAspectJAutoProxy;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.lifecyclecommons.LifecycleAutoconfiguration;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.queue.MessagingConfiguration;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||
import com.knecon.fforesight.tracing.OpenTelemetryConfig;
|
||||
|
||||
import io.micrometer.core.aop.TimedAspect;
|
||||
import io.micrometer.core.instrument.MeterRegistry;
|
||||
|
||||
@EnableAsync
|
||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class, LifecycleAutoconfiguration.class})
|
||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class})
|
||||
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class, OpenTelemetryConfig.class})
|
||||
@EnableAspectJAutoProxy
|
||||
public class Application {
|
||||
|
||||
/**
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# you can list packages
|
||||
ghostscript
|
||||
ghostscript=9.55.0~dfsg1-0ubuntu5.9
|
||||
pkg-config
|
||||
zip
|
||||
unzip
|
||||
@ -11,6 +11,7 @@ libk5crypto3
|
||||
libkrb5support0
|
||||
libkeyutils1
|
||||
libkrb5-3
|
||||
libbrotli1
|
||||
|
||||
# or include links to specific .deb files
|
||||
# http://ftp.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb
|
||||
|
||||
@ -12,6 +12,9 @@ project.version: 1.0-SNAPSHOT
|
||||
server:
|
||||
port: 8080
|
||||
|
||||
lifecycle:
|
||||
base-package: com.knecon.fforesight.service.ocr
|
||||
|
||||
spring:
|
||||
application:
|
||||
name: ocr-service
|
||||
|
||||
@ -24,10 +24,10 @@ import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.initializer.PDFNetInitializer;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
@ -36,7 +36,7 @@ import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith({SpringExtension.class, MockitoExtension.class})
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import({AbstractTest.TestConfiguration.class, PDFNetInitializer.class})
|
||||
@Import({AbstractTest.TestConfiguration.class, NativeLibrariesInitializer.class})
|
||||
@AutoConfigureObservability
|
||||
public class AbstractTest {
|
||||
|
||||
|
||||
@ -50,9 +50,9 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@Test
|
||||
public void testOCRMetrics() {
|
||||
|
||||
testOCR("files/Watermark.pdf");
|
||||
testOCR("files/Watermark.pdf");
|
||||
testOCR("files/Watermark.pdf");
|
||||
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
|
||||
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
|
||||
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
|
||||
|
||||
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
|
||||
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
|
||||
@ -81,7 +81,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@Test
|
||||
public void testMergeImages() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("files/merge_images.pdf");
|
||||
String text = testOCR("files/syngenta/CustomerFiles/SinglePages/merge_images - Page241_18 Chlorothalonil RAR 08 Volume 3CA B 6a Oct 2017.pdf");
|
||||
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
|
||||
"Control",
|
||||
"mg/g day",
|
||||
@ -101,7 +101,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@Test
|
||||
public void testOCRWatermark() {
|
||||
|
||||
assertThat(testOCR("files/Watermark.pdf")).contains("syngenta");
|
||||
assertThat(testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf")).contains("syngenta");
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -15,3 +15,10 @@ management:
|
||||
health.enabled: true
|
||||
endpoints.web.exposure.include: prometheus, health, metrics
|
||||
metrics.export.prometheus.enabled: true
|
||||
tracing:
|
||||
enabled: ${TRACING_ENABLED:false}
|
||||
sampling:
|
||||
probability: ${TRACING_PROBABILITY:1.0}
|
||||
otlp:
|
||||
tracing:
|
||||
endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces}
|
||||
@ -0,0 +1 @@
|
||||
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605
|
||||
@ -0,0 +1 @@
|
||||
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930
|
||||
@ -1,5 +1,9 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
dir=${PWD##*/}
|
||||
|
||||
gradle assemble
|
||||
|
||||
# Get the current Git branch
|
||||
@ -11,5 +15,32 @@ commit_hash=$(git rev-parse --short=5 HEAD)
|
||||
# Combine branch and commit hash
|
||||
buildName="${USER}-${branch}-${commit_hash}"
|
||||
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName
|
||||
echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
|
||||
|
||||
newImageName="nexus.knecon.com:5001/ff/ocr-service-server:$buildName"
|
||||
|
||||
echo "full image name:"
|
||||
echo ${newImageName}
|
||||
echo ""
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
namespace=${1}
|
||||
deployment_name="ocr-service-v1"
|
||||
|
||||
echo "deploying to ${namespace}"
|
||||
|
||||
oldImageName=$(rancher kubectl -n ${namespace} get deployment ${deployment_name} -o=jsonpath='{.spec.template.spec.containers[*].image}')
|
||||
|
||||
if [ "${newImageName}" = "${oldImageName}" ]; then
|
||||
echo "Image tag did not change, redeploying..."
|
||||
rancher kubectl rollout restart deployment ${deployment_name} -n ${namespace}
|
||||
else
|
||||
echo "upgrading the image tag..."
|
||||
rancher kubectl set image deployment/${deployment_name} ${deployment_name}=${newImageName} -n ${namespace}
|
||||
fi
|
||||
rancher kubectl rollout status deployment ${deployment_name} -n ${namespace}
|
||||
echo "Built ${deployment_name}:${buildName} and deployed to ${namespace}"
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user