Compare commits

..

1 Commits

Author SHA1 Message Date
Dominique Eifländer
8924e905ad hotfix: Extend Tesseract instead of Tesseract1 2024-01-15 16:13:43 +01:00
44 changed files with 533 additions and 699 deletions

View File

@ -1,7 +1,3 @@
variables:
# SONAR_PROJECT_KEY: 'ocr-service:ocr-service-server'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include: include:
- project: 'gitlab/gitlab' - project: 'gitlab/gitlab'
ref: 'main' ref: 'main'

8
.gitmodules vendored
View File

@ -1,8 +0,0 @@
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
update = merge
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/basf"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/basf
url = https://gitlab.knecon.com/fforesight/documents/basf.git
update = merge

View File

@ -74,14 +74,12 @@ String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-ch
``` ```
## Integration ## Integration
The OCR-service communicates via RabbitMQ and uses the queues `ocr_request_queue`, `ocr_response_queue`, The OCR-service communicates via RabbitMQ and uses the queues `ocrQueue`, `ocrDLQ`, and `ocr_status_update_response_queue`.
`ocr_dead_letter_queue`, and `ocr_status_update_response_queue`.
### ocr_request_queue ### ocrQueue
This queue is used to start the OCR process; a DocumentRequest must be passed as a message. The service will then download the PDF from the provided cloud storage. This queue is used to start the OCR process; a DocumentRequest must be passed as a message. The service will then download the PDF from the provided cloud storage.
### ocr_response_queue
This queue is also used to signal the end of processing.
### ocr_dead_letter_queue
This queue is used to signal that an error has occurred during processing.
### ocr_status_update_response_queue ### ocr_status_update_response_queue
This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total number may change when fewer images are found than initially assumed. This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total number may change when fewer images are found than initially assumed.
This queue is also used to signal the end of processing.
### ocrDLQ
This queue is used to signal that an error has occurred during processing.

View File

@ -12,10 +12,6 @@ group = "com.knecon.fforesight.service"
java.sourceCompatibility = JavaVersion.VERSION_17 java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17 java.targetCompatibility = JavaVersion.VERSION_17
pmd {
isConsoleOutput = true
}
tasks.pmdMain { tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml") pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
} }

View File

@ -9,12 +9,12 @@
</description> </description>
<rule ref="category/java/errorprone.xml"> <rule ref="category/java/errorprone.xml">
<exclude name="DataflowAnomalyAnalysis"/>
<exclude name="MissingSerialVersionUID"/> <exclude name="MissingSerialVersionUID"/>
<exclude name="NullAssignment"/>
<exclude name="AvoidLiteralsInIfCondition"/> <exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/> <exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/> <exclude name="AvoidFieldNameMatchingMethodName"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule> </rule>
</ruleset> </ruleset>

View File

@ -10,13 +10,14 @@
<rule ref="category/java/errorprone.xml"> <rule ref="category/java/errorprone.xml">
<exclude name="DataflowAnomalyAnalysis"/>
<exclude name="MissingSerialVersionUID"/> <exclude name="MissingSerialVersionUID"/>
<exclude name="NullAssignment"/>
<exclude name="AvoidLiteralsInIfCondition"/> <exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/> <exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/> <exclude name="AvoidFieldNameMatchingMethodName"/>
<exclude name="AssignmentInOperand"/> <exclude name="AvoidFieldNameMatchingTypeName"/>
<exclude name="TestClassWithoutTestCases"/> <exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule> </rule>
</ruleset> </ruleset>

View File

@ -1,7 +1,7 @@
plugins { plugins {
`maven-publish` `maven-publish`
id("com.iqser.red.service.java-conventions") id("com.iqser.red.service.java-conventions")
id("io.freefair.lombok") version "8.4" id("io.freefair.lombok") version "8.2.2"
} }
publishing { publishing {

View File

@ -13,12 +13,5 @@ public class DocumentRequest {
protected String dossierId; protected String dossierId;
protected String fileId; protected String fileId;
protected boolean removeWatermark;
public DocumentRequest(String dossierId, String fileId) {
this.dossierId = dossierId;
this.fileId = fileId;
}
} }

View File

@ -15,6 +15,5 @@ public class OCRStatusUpdateResponse {
private int numberOfPagesToOCR; private int numberOfPagesToOCR;
private int numberOfOCRedPages; private int numberOfOCRedPages;
private boolean ocrFinished; private boolean ocrFinished;
private boolean ocrStarted;
} }

View File

@ -1,6 +1,6 @@
plugins { plugins {
id("com.iqser.red.service.java-conventions") id("com.iqser.red.service.java-conventions")
id("io.freefair.lombok") version "8.4" id("io.freefair.lombok") version "8.2.2"
} }
configurations { configurations {
@ -14,8 +14,7 @@ dependencies {
api("net.sourceforge.tess4j:tess4j:5.8.0") api("net.sourceforge.tess4j:tess4j:5.8.0")
api("com.iqser.red.commons:metric-commons:2.1.0") api("com.iqser.red.commons:metric-commons:2.1.0")
api("com.iqser.red.commons:storage-commons:2.45.0") api("com.iqser.red.commons:storage-commons:2.45.0")
api("com.knecon.fforesight:tenant-commons:0.21.0") api("com.knecon.fforesight:tenant-commons:0.19.0")
api("com.knecon.fforesight:lifecycle-commons:0.6.0")
api("com.pdftron:PDFNet:10.5.0") api("com.pdftron:PDFNet:10.5.0")
api("org.apache.pdfbox:pdfbox:3.0.0") api("org.apache.pdfbox:pdfbox:3.0.0")
api("org.apache.pdfbox:jbig2-imageio:3.0.4") api("org.apache.pdfbox:jbig2-imageio:3.0.4")
@ -25,7 +24,6 @@ dependencies {
api("io.github.karols:hocr4j:0.2.0") api("io.github.karols:hocr4j:0.2.0")
api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre") api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0") api("com.iqser.red.commons:pdftron-logic-commons:2.20.0")
api("com.knecon.fforesight:viewer-doc-processor:0.125.0")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
} }

View File

@ -1,26 +1,14 @@
package com.knecon.fforesight.service.ocr.processor; package com.knecon.fforesight.service.ocr.processor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.context.properties.EnableConfigurationProperties; import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
@Configuration @Configuration
@ComponentScan @ComponentScan
@EnableConfigurationProperties(OcrServiceSettings.class) @EnableConfigurationProperties(OcrServiceSettings.class)
public class OcrServiceProcessorConfiguration { public class OcrServiceProcessorConfiguration {
@Bean
@Autowired
public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
return new ViewerDocumentService(registry);
}
} }

View File

@ -1,20 +1,17 @@
package com.knecon.fforesight.service.ocr.processor.initializer; package com.knecon.fforesight.service.ocr.processor.initializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.pdftron.pdf.PDFNet; import com.pdftron.pdf.PDFNet;
import com.sun.jna.NativeLibrary; import com.sun.jna.NativeLibrary;
import jakarta.annotation.PostConstruct; import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
@Slf4j
@Component @Component
@RequiredArgsConstructor @RequiredArgsConstructor
public class NativeLibrariesInitializer { public class PDFNetInitializer {
@Value("${pdftron.license:}") @Value("${pdftron.license:}")
private String pdftronLicense; private String pdftronLicense;
@ -25,25 +22,8 @@ public class NativeLibrariesInitializer {
// Do not change this back to an application runner: an application runner takes messages from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError. // Do not change this back to an application runner: an application runner takes messages from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
public void init() { public void init() {
log.info("Initializing Native Libraries");
log.info("Setting pdftron license: {}", pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron"); PDFNet.setTempPath("/tmp/pdftron");
PDFNet.initialize(pdftronLicense); PDFNet.initialize(pdftronLicense);
log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB"));
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB")); System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
log.info("Asserting Native Libraries loaded");
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
assert leptonicaLib != null;
log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
}
try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) {
assert tesseractLib != null;
log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath());
}
} }
} }

View File

@ -25,18 +25,11 @@ public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, Qu
.collect(Collectors.toMap(Map.Entry::getKey, .collect(Collectors.toMap(Map.Entry::getKey,
entry -> entry.getValue() entry -> entry.getValue()
.stream() .stream()
.map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace())) .map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords()
.stream()
.filter(word -> !word.isBlank())
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
.toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
.toList())); .toList()));
} }
private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
return ocrResult.getAllWords()
.stream()
.filter(word -> !word.isBlank())
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
.toList();
}
} }

View File

@ -1,42 +1,12 @@
package com.knecon.fforesight.service.ocr.processor.model; package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) { public record PageInformation(int height, int width, int number, int rotationDegrees) {
public static PageInformation fromPDPage(int pageNum, PDPage page) { public static PageInformation fromPDPage(int pageNum, PDPage page) {
PDRectangle mediaBox = page.getMediaBox(); return new PageInformation((int) page.getMediaBox().getHeight(), (int) page.getMediaBox().getWidth(), pageNum, page.getRotation());
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
pageNum,
page.getRotation());
}
public double height() {
return mediabox.getHeight();
}
public double width() {
return mediabox.getWidth();
}
public double minX() {
return mediabox.getX();
}
public double minY() {
return mediabox.getY();
} }
} }

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.ocr.processor.model; package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.Rectangle;
import java.awt.geom.AffineTransform; import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D; import java.awt.geom.Line2D;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
@ -35,16 +34,6 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
new Point2D.Double(bounds.getRight(), bounds.getBottom())); new Point2D.Double(bounds.getRight(), bounds.getBottom()));
} }
public Rectangle2D getBounds2D() {
double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX());
double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY());
double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX());
double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY());
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
}
public QuadPoint getTransformed(AffineTransform at) { public QuadPoint getTransformed(AffineTransform at) {

View File

@ -1,12 +1,20 @@
package com.knecon.fforesight.service.ocr.processor.model; package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform; import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.Getter; import lombok.Getter;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
@Getter @Getter
@RequiredArgsConstructor @RequiredArgsConstructor

View File

@ -1,11 +1,13 @@
package com.knecon.fforesight.service.ocr.processor.service; package com.knecon.fforesight.service.ocr.processor.service;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.StandardCopyOption; import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import org.apache.commons.io.IOUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
@ -29,38 +31,47 @@ public class FileStorageService {
return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension(); return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
} }
@SneakyThrows
public byte[] getOriginalFile(String dossierId, String fileId) {
try (InputStream inputStream = getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN))) {
return IOUtils.toByteArray(inputStream);
}
}
@SneakyThrows
public InputStream getOriginalFileAsStream(String dossierId, String fileId) {
return getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN));
}
public void storeOriginalFile(String dossierId, String fileId, InputStream stream) {
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), stream);
}
public boolean untouchedFileExists(String dossierId, String fileId) { public boolean untouchedFileExists(String dossierId, String fileId) {
return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED)); return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED));
} }
@SneakyThrows
public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {
try (var in = new FileInputStream(documentFile)) { public void storeUntouchedFile(String dossierId, String fileId, byte[] data) {
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), in);
} storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data));
try (var in = new FileInputStream(viewerDocumentFile)) {
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), in);
}
} }
@SneakyThrows @SneakyThrows
public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) { private InputStream getInputStream(String storageId) {
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile); File tempFile = File.createTempFile("temp", ".data");
if (storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT))) { storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile); return Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE);
} else {
Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
}
if (!untouchedFileExists(dossierId, fileId)) {
try (var in = new FileInputStream(documentFile)) {
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), in);
}
}
} }
} }

View File

@ -7,10 +7,7 @@ public interface IOcrMessageSender {
void sendUpdate(String fileId, int finishedImages, int totalImages); void sendUpdate(String fileId, int finishedImages, int totalImages);
void sendOCRStarted(String fileId);
void sendOcrFinished(String fileId, int totalImages); void sendOcrFinished(String fileId, int totalImages);
void sendOcrResponse(String dossierId, String fileId);
} }

View File

@ -1,12 +1,12 @@
package com.knecon.fforesight.service.ocr.processor.service; package com.knecon.fforesight.service.ocr.processor.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.IOException;
import java.io.FileOutputStream; import java.io.InputStream;
import java.nio.file.Files; import java.io.OutputStream;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -27,10 +27,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector; import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread; import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.pdftron.pdf.PDFDoc;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@ -51,7 +48,6 @@ public class OCRService {
OcrResultWriter ocrResultWriter; OcrResultWriter ocrResultWriter;
GhostScriptService ghostScriptService; GhostScriptService ghostScriptService;
FontStyleDetector boldDetector; FontStyleDetector boldDetector;
ObservationRegistry registry;
/** /**
@ -59,66 +55,54 @@ public class OCRService {
* looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead), * looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
* perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements. * perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
* *
* @param dossierId Id of dossier * @param dossierId Id of dossier
* @param fileId Id of file * @param fileId Id of file
* @param tmpDir working directory for all files * @param out OutputStream where to write to
* @param documentFile the file to perform ocr on, results are written invisibly
* @param viewerDocumentFile debugging file, results are written visibly in an optional content group
*/ */
@Observed(name = "OCRService", contextualName = "run-ocr-on-document")
@SneakyThrows @SneakyThrows
public void runOcrOnDocument(String dossierId, String fileId, boolean removeWatermark, Path tmpDir, File documentFile, File viewerDocumentFile) { public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) {
if (removeWatermark) { try (InputStream fileStream = removeWatermarkIfEnabled(dossierId, fileId); ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
removeWatermarkIfEnabled(documentFile);
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
log.info("Starting OCR for file {}", fileId);
long ocrStart = System.currentTimeMillis();
Statistics stats = runOcr(transferInputStream, out, fileId, dossierId);
long ocrEnd = System.currentTimeMillis();
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
log.info("Runtime breakdown: {}", stats);
}
} }
removeInvisibleElements(documentFile); }
log.info("Starting OCR for file {}", fileId);
long ocrStart = System.currentTimeMillis();
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId);
long ocrEnd = System.currentTimeMillis();
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
log.info("Runtime breakdown: {}", stats);
private InputStream removeWatermarkIfEnabled(String dossierId, String fileId) throws IOException {
if (settings.isRemoveWatermark()) {
try (var in = fileStorageService.getOriginalFileAsStream(dossierId, fileId); var transferOutputStream = new ByteArrayOutputStream()) {
watermarkRemovalService.removeWatermarks(in, transferOutputStream);
return new ByteArrayInputStream(transferOutputStream.toByteArray());
}
}
return fileStorageService.getOriginalFileAsStream(dossierId, fileId);
} }
@SneakyThrows @SneakyThrows
private void removeInvisibleElements(File originFile) { public Statistics runOcr(InputStream in, OutputStream out, String fileId, String dossierId) {
Path tmpFile = Files.createTempFile("invisibleElements", ".pdf");
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
invisibleElementRemovalService.removeInvisibleElements(in, out, false, false);
}
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
assert tmpFile.toFile().delete();
}
@SneakyThrows
private void removeWatermarkIfEnabled(File originFile) {
Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf");
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
watermarkRemovalService.removeWatermarks(in, out);
}
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
assert tmpFile.toFile().delete();
}
@SneakyThrows
public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) {
long timestamp; long timestamp;
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId);
Path tmpImageDir = tmpDir.resolve("images"); Path tmpImageDir = tmpDir.resolve("images");
Path tesseractOutputDir = tmpDir.resolve("tesseract_output"); Path tesseractOutputDir = tmpDir.resolve("tesseract_output");
tesseractOutputDir.toFile().mkdirs(); tesseractOutputDir.toFile().mkdirs();
tmpImageDir.toFile().mkdirs(); tmpImageDir.toFile().mkdirs();
File documentFile = OsUtils.writeFileToTmpFolder(in, tmpDir);
Statistics stats; Statistics stats;
try (PDDocument document = Loader.loadPDF(documentFile)) { try (PDDocument document = Loader.loadPDF(documentFile)) {
OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId); OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);
@ -162,11 +146,12 @@ public class OCRService {
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp); stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
timestamp = System.currentTimeMillis(); timestamp = System.currentTimeMillis();
ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage); var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage);
log.info("Saving document"); log.info("Saving document");
document.saveIncremental(out, dictionariesToUpdate);
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp); stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
FileSystemUtils.deleteRecursively(tmpDir);
logger.sendFinished(); logger.sendFinished();
return stats; return stats;
} }

View File

@ -1,38 +1,29 @@
package com.knecon.fforesight.service.ocr.processor.service; package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Color; import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashSet;
import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Set;
import java.util.function.Function;
import java.util.stream.Stream;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle; import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.TextExtractor;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -46,179 +37,180 @@ import lombok.extern.slf4j.Slf4j;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrResultWriter { public class OcrResultWriter {
public static final Color REGULAR_TEXT_COLOR = Color.BLUE; static String ocrLayerName = "knecon OCR";
public static final Color BOLD_TEXT_COLOR = Color.CYAN; OcrServiceSettings settings;
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED;
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED;
ViewerDocumentService viewerDocumentService;
@SneakyThrows @SneakyThrows
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) { public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = new HashMap<>(); Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = new HashMap<>(); imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate));
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = new HashMap<>(); dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
return dictionariesToUpdate;
try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) {
for (Integer pageNumber : imagesWithResultsPerPage.keySet()) {
List<Rectangle2D> textBBoxes = getTextBBoxes(doc.getPage(pageNumber));
ocrVisualizationsOnPages.put(pageNumber - 1, createVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
ocrTextDebugVisualizationsOnPages.put(pageNumber - 1, createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
ocrBBoxDebugVisualizationsOnPages.put(pageNumber - 1, createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)));
}
}
Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
List<Visualizations> debugVisualizations = List.of(visualizations,
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
viewerDocumentService.addVisualizationsOnPage(document, document, List.of(visualizations));
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
}
/**
 * Collects one bounding box per word that the text extractor reports for the given page.
 * <p>
 * Walking the lines and words is best effort: if it fails, a warning is logged and the boxes
 * gathered up to that point are returned. A failure of {@code begin(page)} is not caught here.
 *
 * @param page the page whose already existing text is inspected
 * @return the bounding boxes of the words found on the page, possibly empty
 */
@SuppressWarnings("PMD")
private List<Rectangle2D> getTextBBoxes(Page page) {
    List<Rectangle2D> wordBoxes = new ArrayList<>();
    try (var extractor = new TextExtractor()) {
        extractor.begin(page);
        try {
            TextExtractor.Line currentLine = extractor.getFirstLine();
            while (currentLine.isValid()) {
                TextExtractor.Word currentWord = currentLine.getFirstWord();
                while (currentWord.isValid()) {
                    wordBoxes.add(Converter.toRectangle2D(currentWord.getBBox()));
                    // advancing also closes the word that was just consumed
                    currentWord = getNextWord(currentWord);
                }
                // advancing also closes the line that was just consumed
                currentLine = getNextLine(currentLine);
            }
        } catch (Exception e) {
            log.warn("Could not get word dimension, {}", e.getMessage());
        }
        return wordBoxes;
    }
}
/**
 * Moves on to the word following {@code word} and closes {@code word}.
 *
 * @param word the current word; it is closed by this call
 * @return whatever {@code word.getNextWord()} returned
 */
private static TextExtractor.Word getNextWord(TextExtractor.Word word) {
    // Ask for the successor first, while the current word is still open.
    final TextExtractor.Word following = word.getNextWord();
    word.close();
    return following;
}
/**
 * Moves on to the line following {@code line} and closes {@code line}.
 *
 * @param line the current line; it is closed by this call
 * @return whatever {@code line.getNextLine()} returned
 */
private static TextExtractor.Line getNextLine(TextExtractor.Line line) {
    // Ask for the successor first, while the current line is still open.
    final TextExtractor.Line following = line.getNextLine();
    line.close();
    return following;
}
/**
 * Builds the OCR text visualization for one page.
 * <p>
 * Every recognized word whose transformed bounding box does not intersect any ignore zone is turned into a
 * black {@link PlacedText} with rendering mode {@code NEITHER}. Words touching an ignore zone are dropped.
 *
 * @param ocrResultsToWrite the OCR results of the page
 * @param ignoreZones       areas in which no OCR text is placed
 * @return the visualizations holding the placed texts
 */
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> ignoreZones) {
    List<PlacedText> ocrTexts = ocrResultsToWrite.stream()
            .flatMap(result -> result.textPositionInImage().stream())
            // keep only words that stay clear of every ignore zone
            .filter(word -> !ignoreZones.stream()
                    .anyMatch(zone -> word.getTransformedTextBBox().getBounds2D().intersects(zone)))
            .map(word -> new PlacedText(word.getText(),
                    null,
                    Color.BLACK,
                    (float) word.getFontSize(),
                    word.getFontMetricsFactory(),
                    Optional.of(word.getTextMatrix()),
                    Optional.of(RenderingMode.NEITHER)))
            .toList();
    return VisualizationsOnPage.builder().placedTexts(ocrTexts).build();
}
/**
 * Builds the visible debug rendering of the OCR text for one page.
 * <p>
 * Words are split into those that intersect an existing text bounding box and those that do not. Both groups
 * are rendered with rendering mode {@code FILL}, but with different colors, additionally distinguished by
 * whether the font style is {@code REGULAR}. The result lists the non-intersecting words first, followed by
 * the intersecting ones.
 *
 * @param ocrResultsToWrite the OCR results of the page
 * @param textBBoxes        bounding boxes of text that is already present on the page
 * @return the visualizations holding the placed debug texts
 */
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {
    List<TextPositionInImage> freeWords = new ArrayList<>();
    List<TextPositionInImage> overlappingWords = new ArrayList<>();
    ocrResultsToWrite.stream()
            .flatMap(result -> result.textPositionInImage().stream())
            .forEach(word -> {
                boolean overlapsExistingText = textBBoxes.stream()
                        .anyMatch(bbox -> word.getTransformedTextBBox().getBounds2D().intersects(bbox));
                (overlapsExistingText ? overlappingWords : freeWords).add(word);
            });

    Stream<PlacedText> freeTexts = freeWords.stream()
            .map(word -> new PlacedText(word.getText(),
                    null,
                    word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_COLOR : BOLD_TEXT_COLOR,
                    (float) word.getFontSize(),
                    word.getFontMetricsFactory(),
                    Optional.of(word.getTextMatrix()),
                    Optional.of(RenderingMode.FILL)));
    Stream<PlacedText> overlappingTexts = overlappingWords.stream()
            .map(word -> new PlacedText(word.getText(),
                    null,
                    word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_IN_IGNORE_ZONE : BOLD_TEXT_IN_IGNORE_ZONE,
                    (float) word.getFontSize(),
                    word.getFontMetricsFactory(),
                    Optional.of(word.getTextMatrix()),
                    Optional.of(RenderingMode.FILL)));

    List<PlacedText> debugTexts = Stream.concat(freeTexts, overlappingTexts).toList();
    return VisualizationsOnPage.builder().placedTexts(debugTexts).build();
}
/**
 * Builds the bounding-box debug rendering for one page.
 * <p>
 * The result contains the outline of every recognized word's transformed bounding box, followed by one grid per
 * OCR result drawn over that result's image bounding box.
 *
 * @param ocrResultsToWrite the OCR results of the page
 * @return the visualizations holding the colored lines
 */
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
    Stream<List<ColoredLine>> wordOutlines = ocrResultsToWrite.stream()
            .flatMap(result -> result.textPositionInImage().stream())
            .map(word -> quadPointAsLines(word.getTransformedTextBBox()));
    Stream<List<ColoredLine>> imageGrids = ocrResultsToWrite.stream()
            .map(result -> createGrid(result.imageBoundingBox()));

    // word outlines first, image grids afterwards
    List<ColoredLine> allLines = Stream.concat(wordOutlines, imageGrids)
            .flatMap(List::stream)
            .toList();
    return VisualizationsOnPage.builder().coloredLines(allLines).build();
}
/**
 * Converts a quadrilateral into its four edges, each with line width 1 and its own color:
 * a-b orange, b-c blue, c-d green and d-a magenta.
 *
 * @param rect the quadrilateral with the corners a, b, c and d
 * @return the four edges in the order a-b, b-c, c-d, d-a
 */
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
} }
@SneakyThrows @SneakyThrows
private List<ColoredLine> createGrid(QuadPoint rect) { private void drawResultsPerPage(PDDocument document, Integer pageNumber, List<OcrResultToWrite> ocrResultToWrite, Set<COSDictionary> dictionariesToUpdate) {
List<ColoredLine> lines = new LinkedList<>(quadPointAsLines(rect)); var pdPage = document.getPage(pageNumber - 1);
PDOptionalContentGroup textDebugLayer = new PDOptionalContentGroup(ocrLayerName);
PDOptionalContentGroup bBoxDebugLayer = new PDOptionalContentGroup(ocrLayerName + "BBox");
if (settings.isDebug()) {
textDebugLayer = addOptionalGroup(ocrLayerName, document, pdPage, dictionariesToUpdate);
bBoxDebugLayer = addOptionalGroup(ocrLayerName + " BBox", document, pdPage, dictionariesToUpdate);
}
escapeContentStreams(document, pdPage);
List<TextPositionInImage> words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
// write invisible ocr text inside tagged content
contentStream.beginMarkedContent(settings.getOcrMarkedContentTag());
contentStream.saveGraphicsState();
contentStream.setNonStrokingColor(Color.BLUE);
contentStream.setStrokingColor(Color.BLUE);
contentStream.setLineWidth(1);
words.forEach(word -> drawInvisibleWord(word, contentStream));
contentStream.restoreGraphicsState();
contentStream.endMarkedContent();
if (settings.isDebug()) { // must not be written, as it will interfere with layout parsing
// write visible ocr text inside optional group
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
contentStream.saveGraphicsState();
words.forEach(word -> drawVisibleWord(word, contentStream));
contentStream.restoreGraphicsState();
contentStream.endMarkedContent();
// write word bounding boxes (tesseract output) inside optional group
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
contentStream.saveGraphicsState();
ocrResultToWrite.stream()
.map(OcrResultToWrite::imageBoundingBox)
.forEach(imagePosition -> drawGrid(contentStream, imagePosition));
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
contentStream.restoreGraphicsState();
contentStream.endMarkedContent();
}
}
dictionariesToUpdate.add(pdPage.getCOSObject());
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
}
/**
 * Wraps the existing page content into a save/restore pair of the graphics state.
 * <p>
 * OCR content has to be appended to the page, otherwise images could cover it. To keep earlier content
 * streams from influencing the appended content (for example through matrix transformations that were never
 * undone), a "save graphics state" is prepended and the matching "restore graphics state" is appended.
 *
 * @param document the document the page belongs to
 * @param pdPage   the page whose existing content is wrapped
 */
@SneakyThrows
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
    // placed in front of everything already on the page
    try (PDPageContentStream leading = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
        leading.saveGraphicsState();
    }
    // placed behind everything already on the page
    try (PDPageContentStream trailing = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
        trailing.restoreGraphicsState();
    }
}
/**
 * Returns the optional content group (layer) with the given name, creating and registering it if needed.
 * <p>
 * Missing optional content properties are created on the document catalog, and a page without resources
 * receives an empty resources dictionary. The layer is enabled exactly when the debug setting is on. The
 * catalog dictionary is recorded in {@code dictionariesToUpdate}.
 *
 * @param ocrLayerName         name of the layer to look up or create
 * @param document             the document that owns the layer
 * @param pdPage               the page that will reference the layer
 * @param dictionariesToUpdate collects the dictionaries modified by this call
 * @return the existing or newly created layer
 */
private PDOptionalContentGroup addOptionalGroup(String ocrLayerName, PDDocument document, PDPage pdPage, Set<COSDictionary> dictionariesToUpdate) {
    PDDocumentCatalog documentCatalog = document.getDocumentCatalog();
    PDOptionalContentProperties contentProperties = documentCatalog.getOCProperties();
    if (contentProperties == null) {
        contentProperties = new PDOptionalContentProperties();
        documentCatalog.setOCProperties(contentProperties);
    }

    final PDOptionalContentGroup group;
    if (!contentProperties.hasGroup(ocrLayerName)) {
        group = new PDOptionalContentGroup(ocrLayerName);
        contentProperties.addGroup(group);
    } else {
        group = contentProperties.getGroup(ocrLayerName);
    }
    // enable debug layers by default only when DEBUG flag is set.
    contentProperties.setGroupEnabled(group, settings.isDebug());

    if (pdPage.getResources() == null) {
        pdPage.setResources(new PDResources());
    }

    dictionariesToUpdate.add(documentCatalog.getCOSObject());
    return group;
}
/**
 * Strokes the outline of a quadrilateral, one edge at a time and each edge in its own color:
 * a-b orange, b-c blue, c-d green and d-a magenta. The graphics state is saved before and restored after.
 *
 * @param contentStream the stream to draw into
 * @param rect          the quadrilateral with the corners a, b, c and d
 */
@SneakyThrows
private void drawRectangle(PDPageContentStream contentStream, QuadPoint rect) {
    contentStream.saveGraphicsState();
    contentStream.setLineWidth(1);

    Point2D[] corners = {rect.a(), rect.b(), rect.c(), rect.d()};
    Color[] edgeColors = {Color.ORANGE, Color.BLUE, Color.GREEN, Color.MAGENTA};
    for (int edge = 0; edge < corners.length; edge++) {
        Point2D from = corners[edge];
        Point2D to = corners[(edge + 1) % corners.length]; // the last edge leads back to corner a
        contentStream.moveTo((float) from.getX(), (float) from.getY());
        contentStream.lineTo((float) to.getX(), (float) to.getY());
        contentStream.setStrokingColor(edgeColors[edge]);
        contentStream.stroke();
    }

    contentStream.restoreGraphicsState();
}
/**
 * Writes a word with rendering mode {@code NEITHER}.
 *
 * @param word          the word to write
 * @param contentStream the stream to write into
 */
private void drawInvisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
    final RenderingMode invisible = RenderingMode.NEITHER;
    drawWord(word, contentStream, invisible);
}
/**
 * Writes a word with rendering mode {@code FILL}.
 *
 * @param word          the word to write
 * @param contentStream the stream to write into
 */
private void drawVisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
    final RenderingMode filled = RenderingMode.FILL;
    drawWord(word, contentStream, filled);
}
/**
 * Writes a single word into the content stream using the word's font, font size and text matrix.
 * <p>
 * The fill color depends on the font style: bold is red, italic is green, everything else is blue.
 * Writing is best effort: a failure is logged and does not abort the remaining words.
 * <p>
 * Once {@code beginText()} has succeeded the text object is always closed again, even when a later call
 * (for example {@code showText} on a character the font cannot encode) fails. Without this the stream would
 * stay in text mode and the next {@code beginText()} would be rejected as a nested call, so one bad word
 * would make every following word on the page fail as well.
 *
 * @param position      the word to write, including its font, size and placement
 * @param contentStream the stream to write into
 * @param renderingMode the text rendering mode to use
 */
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
    boolean textObjectOpen = false;
    try {
        contentStream.setNonStrokingColor(switch (position.getFontStyle()) {
            case BOLD -> Color.RED;
            case ITALIC -> Color.GREEN;
            default -> Color.BLUE;
        });
        contentStream.beginText();
        textObjectOpen = true;
        contentStream.setRenderingMode(renderingMode);
        contentStream.setFont(position.getFont(), (float) position.getFontSize());
        contentStream.setTextMatrix(position.getTextMatrix());
        contentStream.showText(position.getText());
    } catch (Exception e) {
        // pass the exception itself so the stack trace is not lost
        log.error("Failed to write text {}", position.getText(), e);
    } finally {
        if (textObjectOpen) {
            try {
                contentStream.endText();
            } catch (Exception e) {
                log.error("Failed to close text object for text {}", position.getText(), e);
            }
        }
    }
}
@SneakyThrows
private void drawGrid(PDPageContentStream contentStream, QuadPoint rect) {
drawRectangle(contentStream, rect);
contentStream.saveGraphicsState();
contentStream.setStrokingColor(Color.BLACK);
contentStream.setLineWidth(0.2F);
int nRows = 8; int nRows = 8;
int nCols = 8; int nCols = 8;
@ -226,7 +218,7 @@ public class OcrResultWriter {
Point2D start = add(rect.a(), abStep); Point2D start = add(rect.a(), abStep);
Point2D end = add(rect.d(), abStep); Point2D end = add(rect.d(), abStep);
for (int row = 0; row < nRows; ++row) { for (int row = 0; row < nRows; ++row) {
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f)); drawLine(start, end, contentStream);
start = add(start, abStep); start = add(start, abStep);
end = add(end, abStep); end = add(end, abStep);
} }
@ -234,12 +226,21 @@ public class OcrResultWriter {
start = add(rect.a(), adStep); start = add(rect.a(), adStep);
end = add(rect.b(), adStep); end = add(rect.b(), adStep);
for (int col = 0; col < nCols; ++col) { for (int col = 0; col < nCols; ++col) {
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f)); drawLine(start, end, contentStream);
start = add(start, adStep); start = add(start, adStep);
end = add(end, adStep); end = add(end, adStep);
} }
contentStream.restoreGraphicsState();
return lines; }
/**
 * Strokes a straight line from {@code a} to {@code b} using the line width and stroking color
 * currently set on the content stream.
 *
 * @param a             start point of the line
 * @param b             end point of the line
 * @param contentStream the stream to draw into
 */
@SneakyThrows
private void drawLine(Point2D a, Point2D b, PDPageContentStream contentStream) {
contentStream.moveTo((float) a.getX(), (float) a.getY());
contentStream.lineTo((float) b.getX(), (float) b.getY());
contentStream.stroke();
} }

View File

@ -4,12 +4,11 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics; import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent; import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
public interface FontMetricsFactory extends EmbeddableFont { public interface FontMetricsFactory {
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) { default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {

View File

@ -1,6 +1,8 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts; package com.knecon.fforesight.service.ocr.processor.service.fonts;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.util.Collections;
import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.fontbox.ttf.GlyphData; import org.apache.fontbox.ttf.GlyphData;
@ -13,63 +15,45 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent; import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import software.amazon.awssdk.services.s3.endpoints.internal.Value;
@Slf4j @Slf4j
@RequiredArgsConstructor @RequiredArgsConstructor
@AllArgsConstructor
public class Type0FontMetricsFactory implements FontMetricsFactory { public class Type0FontMetricsFactory implements FontMetricsFactory {
private final String resourcePath; private final PDType0Font type0Font;
private PDType0Font type0Font; private final TrueTypeFont trueTypeFont;
private TrueTypeFont trueTypeFont;
private PDDocument documentThisIsEmbeddedIn;
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent. // for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
private static final Set<Integer> slashGlyphIds = Set.of(18, 63); private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
@SneakyThrows
public static Type0FontMetricsFactory regular(PDDocument document) { public static Type0FontMetricsFactory regular(PDDocument document) {
String resourcePath = "fonts/cmu-regular.ttf"; return createFromResource("fonts/cmu-regular.ttf", document);
return createFromResourcePath(resourcePath, document);
} }
@SneakyThrows
public static Type0FontMetricsFactory bold(PDDocument document) { public static Type0FontMetricsFactory bold(PDDocument document) {
String resourcePath = "fonts/cmu-bold.ttf"; return createFromResource("fonts/cmu-bold.ttf", document);
return createFromResourcePath(resourcePath, document);
} }
@SneakyThrows @SneakyThrows
@SuppressWarnings("PMD.CloseResource") private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
private static TrueTypeFont readFromResourcePath(String resourcePath) {
// The ttf is closed with the document, see PDType0Font line 134
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) { try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
return new TTFParser().parse(buffer); TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
return new Type0FontMetricsFactory(type0Font, trueTypeFont);
} }
} }
@SneakyThrows
@SuppressWarnings("PMD.CloseResource")
private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {
TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
// since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support)
}
@SneakyThrows @SneakyThrows
public HeightAndDescent calculateHeightAndDescent(String text) { public HeightAndDescent calculateHeightAndDescent(String text) {
@ -113,28 +97,4 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
return type0Font; return type0Font;
} }
@Override
@SneakyThrows
public PDFont embed(PDDocument document) {
if (documentThisIsEmbeddedIn.equals(document)) {
return getFont();
}
// no need to close, the font will be closed with the document it is embedded in
this.trueTypeFont = readFromResourcePath(resourcePath);
this.type0Font = PDType0Font.load(document, trueTypeFont, true);
this.documentThisIsEmbeddedIn = document;
return getFont();
}
@SneakyThrows
public void close() {
trueTypeFont.close();
}
} }

View File

@ -49,11 +49,11 @@ public class FontStyleDetector {
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>). * (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not. * We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
* <p> * <p>
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation. * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
* But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height. * But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math. * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case. * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me. * It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
*/ */
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) { public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {

View File

@ -109,6 +109,7 @@ public class GhostScriptOutputHandler extends Thread {
if (imageFile == null) { if (imageFile == null) {
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet())); throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
} }
assert new File(imageFile.absoluteFilePath()).isFile();
renderedPageImageFileOutput.add(imageFile); renderedPageImageFileOutput.add(imageFile);
} }

View File

@ -45,7 +45,7 @@ public class ImageProcessingThread extends Thread {
final BlockingQueue<UnprocessedImage> imageInputQueue; final BlockingQueue<UnprocessedImage> imageInputQueue;
final BlockingQueue<OcrImage> imageOutputQueue; final BlockingQueue<OcrImage> imageOutputQueue;
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle(); final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.0f, 1); final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.2f, 1);
final Statistics stats; final Statistics stats;
final OcrServiceSettings settings; final OcrServiceSettings settings;
final PDDocument document; final PDDocument document;
@ -107,7 +107,6 @@ public class ImageProcessingThread extends Thread {
} }
@SuppressWarnings("PMD.CompareObjectsWithEquals")
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) { private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi()); Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
@ -128,7 +127,7 @@ public class ImageProcessingThread extends Thread {
return ocrImage; return ocrImage;
} }
@SuppressWarnings("PMD.CompareObjectsWithEquals")
private OcrImage processExtractedImage(ExtractedImage extractedImage) { private OcrImage processExtractedImage(ExtractedImage extractedImage) {
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72)); float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
@ -199,10 +198,8 @@ public class ImageProcessingThread extends Thread {
grayScale = pix; grayScale = pix;
} else if (pix.d == 32) { } else if (pix.d == 32) {
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix); grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
LeptUtils.disposePix(pix);
} else if (pix.d == 1) { } else if (pix.d == 1) {
grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255); grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
LeptUtils.disposePix(pix);
} else { } else {
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d)); throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
} }
@ -211,27 +208,29 @@ public class ImageProcessingThread extends Thread {
float targetFactor = targetDpi / imageDpi; float targetFactor = targetDpi / imageDpi;
if (targetFactor > 2.1) { if (targetFactor > 2.1) {
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale); scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
LeptUtils.disposePix(grayScale);
} else if (targetFactor > 1.1) { } else if (targetFactor > 1.1) {
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale); scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
LeptUtils.disposePix(grayScale);
} else { } else {
scaledUp = grayScale; scaledUp = grayScale;
} }
// remove noise and prep for Otsu // remove noise and prep for Otsu
gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1); gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
LeptUtils.disposePix(scaledUp);
// Threshold to binary // Threshold to binary
if (pix.w < 100 || pix.h < 100) { if (pix.w < 100 || pix.h < 100) {
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170); binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
} else { } else {
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null); binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null);
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170); binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
} }
} }
LeptUtils.disposePix(pix);
LeptUtils.disposePix(grayScale);
LeptUtils.disposePix(scaledUp);
LeptUtils.disposePix(gaussian); LeptUtils.disposePix(gaussian);
return binarized; return binarized;

View File

@ -21,8 +21,10 @@ public class OcrServiceSettings {
int minImageWidth = 20; // Minimum width for images to be processed int minImageWidth = 20; // Minimum width for images to be processed
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages. float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean removeWatermark; // If true, watermarks will be removed
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR"); COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
boolean boldDetection = true; // if true, bold detection will be attempted boolean boldDetection = true; // if true, bold detection will be attempted
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold. double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
} }

View File

@ -17,57 +17,58 @@ public class PdfDraw {
@SneakyThrows @SneakyThrows
public static void drawGrid(ElementWriter writer, Page page) { public static void drawGrid(ElementWriter writer, Page page) {
try (var eb = new ElementBuilder()) { ElementBuilder eb = new ElementBuilder();
double dX = 15; double dX = 15;
double dY = 15; double dY = 15;
int nRows = (int) (page.getPageHeight() / dY) + 1; int nRows = (int) (page.getPageHeight() / dY) + 1;
int nCols = (int) (page.getPageWidth() / dX) + 1; int nCols = (int) (page.getPageWidth() / dX) + 1;
for (int row = 0; row < nRows; ++row) { for (int row = 0; row < nRows; ++row) {
for (int col = 0; col < nCols; ++col) { for (int col = 0; col < nCols; ++col) {
Element cell = eb.createRect(col * dX, row * dY, dX, dY); Element cell = eb.createRect(col * dX, row * dY, dX, dY);
cell.setPathStroke(true); cell.setPathStroke(true);
cell.getGState().setLineWidth(1); cell.getGState().setLineWidth(1);
cell.getGState().setStrokeOpacity(0.1); cell.getGState().setStrokeOpacity(0.1);
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
if (row == 0 && col == 0) { if (row == 0 && col == 0) {
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1)); cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
cell.setPathFill(true); cell.setPathFill(true);
cell.getGState().setFillOpacity(0.8); cell.getGState().setFillOpacity(0.8);
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
cell.getGState().setFillColor(new ColorPt(0, 0, 1)); cell.getGState().setFillColor(new ColorPt(0, 0, 1));
} else { } else {
cell.setPathFill(false); cell.setPathFill(false);
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1)); cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
}
writer.writePlacedElement(cell);
} }
writer.writePlacedElement(cell);
} }
} }
eb.destroy();
} }
@SneakyThrows @SneakyThrows
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) { public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
try (var colorPt = new ColorPt(1, 0, 0); var eb = new ElementBuilder()) { ColorPt colorPt = new ColorPt(1, 0, 0);
for (int i = 0; i < rectCollection.getNumRects(); ++i) { ElementBuilder eb = new ElementBuilder();
try(var r = rectCollection.getRectAt(i)) { for (int i = 0; i < rectCollection.getNumRects(); ++i) {
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()); Rect r = rectCollection.getRectAt(i);
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
rect.setPathStroke(true); rect.setPathStroke(true);
rect.getGState().setLineWidth(5); rect.getGState().setLineWidth(5);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt); rect.getGState().setStrokeColor(colorPt);
rect.setPathFill(true); rect.setPathFill(true);
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setFillColor(colorPt); rect.getGState().setFillColor(colorPt);
rect.getGState().setFillOpacity(0.5); rect.getGState().setFillOpacity(0.5);
writer.writePlacedElement(rect); writer.writePlacedElement(rect);
}
}
} }
colorPt.destroy();
eb.destroy();
} }
} }

View File

@ -1,5 +1,25 @@
package com.knecon.fforesight.service.ocr.processor.utils; package com.knecon.fforesight.service.ocr.processor.utils;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIDelete;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIEnd;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIGetIterator;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIGetStringVariable;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIMeanTextConf;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIProcessPage;
import static net.sourceforge.tess4j.TessAPI1.TessDeleteResultRenderer;
import static net.sourceforge.tess4j.TessAPI1.TessHOcrRendererCreate;
import static net.sourceforge.tess4j.TessAPI1.TessPageIteratorBegin;
import static net.sourceforge.tess4j.TessAPI1.TessPageIteratorBoundingBox;
import static net.sourceforge.tess4j.TessAPI1.TessPageIteratorNext;
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorConfidence;
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorDelete;
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorGetPageIterator;
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorGetUTF8Text;
import static net.sourceforge.tess4j.TessAPI1.TessResultRendererBeginDocument;
import static net.sourceforge.tess4j.TessAPI1.TessResultRendererEndDocument;
import static net.sourceforge.tess4j.TessAPI1.TessResultRendererInsert;
import java.awt.Rectangle; import java.awt.Rectangle;
import java.nio.IntBuffer; import java.nio.IntBuffer;
import java.util.ArrayList; import java.util.ArrayList;
@ -9,20 +29,19 @@ import com.sun.jna.Pointer;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix; import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.OCRResult; import net.sourceforge.tess4j.OCRResult;
import net.sourceforge.tess4j.TessAPI1; import net.sourceforge.tess4j.TessAPI1;
import net.sourceforge.tess4j.Tesseract1; import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException; import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word; import net.sourceforge.tess4j.Word;
@Slf4j @Slf4j
/** /**
* Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted. * Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted.
*/ */ public class Tesseract2 extends Tesseract {
public class Tesseract2 extends Tesseract1 {
private int createDocuments(Pix pix, String filename, ITessAPI.TessResultRenderer renderer) {
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE); String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
TessResultRendererBeginDocument(renderer, title); TessResultRendererBeginDocument(renderer, title);
@ -62,7 +81,7 @@ public class Tesseract2 extends Tesseract1 {
try { try {
for (int i = 0; i < pixs.length; i++) { for (int i = 0; i < pixs.length; i++) {
try { try {
TessResultRenderer renderer = createRenderers(outputbases[i], formats); ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats);
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer); int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
TessDeleteResultRenderer(renderer); TessDeleteResultRenderer(renderer);
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>(); List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
@ -85,8 +104,8 @@ public class Tesseract2 extends Tesseract1 {
List<Word> words = new ArrayList<>(); List<Word> words = new ArrayList<>();
try { try {
TessResultIterator ri = TessBaseAPIGetIterator(getHandle()); ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
TessPageIterator pi = TessResultIteratorGetPageIterator(ri); ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
TessPageIteratorBegin(pi); TessPageIteratorBegin(pi);
do { do {
@ -119,9 +138,9 @@ public class Tesseract2 extends Tesseract1 {
} }
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) { private ITessAPI.TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
TessResultRenderer renderer = null; ITessAPI.TessResultRenderer renderer = null;
for (RenderedFormat format : formats) { for (RenderedFormat format : formats) {
switch (format) { switch (format) {
@ -138,4 +157,12 @@ public class Tesseract2 extends Tesseract1 {
return renderer; return renderer;
} }
/**
 * Releases the native Tesseract engine behind this instance.
 *
 * <p>Calls {@code TessBaseAPIEnd} followed by {@code TessBaseAPIDelete} on the handle
 * returned by {@code getHandle()}. When no handle is available the method returns
 * without touching the native library, so a null pointer is never handed to it.
 */
@Override
protected void dispose() {

    var handle = getHandle();
    if (handle == null) {
        // No engine handle to release; skip the native calls entirely.
        return;
    }

    TessBaseAPIEnd(handle);
    TessBaseAPIDelete(handle);
}
} }

View File

@ -3,10 +3,10 @@ import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
plugins { plugins {
application application
id("com.iqser.red.service.java-conventions") id("com.iqser.red.service.java-conventions")
id("org.springframework.boot") version "3.2.3" id("org.springframework.boot") version "3.1.5"
id("io.spring.dependency-management") version "1.1.3" id("io.spring.dependency-management") version "1.1.3"
id("org.sonarqube") version "4.3.0.3225" id("org.sonarqube") version "4.3.0.3225"
id("io.freefair.lombok") version "8.4" id("io.freefair.lombok") version "8.2.2"
} }
configurations { configurations {
@ -17,14 +17,14 @@ configurations {
} }
} }
val springBootStarterVersion = "3.2.3" val springBootStarterVersion = "3.1.5"
dependencies { dependencies {
implementation(project(":ocr-service-processor")) implementation(project(":ocr-service-processor"))
implementation(project(":ocr-service-api")) implementation(project(":ocr-service-api"))
implementation("com.knecon.fforesight:tracing-commons:0.7.0") implementation("com.knecon.fforesight:tracing-commons:0.3.0")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1") implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}") implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
implementation("net.logstash.logback:logstash-logback-encoder:7.4") implementation("net.logstash.logback:logstash-logback-encoder:7.4")
@ -39,7 +39,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ") environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8") environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.55.0/Resource/Init/") // set ghostscript lib path, version in path must match version in Aptfile environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.26/Resource/Init/") // set ghostscript lib path
environment.put("BPE_FONTCONFIG_PATH", "/layers/fagiani_apt/apt/etc/fonts/") // set ghostscript fontconfig path environment.put("BPE_FONTCONFIG_PATH", "/layers/fagiani_apt/apt/etc/fonts/") // set ghostscript fontconfig path
var aptfile = layout.projectDirectory.file("src/main/resources/Aptfile").toString() var aptfile = layout.projectDirectory.file("src/main/resources/Aptfile").toString()
@ -53,7 +53,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
buildpacks.set( buildpacks.set(
listOf( listOf(
"ghcr.io/knsita/buildpacks/fagiani_apt@sha256:9771d4d27d8050aee62769490b8882fffc794745c129fb98e1f33196e2c93504", "ghcr.io/fagiani/buildpacks/fagiani_apt@sha256:6471c8c70f32b749e29f65ae562ac0339fecad26aa9217628c00a6c31f197dae",
"ghcr.io/kschuettler/knecon-vcpkg@sha256:ba5e967b124de4865ff7e8f565684f752dd6e97b302e2dcf651283f6a19b98b9", "ghcr.io/kschuettler/knecon-vcpkg@sha256:ba5e967b124de4865ff7e8f565684f752dd6e97b302e2dcf651283f6a19b98b9",
"ghcr.io/kschuettler/knecon-tessdata@sha256:9062f728aa0340ac963bcdd6f5e740d683823a81d3f480db894da15bff72691a", "ghcr.io/kschuettler/knecon-tessdata@sha256:9062f728aa0340ac963bcdd6f5e740d683823a81d3f480db894da15bff72691a",
"urn:cnb:builder:paketo-buildpacks/java" "urn:cnb:builder:paketo-buildpacks/java"

View File

@ -5,28 +5,27 @@ import org.springframework.boot.actuate.autoconfigure.security.servlet.Managemen
import org.springframework.boot.autoconfigure.ImportAutoConfiguration; import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration; import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
import org.springframework.cloud.openfeign.EnableFeignClients;
import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.EnableAspectJAutoProxy;
import org.springframework.context.annotation.Import; import org.springframework.context.annotation.Import;
import org.springframework.scheduling.annotation.EnableAsync; import org.springframework.scheduling.annotation.EnableAsync;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService; import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.lifecyclecommons.LifecycleAutoconfiguration;
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration; import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.knecon.fforesight.service.ocr.v1.server.queue.MessagingConfiguration; import com.knecon.fforesight.service.ocr.v1.server.queue.MessagingConfiguration;
import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration; import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
import com.knecon.fforesight.tracing.OpenTelemetryConfig;
import io.micrometer.core.aop.TimedAspect; import io.micrometer.core.aop.TimedAspect;
import io.micrometer.core.instrument.MeterRegistry; import io.micrometer.core.instrument.MeterRegistry;
@EnableAsync @EnableAsync
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class, LifecycleAutoconfiguration.class}) @ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class}) @SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class, OpenTelemetryConfig.class}) @Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class})
@EnableAspectJAutoProxy @EnableFeignClients(basePackageClasses = FileStatusProcessingUpdateClient.class)
public class Application { public class Application {
/** /**

View File

@ -0,0 +1,10 @@
package com.knecon.fforesight.service.ocr.v1.server.client;
import org.springframework.cloud.openfeign.FeignClient;
import com.iqser.red.service.persistence.service.v1.api.internal.resources.FileStatusProcessingUpdateResource;
/**
 * Feign client bound to {@link FileStatusProcessingUpdateResource}.
 *
 * <p>Declares no methods of its own; every operation is inherited from the extended
 * resource interface. The client is registered under the name
 * {@code FileStatusProcessingUpdateResource} and its base URL is resolved from the
 * {@code persistence-service.url} property.
 */
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${persistence-service.url}")
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {
}

View File

@ -11,10 +11,35 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor @RequiredArgsConstructor
public class MessagingConfiguration { public class MessagingConfiguration {
public static final String OCR_REQUEST_QUEUE = "ocr_request_queue"; public static final String OCR_QUEUE = "ocrQueue";
public static final String OCR_RESPONSE_QUEUE = "ocr_response_queue"; public static final String OCR_DLQ = "ocrDLQ";
public static final String X_DEAD_LETTER_EXCHANGE = "x-dead-letter-exchange";
public static final String X_DEAD_LETTER_ROUTING_KEY = "x-dead-letter-routing-key";
public static final String X_MAX_PRIORITY = "x-max-priority";
public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue"; public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";
public static final String X_ERROR_INFO_HEADER = "x-error-message"; public static final String X_ERROR_INFO_HEADER = "x-error-message";
public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp"; public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp";
/**
 * Declares the durable queue {@link #OCR_QUEUE} that carries OCR requests.
 *
 * <p>The queue is configured with an empty dead-letter exchange name and
 * {@link #OCR_DLQ} as dead-letter routing key, and with a maximum priority of 2.
 *
 * @return the queue definition for {@link #OCR_QUEUE}
 */
@Bean
public Queue ocrQueue() {

    return QueueBuilder.durable(OCR_QUEUE)
            .withArgument(X_DEAD_LETTER_EXCHANGE, "")
            .withArgument(X_DEAD_LETTER_ROUTING_KEY, OCR_DLQ)
            // x-max-priority is set exactly once here; QueueBuilder.maxPriority(int) writes the
            // same argument, so calling it as well would only duplicate this setting.
            .withArgument(X_MAX_PRIORITY, 2)
            .build();
}
/**
 * Declares the durable queue {@link #OCR_DLQ}, built without any additional arguments.
 *
 * @return the queue definition for {@link #OCR_DLQ}
 */
@Bean
public Queue ocrDeadLetterQueue() {

    QueueBuilder deadLetterQueueBuilder = QueueBuilder.durable(OCR_DLQ);
    return deadLetterQueueBuilder.build();
}
} }

View File

@ -1,42 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.server.queue;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * {@link IOcrMessageSender} variant that emits no status updates.
 *
 * <p>Only registered when the property {@code ocrService.sendStatusUpdates} has the value
 * {@code false}. The started / progress / finished notifications are no-ops; the only
 * message that is sent is the OCR response on
 * {@link MessagingConfiguration#OCR_RESPONSE_QUEUE}.
 */
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@ConditionalOnProperty(value = "ocrService.sendStatusUpdates", havingValue = "false")
public class NoStatusUpdateOcrMessageSender implements IOcrMessageSender {

    RabbitTemplate rabbitTemplate;


    /** Sends a {@link DocumentRequest} for the given dossier and file to the OCR response queue. */
    public void sendOcrResponse(String dossierId, String fileId) {

        DocumentRequest response = new DocumentRequest(dossierId, fileId);
        rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, response);
    }


    /** No-op: this sender does not announce the start of OCR. */
    public void sendOCRStarted(String fileId) {
        // intentionally empty
    }


    /** No-op: this sender does not publish progress updates. */
    public void sendUpdate(String fileId, int finishedImages, int totalImages) {
        // intentionally empty
    }


    /** No-op: this sender does not publish a finished notification. */
    public void sendOcrFinished(String fileId, int totalImages) {
        // intentionally empty
    }

}

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.ocr.v1.server.queue; package com.knecon.fforesight.service.ocr.v1.server.queue;
import java.io.File; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
import java.time.temporal.ChronoUnit; import java.time.temporal.ChronoUnit;
@ -10,15 +10,17 @@ import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.core.Message; import org.springframework.amqp.core.Message;
import org.springframework.amqp.rabbit.annotation.RabbitHandler; import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener; import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.util.FileSystemUtils;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService; import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
import com.knecon.fforesight.service.ocr.processor.service.OCRService; import com.knecon.fforesight.service.ocr.processor.service.OCRService;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest; import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo;
import feign.FeignException;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
@ -30,49 +32,71 @@ import lombok.extern.slf4j.Slf4j;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrMessageReceiver { public class OcrMessageReceiver {
FileStorageService fileStorageService; FileStorageService fileStorageService;
ObjectMapper objectMapper; ObjectMapper objectMapper;
OCRService ocrService; FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
IOcrMessageSender ocrMessageSender; OCRService ocrService;
@RabbitHandler @RabbitHandler
@RabbitListener(queues = MessagingConfiguration.OCR_REQUEST_QUEUE, concurrency = "1") @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
public void receiveOcr(Message in) throws IOException { public void receiveOcr(Message in) throws IOException {
if (in.getMessageProperties().isRedelivered()) {
throw new AmqpRejectAndDontRequeueException("Redelivered OCR Request, aborting...");
}
DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class); DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class);
String dossierId = ocrRequestMessage.getDossierId(); log.info("--------------------------------------------------------------------------");
String fileId = ocrRequestMessage.getFileId(); log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId);
try { try {
log.info("--------------------------------------------------------------------------"); setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId);
ocrMessageSender.sendOCRStarted(fileId); if (!fileStorageService.untouchedFileExists(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId())) {
byte[] originalFile = fileStorageService.getOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
}
tmpDir.toFile().mkdirs(); try (var transferStream = new ByteArrayOutputStream()) {
File documentFile = tmpDir.resolve("document.pdf").toFile(); ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream);
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile(); try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) {
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream);
}
} catch (IOException e) {
log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
throw new RuntimeException(e);
}
fileStorageService.downloadFiles(dossierId, fileId, documentFile, viewerDocumentFile); fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
ocrService.runOcrOnDocument(dossierId, fileId, ocrRequestMessage.isRemoveWatermark(), tmpDir, documentFile, viewerDocumentFile);
fileStorageService.storeFiles(dossierId, fileId, documentFile, viewerDocumentFile);
ocrMessageSender.sendOcrResponse(dossierId, fileId);
} catch (Exception e) { } catch (Exception e) {
log.warn("An exception occurred in ocr file stage: {}", e.getMessage()); log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage()); in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage());
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS)); in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
throw new RuntimeException(e); throw new RuntimeException(e);
} finally { }
FileSystemUtils.deleteRecursively(tmpDir); }
/**
 * Consumes messages from {@link MessagingConfiguration#OCR_DLQ} and reports the failure.
 *
 * <p>The message body is deserialized into a {@link DocumentRequest}; the error text and
 * error timestamp are read from the {@code x-error-message} and
 * {@code x-error-message-timestamp} headers, and the failure is passed on through
 * {@code fileStatusProcessingUpdateClient.ocrFailed(...)}.
 *
 * @param failedMessage the dead-lettered OCR request
 * @throws IOException if the message body cannot be deserialized
 */
@RabbitHandler
@RabbitListener(queues = MessagingConfiguration.OCR_DLQ, concurrency = "1")
public void receiveOcrDLQ(Message failedMessage) throws IOException {

    DocumentRequest ocrRequestMessage = objectMapper.readValue(failedMessage.getBody(), DocumentRequest.class);
    log.info("OCR DLQ received: {}", ocrRequestMessage);

    String errorMessage = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_HEADER);
    // Read the header untyped: assigning it straight to OffsetDateTime would throw a
    // ClassCastException whenever the value is not already an OffsetDateTime instance.
    Object rawTimestamp = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER);
    OffsetDateTime timestamp = resolveErrorTimestamp(rawTimestamp);

    fileStatusProcessingUpdateClient.ocrFailed(ocrRequestMessage.getDossierId(),
            ocrRequestMessage.getFileId(),
            new FileErrorInfo(errorMessage, MessagingConfiguration.OCR_DLQ, "ocr-service", timestamp));
}


/**
 * Turns the raw error-timestamp header value into an {@link OffsetDateTime}.
 *
 * <p>An {@link OffsetDateTime} is returned unchanged. Any other non-null value is parsed from
 * its string form (NOTE(review): header values presumably arrive as strings once they have
 * travelled through the broker - confirm). A missing or unparsable value falls back to the
 * current time truncated to milliseconds.
 */
private static OffsetDateTime resolveErrorTimestamp(Object rawTimestamp) {

    if (rawTimestamp instanceof OffsetDateTime) {
        return (OffsetDateTime) rawTimestamp;
    }
    if (rawTimestamp != null) {
        try {
            return OffsetDateTime.parse(rawTimestamp.toString());
        } catch (java.time.format.DateTimeParseException e) {
            log.warn("Could not parse error timestamp header value '{}', falling back to current time", rawTimestamp);
        }
    }
    return OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS);
}
private void setStatusOcrProcessing(String dossierId, String fileId) {
try {
fileStatusProcessingUpdateClient.ocrProcessing(dossierId, fileId);
} catch (FeignException e) {
if (e.status() == HttpStatus.CONFLICT.value()) {
throw new AmqpRejectAndDontRequeueException(e.getMessage());
}
} }
} }

View File

@ -1,23 +1,17 @@
package com.knecon.fforesight.service.ocr.v1.server.queue; package com.knecon.fforesight.service.ocr.v1.server.queue;
import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender; import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
import com.knecon.fforesight.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.knecon.fforesight.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import jakarta.annotation.PostConstruct;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service @Service
@RequiredArgsConstructor @RequiredArgsConstructor
@ConditionalOnProperty(value = "ocrService.sendStatusUpdates", havingValue = "true")
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrMessageSender implements IOcrMessageSender { public class OcrMessageSender implements IOcrMessageSender {
@ -31,14 +25,6 @@ public class OcrMessageSender implements IOcrMessageSender {
} }
public void sendOCRStarted(String fileId) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
OCRStatusUpdateResponse.builder().fileId(fileId).ocrStarted(true).build());
}
public void sendUpdate(String fileId, int finishedImages, int totalImages) { public void sendUpdate(String fileId, int finishedImages, int totalImages) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
@ -46,10 +32,4 @@ public class OcrMessageSender implements IOcrMessageSender {
} }
public void sendOcrResponse(String dossierId, String fileId) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, new DocumentRequest(dossierId, fileId));
}
} }

View File

@ -1,5 +1,5 @@
# you can list packages # you can list packages
ghostscript=9.55.0~dfsg1-0ubuntu5.9 ghostscript
pkg-config pkg-config
zip zip
unzip unzip
@ -11,7 +11,6 @@ libk5crypto3
libkrb5support0 libkrb5support0
libkeyutils1 libkeyutils1
libkrb5-3 libkrb5-3
libbrotli1
# or include links to specific .deb files # or include links to specific .deb files
# http://ftp.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb # http://ftp.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb

View File

@ -12,9 +12,6 @@ project.version: 1.0-SNAPSHOT
server: server:
port: 8080 port: 8080
lifecycle:
base-package: com.knecon.fforesight.service.ocr
spring: spring:
application: application:
name: ocr-service name: ocr-service
@ -60,6 +57,3 @@ management:
endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces} endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces}
pdftron.license: ${PDFTRON_LICENSE} pdftron.license: ${PDFTRON_LICENSE}
ocrService:
sendStatusUpdates: true

View File

@ -6,7 +6,7 @@
"overrides": [ "overrides": [
{ {
"name": "tesseract", "name": "tesseract",
"version": "5.3.3" "version": "5.3.2"
}, },
{ {
"name": "leptonica", "name": "leptonica",

View File

@ -24,10 +24,10 @@ import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension; import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.ocr.processor.initializer.PDFNetInitializer;
import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService; import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
import com.knecon.fforesight.tenantcommons.TenantsClient; import com.knecon.fforesight.tenantcommons.TenantsClient;
import com.pdftron.pdf.PDFNet; import com.pdftron.pdf.PDFNet;
@ -36,7 +36,7 @@ import lombok.SneakyThrows;
@ExtendWith({SpringExtension.class, MockitoExtension.class}) @ExtendWith({SpringExtension.class, MockitoExtension.class})
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import({AbstractTest.TestConfiguration.class, NativeLibrariesInitializer.class}) @Import({AbstractTest.TestConfiguration.class, PDFNetInitializer.class})
@AutoConfigureObservability @AutoConfigureObservability
public class AbstractTest { public class AbstractTest {

View File

@ -9,7 +9,6 @@ import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Comparator; import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -26,7 +25,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService; import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
import com.knecon.fforesight.service.ocr.processor.service.OCRService; import com.knecon.fforesight.service.ocr.processor.service.OCRService;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.knecon.fforesight.tenantcommons.TenantContext; import com.knecon.fforesight.tenantcommons.TenantContext;
import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusMeterRegistry;
@ -50,9 +48,9 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@Test @Test
public void testOCRMetrics() { public void testOCRMetrics() {
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf"); testOCR("files/Watermark.pdf");
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf"); testOCR("files/Watermark.pdf");
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf"); testOCR("files/Watermark.pdf");
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny(); var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
assertThat(ocrOnDocumentMeter.isPresent()).isTrue(); assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
@ -81,7 +79,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@Test @Test
public void testMergeImages() { public void testMergeImages() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there // check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("files/syngenta/CustomerFiles/SinglePages/merge_images - Page241_18 Chlorothalonil RAR 08 Volume 3CA B 6a Oct 2017.pdf"); String text = testOCR("files/merge_images.pdf");
assertThat(text).contains("Bodyweight change of dams with live young - group mean values", assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
"Control", "Control",
"mg/g day", "mg/g day",
@ -101,7 +99,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@Test @Test
public void testOCRWatermark() { public void testOCRWatermark() {
assertThat(testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf")).contains("syngenta"); assertThat(testOCR("files/Watermark.pdf")).contains("syngenta");
} }
@ -118,17 +116,18 @@ public class OcrServiceIntegrationTest extends AbstractTest {
private String testOCR(String fileName) { private String testOCR(String fileName) {
ClassPathResource pdfFileResource = new ClassPathResource(fileName); ClassPathResource pdfFileResource = new ClassPathResource(fileName);
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(Path.of(fileName).getFileName()); var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
tmpDir.toFile().mkdirs(); try (var fileStream = pdfFileResource.getInputStream()) {
var documentFile = tmpDir.resolve(Path.of("document.pdf")); storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf")); }
Files.copy(pdfFileResource.getFile().toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
Files.copy(pdfFileResource.getFile().toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", false, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile()); Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(fileName).getFileName());
System.out.println("File:" + documentFile); try (var out = new FileOutputStream(tmpFileName.toFile())) {
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
System.out.println("File:" + tmpFileName);
}
try (var fileStream = new FileInputStream(documentFile.toFile())) { try (var fileStream = new FileInputStream(tmpFileName.toFile())) {
return extractAllTextFromDocument(fileStream); return extractAllTextFromDocument(fileStream);
} }
} }
@ -167,18 +166,20 @@ public class OcrServiceIntegrationTest extends AbstractTest {
} }
@SneakyThrows @SneakyThrows
private void testOCRForFile(File file) { private void testOCRForFile(File file) {
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(file.toPath().getFileName()); var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
tmpDir.toFile().mkdirs(); try (var fileStream = new FileInputStream(file)) {
var documentFile = tmpDir.resolve(Path.of("document.pdf")); storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf")); }
Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
Files.copy(file.toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", false, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile()); Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(file.getAbsolutePath()).getFileName());
System.out.println("File:" + documentFile); try (var out = new FileOutputStream(tmpFileName.toFile())) {
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
System.out.println("File:" + tmpFileName);
}
System.out.println("\n\n"); System.out.println("\n\n");
} }

View File

@ -15,10 +15,3 @@ management:
health.enabled: true health.enabled: true
endpoints.web.exposure.include: prometheus, health, metrics endpoints.web.exposure.include: prometheus, health, metrics
metrics.export.prometheus.enabled: true metrics.export.prometheus.enabled: true
tracing:
enabled: ${TRACING_ENABLED:false}
sampling:
probability: ${TRACING_PROBABILITY:1.0}
otlp:
tracing:
endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces}

@ -1 +0,0 @@
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605

@ -1 +0,0 @@
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930

View File

@ -1,9 +1,5 @@
#!/bin/bash #!/bin/bash
set -e
dir=${PWD##*/} dir=${PWD##*/}
gradle assemble gradle assemble
# Get the current Git branch # Get the current Git branch
@ -15,32 +11,5 @@ commit_hash=$(git rev-parse --short=5 HEAD)
# Combine branch and commit hash # Combine branch and commit hash
buildName="${USER}-${branch}-${commit_hash}" buildName="${USER}-${branch}-${commit_hash}"
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName} gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"
newImageName="nexus.knecon.com:5001/ff/ocr-service-server:$buildName"
echo "full image name:"
echo ${newImageName}
echo ""
if [ -z "$1" ]; then
exit 0
fi
namespace=${1}
deployment_name="ocr-service-v1"
echo "deploying to ${namespace}"
oldImageName=$(rancher kubectl -n ${namespace} get deployment ${deployment_name} -o=jsonpath='{.spec.template.spec.containers[*].image}')
if [ "${newImageName}" = "${oldImageName}" ]; then
echo "Image tag did not change, redeploying..."
rancher kubectl rollout restart deployment ${deployment_name} -n ${namespace}
else
echo "upgrading the image tag..."
rancher kubectl set image deployment/${deployment_name} ${deployment_name}=${newImageName} -n ${namespace}
fi
rancher kubectl rollout status deployment ${deployment_name} -n ${namespace}
echo "Built ${deployment_name}:${buildName} and deployed to ${namespace}"