Compare commits
1 Commits
master
...
TestThread
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8924e905ad |
@ -1,7 +1,3 @@
|
|||||||
variables:
|
|
||||||
# SONAR_PROJECT_KEY: 'ocr-service:ocr-service-server'
|
|
||||||
GIT_SUBMODULE_STRATEGY: recursive
|
|
||||||
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
|
||||||
include:
|
include:
|
||||||
- project: 'gitlab/gitlab'
|
- project: 'gitlab/gitlab'
|
||||||
ref: 'main'
|
ref: 'main'
|
||||||
|
|||||||
8
.gitmodules
vendored
8
.gitmodules
vendored
@ -1,8 +0,0 @@
|
|||||||
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta"]
|
|
||||||
path = ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta
|
|
||||||
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
|
|
||||||
update = merge
|
|
||||||
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/basf"]
|
|
||||||
path = ocr-service-v1/ocr-service-server/src/test/resources/files/basf
|
|
||||||
url = https://gitlab.knecon.com/fforesight/documents/basf.git
|
|
||||||
update = merge
|
|
||||||
12
README.md
12
README.md
@ -74,14 +74,12 @@ String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-ch
|
|||||||
```
|
```
|
||||||
## Integration
|
## Integration
|
||||||
|
|
||||||
The OCR-service communicates via RabbitMQ and uses the queues `ocr_request_queue`, `ocr_response_queue`,
|
The OCR-service communicates via RabbitMQ and uses the queues `ocrQueue`, `ocrDLQ`, and `ocr_status_update_response_queue`.
|
||||||
`ocr_dead_letter_queue`, and `ocr_status_update_response_queue`.
|
|
||||||
|
|
||||||
### ocr_request_queue
|
### ocrQueue
|
||||||
This queue is used to start the OCR process; a DocumentRequest must be passed as the message. The service will then download the PDF from the provided cloud storage.
|
This queue is used to start the OCR process; a DocumentRequest must be passed as the message. The service will then download the PDF from the provided cloud storage.
|
||||||
### ocr_response_queue
|
|
||||||
This queue is also used to signal the end of processing.
|
|
||||||
### ocr_dead_letter_queue
|
|
||||||
This queue is used to signal an error has occurred during processing.
|
|
||||||
### ocr_status_update_response_queue
|
### ocr_status_update_response_queue
|
||||||
This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total number of images may change when fewer images are found than initially assumed.
|
This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total number of images may change when fewer images are found than initially assumed.
|
||||||
|
This queue is also used to signal the end of processing.
|
||||||
|
### ocrDLQ
|
||||||
|
This queue is used to signal an error has occurred during processing.
|
||||||
|
|||||||
@ -12,10 +12,6 @@ group = "com.knecon.fforesight.service"
|
|||||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||||
java.targetCompatibility = JavaVersion.VERSION_17
|
java.targetCompatibility = JavaVersion.VERSION_17
|
||||||
|
|
||||||
pmd {
|
|
||||||
isConsoleOutput = true
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks.pmdMain {
|
tasks.pmdMain {
|
||||||
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
|
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,12 +9,12 @@
|
|||||||
</description>
|
</description>
|
||||||
|
|
||||||
<rule ref="category/java/errorprone.xml">
|
<rule ref="category/java/errorprone.xml">
|
||||||
|
<exclude name="DataflowAnomalyAnalysis"/>
|
||||||
<exclude name="MissingSerialVersionUID"/>
|
<exclude name="MissingSerialVersionUID"/>
|
||||||
|
<exclude name="NullAssignment"/>
|
||||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||||
<exclude name="AvoidDuplicateLiterals"/>
|
<exclude name="AvoidDuplicateLiterals"/>
|
||||||
<exclude name="NullAssignment"/>
|
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||||
<exclude name="AssignmentInOperand"/>
|
|
||||||
<exclude name="BeanMembersShouldSerialize"/>
|
|
||||||
</rule>
|
</rule>
|
||||||
|
|
||||||
</ruleset>
|
</ruleset>
|
||||||
@ -10,13 +10,14 @@
|
|||||||
|
|
||||||
|
|
||||||
<rule ref="category/java/errorprone.xml">
|
<rule ref="category/java/errorprone.xml">
|
||||||
|
<exclude name="DataflowAnomalyAnalysis"/>
|
||||||
<exclude name="MissingSerialVersionUID"/>
|
<exclude name="MissingSerialVersionUID"/>
|
||||||
|
<exclude name="NullAssignment"/>
|
||||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||||
<exclude name="AvoidDuplicateLiterals"/>
|
<exclude name="AvoidDuplicateLiterals"/>
|
||||||
<exclude name="NullAssignment"/>
|
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||||
<exclude name="AssignmentInOperand"/>
|
<exclude name="AvoidFieldNameMatchingTypeName"/>
|
||||||
<exclude name="TestClassWithoutTestCases"/>
|
<exclude name="TestClassWithoutTestCases"/>
|
||||||
<exclude name="BeanMembersShouldSerialize"/>
|
|
||||||
</rule>
|
</rule>
|
||||||
|
|
||||||
</ruleset>
|
</ruleset>
|
||||||
@ -1,7 +1,7 @@
|
|||||||
plugins {
|
plugins {
|
||||||
`maven-publish`
|
`maven-publish`
|
||||||
id("com.iqser.red.service.java-conventions")
|
id("com.iqser.red.service.java-conventions")
|
||||||
id("io.freefair.lombok") version "8.4"
|
id("io.freefair.lombok") version "8.2.2"
|
||||||
}
|
}
|
||||||
|
|
||||||
publishing {
|
publishing {
|
||||||
|
|||||||
@ -13,12 +13,5 @@ public class DocumentRequest {
|
|||||||
|
|
||||||
protected String dossierId;
|
protected String dossierId;
|
||||||
protected String fileId;
|
protected String fileId;
|
||||||
protected boolean removeWatermark;
|
|
||||||
|
|
||||||
public DocumentRequest(String dossierId, String fileId) {
|
|
||||||
|
|
||||||
this.dossierId = dossierId;
|
|
||||||
this.fileId = fileId;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,6 +15,5 @@ public class OCRStatusUpdateResponse {
|
|||||||
private int numberOfPagesToOCR;
|
private int numberOfPagesToOCR;
|
||||||
private int numberOfOCRedPages;
|
private int numberOfOCRedPages;
|
||||||
private boolean ocrFinished;
|
private boolean ocrFinished;
|
||||||
private boolean ocrStarted;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
plugins {
|
plugins {
|
||||||
id("com.iqser.red.service.java-conventions")
|
id("com.iqser.red.service.java-conventions")
|
||||||
id("io.freefair.lombok") version "8.4"
|
id("io.freefair.lombok") version "8.2.2"
|
||||||
}
|
}
|
||||||
|
|
||||||
configurations {
|
configurations {
|
||||||
@ -14,8 +14,7 @@ dependencies {
|
|||||||
api("net.sourceforge.tess4j:tess4j:5.8.0")
|
api("net.sourceforge.tess4j:tess4j:5.8.0")
|
||||||
api("com.iqser.red.commons:metric-commons:2.1.0")
|
api("com.iqser.red.commons:metric-commons:2.1.0")
|
||||||
api("com.iqser.red.commons:storage-commons:2.45.0")
|
api("com.iqser.red.commons:storage-commons:2.45.0")
|
||||||
api("com.knecon.fforesight:tenant-commons:0.21.0")
|
api("com.knecon.fforesight:tenant-commons:0.19.0")
|
||||||
api("com.knecon.fforesight:lifecycle-commons:0.6.0")
|
|
||||||
api("com.pdftron:PDFNet:10.5.0")
|
api("com.pdftron:PDFNet:10.5.0")
|
||||||
api("org.apache.pdfbox:pdfbox:3.0.0")
|
api("org.apache.pdfbox:pdfbox:3.0.0")
|
||||||
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||||
@ -25,7 +24,6 @@ dependencies {
|
|||||||
api("io.github.karols:hocr4j:0.2.0")
|
api("io.github.karols:hocr4j:0.2.0")
|
||||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||||
api("com.google.guava:guava:31.1-jre")
|
api("com.google.guava:guava:31.1-jre")
|
||||||
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
api("com.iqser.red.commons:pdftron-logic-commons:2.20.0")
|
||||||
api("com.knecon.fforesight:viewer-doc-processor:0.125.0")
|
|
||||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,26 +1,14 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor;
|
package com.knecon.fforesight.service.ocr.processor;
|
||||||
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
|
||||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||||
import org.springframework.context.annotation.Bean;
|
|
||||||
import org.springframework.context.annotation.ComponentScan;
|
import org.springframework.context.annotation.ComponentScan;
|
||||||
import org.springframework.context.annotation.Configuration;
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
|
||||||
|
|
||||||
import io.micrometer.observation.ObservationRegistry;
|
|
||||||
|
|
||||||
@Configuration
|
@Configuration
|
||||||
@ComponentScan
|
@ComponentScan
|
||||||
@EnableConfigurationProperties(OcrServiceSettings.class)
|
@EnableConfigurationProperties(OcrServiceSettings.class)
|
||||||
public class OcrServiceProcessorConfiguration {
|
public class OcrServiceProcessorConfiguration {
|
||||||
|
|
||||||
@Bean
|
|
||||||
@Autowired
|
|
||||||
public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
|
||||||
|
|
||||||
return new ViewerDocumentService(registry);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,20 +1,17 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.initializer;
|
package com.knecon.fforesight.service.ocr.processor.initializer;
|
||||||
|
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
|
||||||
import org.springframework.stereotype.Component;
|
|
||||||
|
|
||||||
import com.pdftron.pdf.PDFNet;
|
import com.pdftron.pdf.PDFNet;
|
||||||
import com.sun.jna.NativeLibrary;
|
import com.sun.jna.NativeLibrary;
|
||||||
|
|
||||||
import jakarta.annotation.PostConstruct;
|
import jakarta.annotation.PostConstruct;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Component
|
@Component
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class NativeLibrariesInitializer {
|
public class PDFNetInitializer {
|
||||||
|
|
||||||
@Value("${pdftron.license:}")
|
@Value("${pdftron.license:}")
|
||||||
private String pdftronLicense;
|
private String pdftronLicense;
|
||||||
@ -25,25 +22,8 @@ public class NativeLibrariesInitializer {
|
|||||||
// Do not change this back to an application runner: as an application runner it takes messages from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
|
// Do not change this back to an application runner: as an application runner it takes messages from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
|
||||||
public void init() {
|
public void init() {
|
||||||
|
|
||||||
log.info("Initializing Native Libraries");
|
|
||||||
log.info("Setting pdftron license: {}", pdftronLicense);
|
|
||||||
PDFNet.setTempPath("/tmp/pdftron");
|
PDFNet.setTempPath("/tmp/pdftron");
|
||||||
PDFNet.initialize(pdftronLicense);
|
PDFNet.initialize(pdftronLicense);
|
||||||
|
|
||||||
log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB"));
|
|
||||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||||
|
|
||||||
log.info("Asserting Native Libraries loaded");
|
|
||||||
|
|
||||||
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
|
|
||||||
assert leptonicaLib != null;
|
|
||||||
log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
|
|
||||||
}
|
|
||||||
|
|
||||||
try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) {
|
|
||||||
assert tesseractLib != null;
|
|
||||||
log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -25,18 +25,11 @@ public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, Qu
|
|||||||
.collect(Collectors.toMap(Map.Entry::getKey,
|
.collect(Collectors.toMap(Map.Entry::getKey,
|
||||||
entry -> entry.getValue()
|
entry -> entry.getValue()
|
||||||
.stream()
|
.stream()
|
||||||
.map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
|
.map(ocrResult -> new OcrResultToWrite(ocrResult.getAllWords()
|
||||||
|
.stream()
|
||||||
|
.filter(word -> !word.isBlank())
|
||||||
|
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
|
||||||
|
.toList(), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
|
||||||
.toList()));
|
.toList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
|
|
||||||
|
|
||||||
return ocrResult.getAllWords()
|
|
||||||
.stream()
|
|
||||||
.filter(word -> !word.isBlank())
|
|
||||||
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
|
|
||||||
.toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,42 +1,12 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.model;
|
package com.knecon.fforesight.service.ocr.processor.model;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
||||||
|
|
||||||
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {
|
public record PageInformation(int height, int width, int number, int rotationDegrees) {
|
||||||
|
|
||||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||||
|
|
||||||
PDRectangle mediaBox = page.getMediaBox();
|
return new PageInformation((int) page.getMediaBox().getHeight(), (int) page.getMediaBox().getWidth(), pageNum, page.getRotation());
|
||||||
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
|
|
||||||
pageNum,
|
|
||||||
page.getRotation());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public double height() {
|
|
||||||
|
|
||||||
return mediabox.getHeight();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public double width() {
|
|
||||||
|
|
||||||
return mediabox.getWidth();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public double minX() {
|
|
||||||
|
|
||||||
return mediabox.getX();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public double minY() {
|
|
||||||
|
|
||||||
return mediabox.getY();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.model;
|
package com.knecon.fforesight.service.ocr.processor.model;
|
||||||
|
|
||||||
import java.awt.Rectangle;
|
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Line2D;
|
import java.awt.geom.Line2D;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
@ -35,16 +34,6 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
|||||||
new Point2D.Double(bounds.getRight(), bounds.getBottom()));
|
new Point2D.Double(bounds.getRight(), bounds.getBottom()));
|
||||||
}
|
}
|
||||||
|
|
||||||
public Rectangle2D getBounds2D() {
|
|
||||||
|
|
||||||
double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX());
|
|
||||||
double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY());
|
|
||||||
double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX());
|
|
||||||
double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY());
|
|
||||||
|
|
||||||
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public QuadPoint getTransformed(AffineTransform at) {
|
public QuadPoint getTransformed(AffineTransform at) {
|
||||||
|
|
||||||
|
|||||||
@ -1,12 +1,20 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.model;
|
package com.knecon.fforesight.service.ocr.processor.model;
|
||||||
|
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import net.sourceforge.lept4j.Leptonica1;
|
||||||
import net.sourceforge.lept4j.Pix;
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.tess4j.ITessAPI;
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
|
|||||||
@ -1,11 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.service;
|
package com.knecon.fforesight.service.ocr.processor.service;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.StandardCopyOption;
|
import java.nio.file.Paths;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||||
@ -29,38 +31,47 @@ public class FileStorageService {
|
|||||||
return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
|
return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public byte[] getOriginalFile(String dossierId, String fileId) {
|
||||||
|
|
||||||
|
try (InputStream inputStream = getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN))) {
|
||||||
|
return IOUtils.toByteArray(inputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public InputStream getOriginalFileAsStream(String dossierId, String fileId) {
|
||||||
|
|
||||||
|
return getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void storeOriginalFile(String dossierId, String fileId, InputStream stream) {
|
||||||
|
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), stream);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean untouchedFileExists(String dossierId, String fileId) {
|
public boolean untouchedFileExists(String dossierId, String fileId) {
|
||||||
|
|
||||||
return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED));
|
return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED));
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {
|
|
||||||
|
|
||||||
try (var in = new FileInputStream(documentFile)) {
|
public void storeUntouchedFile(String dossierId, String fileId, byte[] data) {
|
||||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), in);
|
|
||||||
}
|
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data));
|
||||||
try (var in = new FileInputStream(viewerDocumentFile)) {
|
|
||||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), in);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {
|
private InputStream getInputStream(String storageId) {
|
||||||
|
|
||||||
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile);
|
File tempFile = File.createTempFile("temp", ".data");
|
||||||
if (storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT))) {
|
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||||
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile);
|
return Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE);
|
||||||
} else {
|
|
||||||
Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!untouchedFileExists(dossierId, fileId)) {
|
|
||||||
try (var in = new FileInputStream(documentFile)) {
|
|
||||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), in);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -7,10 +7,7 @@ public interface IOcrMessageSender {
|
|||||||
|
|
||||||
void sendUpdate(String fileId, int finishedImages, int totalImages);
|
void sendUpdate(String fileId, int finishedImages, int totalImages);
|
||||||
|
|
||||||
void sendOCRStarted(String fileId);
|
|
||||||
|
|
||||||
void sendOcrFinished(String fileId, int totalImages);
|
void sendOcrFinished(String fileId, int totalImages);
|
||||||
|
|
||||||
void sendOcrResponse(String dossierId, String fileId);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,12 +1,12 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.service;
|
package com.knecon.fforesight.service.ocr.processor.service;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.IOException;
|
||||||
import java.io.FileOutputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.file.Files;
|
import java.io.OutputStream;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardCopyOption;
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -27,10 +27,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
|||||||
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
|
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
import com.pdftron.pdf.PDFDoc;
|
|
||||||
|
|
||||||
import io.micrometer.observation.ObservationRegistry;
|
|
||||||
import io.micrometer.observation.annotation.Observed;
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -51,7 +48,6 @@ public class OCRService {
|
|||||||
OcrResultWriter ocrResultWriter;
|
OcrResultWriter ocrResultWriter;
|
||||||
GhostScriptService ghostScriptService;
|
GhostScriptService ghostScriptService;
|
||||||
FontStyleDetector boldDetector;
|
FontStyleDetector boldDetector;
|
||||||
ObservationRegistry registry;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -59,66 +55,54 @@ public class OCRService {
|
|||||||
* looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
|
* looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
|
||||||
* perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
|
* perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
|
||||||
*
|
*
|
||||||
* @param dossierId Id of dossier
|
* @param dossierId Id of dossier
|
||||||
* @param fileId Id of file
|
* @param fileId Id of file
|
||||||
* @param tmpDir working directory for all files
|
* @param out OutputStream where to write to
|
||||||
* @param documentFile the file to perform ocr on, results are written invisibly
|
|
||||||
* @param viewerDocumentFile debugging file, results are written visibly in an optional content group
|
|
||||||
*/
|
*/
|
||||||
@Observed(name = "OCRService", contextualName = "run-ocr-on-document")
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void runOcrOnDocument(String dossierId, String fileId, boolean removeWatermark, Path tmpDir, File documentFile, File viewerDocumentFile) {
|
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) {
|
||||||
|
|
||||||
if (removeWatermark) {
|
try (InputStream fileStream = removeWatermarkIfEnabled(dossierId, fileId); ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||||
removeWatermarkIfEnabled(documentFile);
|
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||||
|
|
||||||
|
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||||
|
log.info("Starting OCR for file {}", fileId);
|
||||||
|
long ocrStart = System.currentTimeMillis();
|
||||||
|
Statistics stats = runOcr(transferInputStream, out, fileId, dossierId);
|
||||||
|
long ocrEnd = System.currentTimeMillis();
|
||||||
|
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||||
|
log.info("Runtime breakdown: {}", stats);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
removeInvisibleElements(documentFile);
|
}
|
||||||
|
|
||||||
log.info("Starting OCR for file {}", fileId);
|
|
||||||
long ocrStart = System.currentTimeMillis();
|
|
||||||
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId);
|
|
||||||
long ocrEnd = System.currentTimeMillis();
|
|
||||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
|
||||||
log.info("Runtime breakdown: {}", stats);
|
|
||||||
|
|
||||||
|
private InputStream removeWatermarkIfEnabled(String dossierId, String fileId) throws IOException {
|
||||||
|
|
||||||
|
if (settings.isRemoveWatermark()) {
|
||||||
|
try (var in = fileStorageService.getOriginalFileAsStream(dossierId, fileId); var transferOutputStream = new ByteArrayOutputStream()) {
|
||||||
|
watermarkRemovalService.removeWatermarks(in, transferOutputStream);
|
||||||
|
return new ByteArrayInputStream(transferOutputStream.toByteArray());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void removeInvisibleElements(File originFile) {
|
public Statistics runOcr(InputStream in, OutputStream out, String fileId, String dossierId) {
|
||||||
|
|
||||||
Path tmpFile = Files.createTempFile("invisibleElements", ".pdf");
|
|
||||||
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
|
|
||||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false, false);
|
|
||||||
}
|
|
||||||
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
assert tmpFile.toFile().delete();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void removeWatermarkIfEnabled(File originFile) {
|
|
||||||
|
|
||||||
Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf");
|
|
||||||
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
|
|
||||||
watermarkRemovalService.removeWatermarks(in, out);
|
|
||||||
}
|
|
||||||
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
assert tmpFile.toFile().delete();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) {
|
|
||||||
|
|
||||||
long timestamp;
|
long timestamp;
|
||||||
|
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId);
|
||||||
Path tmpImageDir = tmpDir.resolve("images");
|
Path tmpImageDir = tmpDir.resolve("images");
|
||||||
Path tesseractOutputDir = tmpDir.resolve("tesseract_output");
|
Path tesseractOutputDir = tmpDir.resolve("tesseract_output");
|
||||||
|
|
||||||
tesseractOutputDir.toFile().mkdirs();
|
tesseractOutputDir.toFile().mkdirs();
|
||||||
tmpImageDir.toFile().mkdirs();
|
tmpImageDir.toFile().mkdirs();
|
||||||
|
|
||||||
|
File documentFile = OsUtils.writeFileToTmpFolder(in, tmpDir);
|
||||||
|
|
||||||
Statistics stats;
|
Statistics stats;
|
||||||
try (PDDocument document = Loader.loadPDF(documentFile)) {
|
try (PDDocument document = Loader.loadPDF(documentFile)) {
|
||||||
OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);
|
OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);
|
||||||
@ -162,11 +146,12 @@ public class OCRService {
|
|||||||
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
|
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
|
||||||
|
|
||||||
timestamp = System.currentTimeMillis();
|
timestamp = System.currentTimeMillis();
|
||||||
ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage);
|
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage);
|
||||||
|
|
||||||
log.info("Saving document");
|
log.info("Saving document");
|
||||||
|
document.saveIncremental(out, dictionariesToUpdate);
|
||||||
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
||||||
|
|
||||||
|
FileSystemUtils.deleteRecursively(tmpDir);
|
||||||
logger.sendFinished();
|
logger.sendFinished();
|
||||||
return stats;
|
return stats;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,38 +1,29 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.service;
|
package com.knecon.fforesight.service.ocr.processor.service;
|
||||||
|
|
||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
import java.awt.geom.Line2D;
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Set;
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
|
import org.apache.pdfbox.cos.COSName;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDResources;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
|
||||||
|
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
|
||||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
|
||||||
import com.pdftron.pdf.PDFDoc;
|
|
||||||
import com.pdftron.pdf.Page;
|
|
||||||
import com.pdftron.pdf.TextExtractor;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -46,179 +37,180 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class OcrResultWriter {
|
public class OcrResultWriter {
|
||||||
|
|
||||||
public static final Color REGULAR_TEXT_COLOR = Color.BLUE;
|
static String ocrLayerName = "knecon OCR";
|
||||||
public static final Color BOLD_TEXT_COLOR = Color.CYAN;
|
OcrServiceSettings settings;
|
||||||
|
|
||||||
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED;
|
|
||||||
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED;
|
|
||||||
|
|
||||||
ViewerDocumentService viewerDocumentService;
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||||
|
|
||||||
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = new HashMap<>();
|
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||||
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = new HashMap<>();
|
imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate));
|
||||||
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = new HashMap<>();
|
dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
|
||||||
|
return dictionariesToUpdate;
|
||||||
try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) {
|
|
||||||
|
|
||||||
for (Integer pageNumber : imagesWithResultsPerPage.keySet()) {
|
|
||||||
|
|
||||||
List<Rectangle2D> textBBoxes = getTextBBoxes(doc.getPage(pageNumber));
|
|
||||||
|
|
||||||
ocrVisualizationsOnPages.put(pageNumber - 1, createVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
|
|
||||||
ocrTextDebugVisualizationsOnPages.put(pageNumber - 1, createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
|
|
||||||
ocrBBoxDebugVisualizationsOnPages.put(pageNumber - 1, createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
|
||||||
|
|
||||||
List<Visualizations> debugVisualizations = List.of(visualizations,
|
|
||||||
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
|
|
||||||
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
|
|
||||||
|
|
||||||
viewerDocumentService.addVisualizationsOnPage(document, document, List.of(visualizations));
|
|
||||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("PMD")
|
|
||||||
private List<Rectangle2D> getTextBBoxes(Page page) {
|
|
||||||
|
|
||||||
List<Rectangle2D> textBBoxes = new ArrayList<>();
|
|
||||||
try (var textExtractor = new TextExtractor()) {
|
|
||||||
textExtractor.begin(page);
|
|
||||||
try {
|
|
||||||
|
|
||||||
for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = getNextLine(line)) {
|
|
||||||
for (TextExtractor.Word word = line.getFirstWord(); word.isValid(); word = getNextWord(word)) {
|
|
||||||
textBBoxes.add(Converter.toRectangle2D(word.getBBox()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.warn("Could not get word dimension, {}", e.getMessage());
|
|
||||||
}
|
|
||||||
return textBBoxes;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static TextExtractor.Word getNextWord(TextExtractor.Word word) {
|
|
||||||
|
|
||||||
TextExtractor.Word nextWord = word.getNextWord();
|
|
||||||
word.close();
|
|
||||||
return nextWord;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static TextExtractor.Line getNextLine(TextExtractor.Line line) {
|
|
||||||
|
|
||||||
TextExtractor.Line newLine = line.getNextLine();
|
|
||||||
line.close();
|
|
||||||
return newLine;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> ignoreZones) {
|
|
||||||
|
|
||||||
List<TextPositionInImage> words = ocrResultsToWrite.stream()
|
|
||||||
.map(OcrResultToWrite::textPositionInImage)
|
|
||||||
.flatMap(Collection::stream)
|
|
||||||
.filter(word -> ignoreZones.stream()
|
|
||||||
.noneMatch(ignoreZone -> word.getTransformedTextBBox().getBounds2D().intersects(ignoreZone)))
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
List<PlacedText> placedTexts = words.stream()
|
|
||||||
.map(word -> new PlacedText(word.getText(),
|
|
||||||
null,
|
|
||||||
Color.BLACK,
|
|
||||||
(float) word.getFontSize(),
|
|
||||||
word.getFontMetricsFactory(),
|
|
||||||
Optional.of(word.getTextMatrix()),
|
|
||||||
Optional.of(RenderingMode.NEITHER)))
|
|
||||||
.toList();
|
|
||||||
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {
|
|
||||||
|
|
||||||
List<TextPositionInImage> wordsToDraw = new ArrayList<>();
|
|
||||||
List<TextPositionInImage> ignoredWords = new ArrayList<>();
|
|
||||||
|
|
||||||
for (OcrResultToWrite ocrResultToWrite : ocrResultsToWrite) {
|
|
||||||
for (TextPositionInImage textPositionInImage : ocrResultToWrite.textPositionInImage()) {
|
|
||||||
if (textBBoxes.stream()
|
|
||||||
.anyMatch(ignoreZone -> textPositionInImage.getTransformedTextBBox().getBounds2D().intersects(ignoreZone))) {
|
|
||||||
ignoredWords.add(textPositionInImage);
|
|
||||||
} else {
|
|
||||||
wordsToDraw.add(textPositionInImage);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Stream<PlacedText> placedTexts = wordsToDraw.stream()
|
|
||||||
.map(word -> new PlacedText(word.getText(),
|
|
||||||
null,
|
|
||||||
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_COLOR : BOLD_TEXT_COLOR,
|
|
||||||
(float) word.getFontSize(),
|
|
||||||
word.getFontMetricsFactory(),
|
|
||||||
Optional.of(word.getTextMatrix()),
|
|
||||||
Optional.of(RenderingMode.FILL)));
|
|
||||||
|
|
||||||
Stream<PlacedText> placedTexts2 = ignoredWords.stream()
|
|
||||||
.map(word -> new PlacedText(word.getText(),
|
|
||||||
null,
|
|
||||||
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_IN_IGNORE_ZONE : BOLD_TEXT_IN_IGNORE_ZONE,
|
|
||||||
(float) word.getFontSize(),
|
|
||||||
word.getFontMetricsFactory(),
|
|
||||||
Optional.of(word.getTextMatrix()),
|
|
||||||
Optional.of(RenderingMode.FILL)));
|
|
||||||
|
|
||||||
return VisualizationsOnPage.builder()
|
|
||||||
.placedTexts(Stream.of(placedTexts, placedTexts2)
|
|
||||||
.flatMap(Function.identity())
|
|
||||||
.toList())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
|
||||||
|
|
||||||
List<TextPositionInImage> words = ocrResultsToWrite.stream()
|
|
||||||
.map(OcrResultToWrite::textPositionInImage)
|
|
||||||
.flatMap(Collection::stream)
|
|
||||||
.toList();
|
|
||||||
List<ColoredLine> coloredLines = Stream.concat(//
|
|
||||||
words.stream()
|
|
||||||
.map(TextPositionInImage::getTransformedTextBBox)
|
|
||||||
.map(this::quadPointAsLines),//
|
|
||||||
ocrResultsToWrite.stream()
|
|
||||||
.map(OcrResultToWrite::imageBoundingBox)
|
|
||||||
.map(this::createGrid)//
|
|
||||||
)
|
|
||||||
.flatMap(Collection::stream)
|
|
||||||
.toList();
|
|
||||||
return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {
|
|
||||||
|
|
||||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
|
|
||||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
|
||||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
|
||||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private List<ColoredLine> createGrid(QuadPoint rect) {
|
private void drawResultsPerPage(PDDocument document, Integer pageNumber, List<OcrResultToWrite> ocrResultToWrite, Set<COSDictionary> dictionariesToUpdate) {
|
||||||
|
|
||||||
List<ColoredLine> lines = new LinkedList<>(quadPointAsLines(rect));
|
var pdPage = document.getPage(pageNumber - 1);
|
||||||
|
|
||||||
|
PDOptionalContentGroup textDebugLayer = new PDOptionalContentGroup(ocrLayerName);
|
||||||
|
PDOptionalContentGroup bBoxDebugLayer = new PDOptionalContentGroup(ocrLayerName + "BBox");
|
||||||
|
if (settings.isDebug()) {
|
||||||
|
textDebugLayer = addOptionalGroup(ocrLayerName, document, pdPage, dictionariesToUpdate);
|
||||||
|
bBoxDebugLayer = addOptionalGroup(ocrLayerName + " BBox", document, pdPage, dictionariesToUpdate);
|
||||||
|
}
|
||||||
|
|
||||||
|
escapeContentStreams(document, pdPage);
|
||||||
|
|
||||||
|
List<TextPositionInImage> words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||||
|
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||||
|
|
||||||
|
// write invisible ocr text inside tagged content
|
||||||
|
contentStream.beginMarkedContent(settings.getOcrMarkedContentTag());
|
||||||
|
contentStream.saveGraphicsState();
|
||||||
|
contentStream.setNonStrokingColor(Color.BLUE);
|
||||||
|
contentStream.setStrokingColor(Color.BLUE);
|
||||||
|
contentStream.setLineWidth(1);
|
||||||
|
words.forEach(word -> drawInvisibleWord(word, contentStream));
|
||||||
|
contentStream.restoreGraphicsState();
|
||||||
|
contentStream.endMarkedContent();
|
||||||
|
|
||||||
|
if (settings.isDebug()) { // must not be written, as it will interfere with layout parsing
|
||||||
|
// write visible ocr text inside optional group
|
||||||
|
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
|
||||||
|
contentStream.saveGraphicsState();
|
||||||
|
words.forEach(word -> drawVisibleWord(word, contentStream));
|
||||||
|
contentStream.restoreGraphicsState();
|
||||||
|
contentStream.endMarkedContent();
|
||||||
|
|
||||||
|
// write word bounding boxes (tesseract output) inside optional group
|
||||||
|
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
|
||||||
|
contentStream.saveGraphicsState();
|
||||||
|
ocrResultToWrite.stream()
|
||||||
|
.map(OcrResultToWrite::imageBoundingBox)
|
||||||
|
.forEach(imagePosition -> drawGrid(contentStream, imagePosition));
|
||||||
|
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
|
||||||
|
contentStream.restoreGraphicsState();
|
||||||
|
contentStream.endMarkedContent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
dictionariesToUpdate.add(pdPage.getCOSObject());
|
||||||
|
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
|
||||||
|
// We need to append to the contentstream, otherwise the content could be overlapped by images
|
||||||
|
// But we also need to save the graphics state before, such that our appended content cannot be affected by previous contentstreams with side-effects, such as not escaped matrix transformations
|
||||||
|
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
|
||||||
|
contentStream.saveGraphicsState();
|
||||||
|
}
|
||||||
|
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
|
||||||
|
contentStream.restoreGraphicsState();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private PDOptionalContentGroup addOptionalGroup(String ocrLayerName, PDDocument document, PDPage pdPage, Set<COSDictionary> dictionariesToUpdate) {
|
||||||
|
|
||||||
|
PDDocumentCatalog catalog = document.getDocumentCatalog();
|
||||||
|
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||||
|
if (ocprops == null) {
|
||||||
|
ocprops = new PDOptionalContentProperties();
|
||||||
|
catalog.setOCProperties(ocprops);
|
||||||
|
}
|
||||||
|
PDOptionalContentGroup layer = null;
|
||||||
|
if (ocprops.hasGroup(ocrLayerName)) {
|
||||||
|
layer = ocprops.getGroup(ocrLayerName);
|
||||||
|
} else {
|
||||||
|
layer = new PDOptionalContentGroup(ocrLayerName);
|
||||||
|
ocprops.addGroup(layer);
|
||||||
|
}
|
||||||
|
|
||||||
|
// enable debug layers by default only when DEBUG flag is set.
|
||||||
|
ocprops.setGroupEnabled(layer, settings.isDebug());
|
||||||
|
PDResources resources = pdPage.getResources();
|
||||||
|
if (resources == null) {
|
||||||
|
resources = new PDResources();
|
||||||
|
pdPage.setResources(resources);
|
||||||
|
}
|
||||||
|
dictionariesToUpdate.add(catalog.getCOSObject());
|
||||||
|
return layer;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private void drawRectangle(PDPageContentStream contentStream, QuadPoint rect) {
|
||||||
|
|
||||||
|
contentStream.saveGraphicsState();
|
||||||
|
contentStream.setLineWidth(1);
|
||||||
|
contentStream.moveTo((float) rect.a().getX(), (float) rect.a().getY());
|
||||||
|
contentStream.lineTo((float) rect.b().getX(), (float) rect.b().getY());
|
||||||
|
contentStream.setStrokingColor(Color.ORANGE);
|
||||||
|
contentStream.stroke();
|
||||||
|
contentStream.moveTo((float) rect.b().getX(), (float) rect.b().getY());
|
||||||
|
contentStream.lineTo((float) rect.c().getX(), (float) rect.c().getY());
|
||||||
|
contentStream.setStrokingColor(Color.BLUE);
|
||||||
|
contentStream.stroke();
|
||||||
|
contentStream.moveTo((float) rect.c().getX(), (float) rect.c().getY());
|
||||||
|
contentStream.lineTo((float) rect.d().getX(), (float) rect.d().getY());
|
||||||
|
contentStream.setStrokingColor(Color.GREEN);
|
||||||
|
contentStream.stroke();
|
||||||
|
contentStream.moveTo((float) rect.d().getX(), (float) rect.d().getY());
|
||||||
|
contentStream.lineTo((float) rect.a().getX(), (float) rect.a().getY());
|
||||||
|
contentStream.setStrokingColor(Color.MAGENTA);
|
||||||
|
contentStream.stroke();
|
||||||
|
contentStream.restoreGraphicsState();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void drawInvisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
|
||||||
|
|
||||||
|
drawWord(word, contentStream, RenderingMode.NEITHER);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void drawVisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
|
||||||
|
|
||||||
|
drawWord(word, contentStream, RenderingMode.FILL);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// @SneakyThrows
|
||||||
|
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
contentStream.setNonStrokingColor(switch (position.getFontStyle()) {
|
||||||
|
case BOLD -> Color.RED;
|
||||||
|
case ITALIC -> Color.GREEN;
|
||||||
|
default -> Color.BLUE;
|
||||||
|
});
|
||||||
|
contentStream.beginText();
|
||||||
|
contentStream.setRenderingMode(renderingMode);
|
||||||
|
contentStream.setFont(position.getFont(), (float) position.getFontSize());
|
||||||
|
contentStream.setTextMatrix(position.getTextMatrix());
|
||||||
|
contentStream.showText(position.getText());
|
||||||
|
contentStream.endText();
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error("Failed to write text {}", position.getText());
|
||||||
|
log.error(e.getMessage());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private void drawGrid(PDPageContentStream contentStream, QuadPoint rect) {
|
||||||
|
|
||||||
|
drawRectangle(contentStream, rect);
|
||||||
|
|
||||||
|
contentStream.saveGraphicsState();
|
||||||
|
contentStream.setStrokingColor(Color.BLACK);
|
||||||
|
contentStream.setLineWidth(0.2F);
|
||||||
int nRows = 8;
|
int nRows = 8;
|
||||||
int nCols = 8;
|
int nCols = 8;
|
||||||
|
|
||||||
@ -226,7 +218,7 @@ public class OcrResultWriter {
|
|||||||
Point2D start = add(rect.a(), abStep);
|
Point2D start = add(rect.a(), abStep);
|
||||||
Point2D end = add(rect.d(), abStep);
|
Point2D end = add(rect.d(), abStep);
|
||||||
for (int row = 0; row < nRows; ++row) {
|
for (int row = 0; row < nRows; ++row) {
|
||||||
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f));
|
drawLine(start, end, contentStream);
|
||||||
start = add(start, abStep);
|
start = add(start, abStep);
|
||||||
end = add(end, abStep);
|
end = add(end, abStep);
|
||||||
}
|
}
|
||||||
@ -234,12 +226,21 @@ public class OcrResultWriter {
|
|||||||
start = add(rect.a(), adStep);
|
start = add(rect.a(), adStep);
|
||||||
end = add(rect.b(), adStep);
|
end = add(rect.b(), adStep);
|
||||||
for (int col = 0; col < nCols; ++col) {
|
for (int col = 0; col < nCols; ++col) {
|
||||||
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f));
|
drawLine(start, end, contentStream);
|
||||||
start = add(start, adStep);
|
start = add(start, adStep);
|
||||||
end = add(end, adStep);
|
end = add(end, adStep);
|
||||||
}
|
}
|
||||||
|
contentStream.restoreGraphicsState();
|
||||||
|
|
||||||
return lines;
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private void drawLine(Point2D a, Point2D b, PDPageContentStream contentStream) {
|
||||||
|
|
||||||
|
contentStream.moveTo((float) a.getX(), (float) a.getY());
|
||||||
|
contentStream.lineTo((float) b.getX(), (float) b.getY());
|
||||||
|
contentStream.stroke();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -4,12 +4,11 @@ import org.apache.pdfbox.pdmodel.font.PDFont;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
|
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||||
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
public interface FontMetricsFactory extends EmbeddableFont {
|
public interface FontMetricsFactory {
|
||||||
|
|
||||||
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {
|
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.fontbox.ttf.GlyphData;
|
import org.apache.fontbox.ttf.GlyphData;
|
||||||
@ -13,63 +15,45 @@ import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import software.amazon.awssdk.services.s3.endpoints.internal.Value;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@AllArgsConstructor
|
|
||||||
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||||
|
|
||||||
private final String resourcePath;
|
private final PDType0Font type0Font;
|
||||||
private PDType0Font type0Font;
|
private final TrueTypeFont trueTypeFont;
|
||||||
private TrueTypeFont trueTypeFont;
|
|
||||||
private PDDocument documentThisIsEmbeddedIn;
|
|
||||||
|
|
||||||
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
|
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
|
||||||
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
|
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public static Type0FontMetricsFactory regular(PDDocument document) {
|
public static Type0FontMetricsFactory regular(PDDocument document) {
|
||||||
|
|
||||||
String resourcePath = "fonts/cmu-regular.ttf";
|
return createFromResource("fonts/cmu-regular.ttf", document);
|
||||||
return createFromResourcePath(resourcePath, document);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public static Type0FontMetricsFactory bold(PDDocument document) {
|
public static Type0FontMetricsFactory bold(PDDocument document) {
|
||||||
|
|
||||||
String resourcePath = "fonts/cmu-bold.ttf";
|
return createFromResource("fonts/cmu-bold.ttf", document);
|
||||||
return createFromResourcePath(resourcePath, document);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@SuppressWarnings("PMD.CloseResource")
|
private static Type0FontMetricsFactory createFromResource(String resourcePath, PDDocument document) {
|
||||||
private static TrueTypeFont readFromResourcePath(String resourcePath) {
|
|
||||||
|
|
||||||
// The ttf is closed with the document, see PDType0Font line 134
|
|
||||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
|
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
|
||||||
return new TTFParser().parse(buffer);
|
TrueTypeFont trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||||
|
PDType0Font type0Font = PDType0Font.load(document, trueTypeFont, true); // use Type0Font for unicode support
|
||||||
|
return new Type0FontMetricsFactory(type0Font, trueTypeFont);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@SuppressWarnings("PMD.CloseResource")
|
|
||||||
private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {
|
|
||||||
|
|
||||||
TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
|
|
||||||
// since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
|
||||||
return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support)
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public HeightAndDescent calculateHeightAndDescent(String text) {
|
public HeightAndDescent calculateHeightAndDescent(String text) {
|
||||||
|
|
||||||
@ -113,28 +97,4 @@ public class Type0FontMetricsFactory implements FontMetricsFactory {
|
|||||||
return type0Font;
|
return type0Font;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
@SneakyThrows
|
|
||||||
public PDFont embed(PDDocument document) {
|
|
||||||
|
|
||||||
if (documentThisIsEmbeddedIn.equals(document)) {
|
|
||||||
return getFont();
|
|
||||||
}
|
|
||||||
|
|
||||||
// no need to close, the font will be closed with the document it is embedded in
|
|
||||||
|
|
||||||
this.trueTypeFont = readFromResourcePath(resourcePath);
|
|
||||||
this.type0Font = PDType0Font.load(document, trueTypeFont, true);
|
|
||||||
this.documentThisIsEmbeddedIn = document;
|
|
||||||
return getFont();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public void close() {
|
|
||||||
|
|
||||||
trueTypeFont.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -49,11 +49,11 @@ public class FontStyleDetector {
|
|||||||
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
|
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
|
||||||
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
|
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
|
||||||
* <p>
|
* <p>
|
||||||
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation.
|
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size.
|
||||||
* But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
|
* But this is based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
|
||||||
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
|
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
|
||||||
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
|
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
|
||||||
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me.
|
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results.
|
||||||
*/
|
*/
|
||||||
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {
|
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {
|
||||||
|
|
||||||
|
|||||||
@ -109,6 +109,7 @@ public class GhostScriptOutputHandler extends Thread {
|
|||||||
if (imageFile == null) {
|
if (imageFile == null) {
|
||||||
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
|
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
|
||||||
}
|
}
|
||||||
|
assert new File(imageFile.absoluteFilePath()).isFile();
|
||||||
renderedPageImageFileOutput.add(imageFile);
|
renderedPageImageFileOutput.add(imageFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -45,7 +45,7 @@ public class ImageProcessingThread extends Thread {
|
|||||||
final BlockingQueue<UnprocessedImage> imageInputQueue;
|
final BlockingQueue<UnprocessedImage> imageInputQueue;
|
||||||
final BlockingQueue<OcrImage> imageOutputQueue;
|
final BlockingQueue<OcrImage> imageOutputQueue;
|
||||||
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||||
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.0f, 1);
|
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.2f, 1);
|
||||||
final Statistics stats;
|
final Statistics stats;
|
||||||
final OcrServiceSettings settings;
|
final OcrServiceSettings settings;
|
||||||
final PDDocument document;
|
final PDDocument document;
|
||||||
@ -107,7 +107,6 @@ public class ImageProcessingThread extends Thread {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SuppressWarnings("PMD.CompareObjectsWithEquals")
|
|
||||||
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
||||||
|
|
||||||
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
||||||
@ -128,7 +127,7 @@ public class ImageProcessingThread extends Thread {
|
|||||||
return ocrImage;
|
return ocrImage;
|
||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("PMD.CompareObjectsWithEquals")
|
|
||||||
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
|
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
|
||||||
|
|
||||||
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
|
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
|
||||||
@ -199,10 +198,8 @@ public class ImageProcessingThread extends Thread {
|
|||||||
grayScale = pix;
|
grayScale = pix;
|
||||||
} else if (pix.d == 32) {
|
} else if (pix.d == 32) {
|
||||||
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||||
LeptUtils.disposePix(pix);
|
|
||||||
} else if (pix.d == 1) {
|
} else if (pix.d == 1) {
|
||||||
grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||||
LeptUtils.disposePix(pix);
|
|
||||||
} else {
|
} else {
|
||||||
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
||||||
}
|
}
|
||||||
@ -211,27 +208,29 @@ public class ImageProcessingThread extends Thread {
|
|||||||
float targetFactor = targetDpi / imageDpi;
|
float targetFactor = targetDpi / imageDpi;
|
||||||
if (targetFactor > 2.1) {
|
if (targetFactor > 2.1) {
|
||||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||||
LeptUtils.disposePix(grayScale);
|
|
||||||
} else if (targetFactor > 1.1) {
|
} else if (targetFactor > 1.1) {
|
||||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||||
LeptUtils.disposePix(grayScale);
|
|
||||||
} else {
|
} else {
|
||||||
scaledUp = grayScale;
|
scaledUp = grayScale;
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove noise and prep for Otsu
|
// remove noise and prep for Otsu
|
||||||
gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
|
gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
|
||||||
LeptUtils.disposePix(scaledUp);
|
|
||||||
|
|
||||||
// Threshold to binary
|
// Threshold to binary
|
||||||
if (pix.w < 100 || pix.h < 100) {
|
if (pix.w < 100 || pix.h < 100) {
|
||||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||||
} else {
|
} else {
|
||||||
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null);
|
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null);
|
||||||
|
|
||||||
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
|
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
|
||||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LeptUtils.disposePix(pix);
|
||||||
|
LeptUtils.disposePix(grayScale);
|
||||||
|
LeptUtils.disposePix(scaledUp);
|
||||||
LeptUtils.disposePix(gaussian);
|
LeptUtils.disposePix(gaussian);
|
||||||
|
|
||||||
return binarized;
|
return binarized;
|
||||||
|
|||||||
@ -21,8 +21,10 @@ public class OcrServiceSettings {
|
|||||||
int minImageWidth = 20; // Minimum width for images to be processed
|
int minImageWidth = 20; // Minimum width for images to be processed
|
||||||
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
|
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
|
||||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||||
|
boolean removeWatermark; // If true, watermarks will be removed
|
||||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||||
boolean boldDetection = true; // if true, bold detection will be attempted
|
boolean boldDetection = true; // if true, bold detection will be attempted
|
||||||
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
|
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -17,57 +17,58 @@ public class PdfDraw {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static void drawGrid(ElementWriter writer, Page page) {
|
public static void drawGrid(ElementWriter writer, Page page) {
|
||||||
|
|
||||||
try (var eb = new ElementBuilder()) {
|
ElementBuilder eb = new ElementBuilder();
|
||||||
double dX = 15;
|
double dX = 15;
|
||||||
double dY = 15;
|
double dY = 15;
|
||||||
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
||||||
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
||||||
for (int row = 0; row < nRows; ++row) {
|
for (int row = 0; row < nRows; ++row) {
|
||||||
for (int col = 0; col < nCols; ++col) {
|
for (int col = 0; col < nCols; ++col) {
|
||||||
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
||||||
cell.setPathStroke(true);
|
cell.setPathStroke(true);
|
||||||
cell.getGState().setLineWidth(1);
|
cell.getGState().setLineWidth(1);
|
||||||
cell.getGState().setStrokeOpacity(0.1);
|
cell.getGState().setStrokeOpacity(0.1);
|
||||||
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
if (row == 0 && col == 0) {
|
if (row == 0 && col == 0) {
|
||||||
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
||||||
cell.setPathFill(true);
|
cell.setPathFill(true);
|
||||||
cell.getGState().setFillOpacity(0.8);
|
cell.getGState().setFillOpacity(0.8);
|
||||||
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
||||||
} else {
|
} else {
|
||||||
cell.setPathFill(false);
|
cell.setPathFill(false);
|
||||||
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
||||||
}
|
|
||||||
writer.writePlacedElement(cell);
|
|
||||||
}
|
}
|
||||||
|
writer.writePlacedElement(cell);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
eb.destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
|
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
|
||||||
|
|
||||||
try (var colorPt = new ColorPt(1, 0, 0); var eb = new ElementBuilder()) {
|
ColorPt colorPt = new ColorPt(1, 0, 0);
|
||||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
ElementBuilder eb = new ElementBuilder();
|
||||||
try(var r = rectCollection.getRectAt(i)) {
|
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||||
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
Rect r = rectCollection.getRectAt(i);
|
||||||
|
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
||||||
|
|
||||||
rect.setPathStroke(true);
|
rect.setPathStroke(true);
|
||||||
rect.getGState().setLineWidth(5);
|
rect.getGState().setLineWidth(5);
|
||||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
rect.getGState().setStrokeColor(colorPt);
|
rect.getGState().setStrokeColor(colorPt);
|
||||||
|
|
||||||
rect.setPathFill(true);
|
rect.setPathFill(true);
|
||||||
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
rect.getGState().setFillColor(colorPt);
|
rect.getGState().setFillColor(colorPt);
|
||||||
rect.getGState().setFillOpacity(0.5);
|
rect.getGState().setFillOpacity(0.5);
|
||||||
|
|
||||||
writer.writePlacedElement(rect);
|
writer.writePlacedElement(rect);
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
colorPt.destroy();
|
||||||
|
eb.destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,25 @@
|
|||||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||||
|
|
||||||
|
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIDelete;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIEnd;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIGetIterator;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIGetStringVariable;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIMeanTextConf;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIProcessPage;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessDeleteResultRenderer;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessHOcrRendererCreate;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessPageIteratorBegin;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessPageIteratorBoundingBox;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessPageIteratorNext;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorConfidence;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorDelete;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorGetPageIterator;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessResultIteratorGetUTF8Text;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessResultRendererBeginDocument;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessResultRendererEndDocument;
|
||||||
|
import static net.sourceforge.tess4j.TessAPI1.TessResultRendererInsert;
|
||||||
|
|
||||||
import java.awt.Rectangle;
|
import java.awt.Rectangle;
|
||||||
import java.nio.IntBuffer;
|
import java.nio.IntBuffer;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -9,20 +29,19 @@ import com.sun.jna.Pointer;
|
|||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import net.sourceforge.lept4j.Pix;
|
import net.sourceforge.lept4j.Pix;
|
||||||
|
import net.sourceforge.tess4j.ITessAPI;
|
||||||
import net.sourceforge.tess4j.OCRResult;
|
import net.sourceforge.tess4j.OCRResult;
|
||||||
import net.sourceforge.tess4j.TessAPI1;
|
import net.sourceforge.tess4j.TessAPI1;
|
||||||
import net.sourceforge.tess4j.Tesseract1;
|
import net.sourceforge.tess4j.Tesseract;
|
||||||
import net.sourceforge.tess4j.TesseractException;
|
import net.sourceforge.tess4j.TesseractException;
|
||||||
import net.sourceforge.tess4j.Word;
|
import net.sourceforge.tess4j.Word;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
/**
|
/**
|
||||||
* Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted.
|
* Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted.
|
||||||
*/
|
*/ public class Tesseract2 extends Tesseract {
|
||||||
public class Tesseract2 extends Tesseract1 {
|
|
||||||
|
|
||||||
|
private int createDocuments(Pix pix, String filename, ITessAPI.TessResultRenderer renderer) {
|
||||||
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
|
|
||||||
|
|
||||||
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
|
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
|
||||||
TessResultRendererBeginDocument(renderer, title);
|
TessResultRendererBeginDocument(renderer, title);
|
||||||
@ -62,7 +81,7 @@ public class Tesseract2 extends Tesseract1 {
|
|||||||
try {
|
try {
|
||||||
for (int i = 0; i < pixs.length; i++) {
|
for (int i = 0; i < pixs.length; i++) {
|
||||||
try {
|
try {
|
||||||
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
ITessAPI.TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
|
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
|
||||||
TessDeleteResultRenderer(renderer);
|
TessDeleteResultRenderer(renderer);
|
||||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
|
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
|
||||||
@ -85,8 +104,8 @@ public class Tesseract2 extends Tesseract1 {
|
|||||||
List<Word> words = new ArrayList<>();
|
List<Word> words = new ArrayList<>();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
|
ITessAPI.TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
|
||||||
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
ITessAPI.TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||||
TessPageIteratorBegin(pi);
|
TessPageIteratorBegin(pi);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
@ -119,9 +138,9 @@ public class Tesseract2 extends Tesseract1 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
|
private ITessAPI.TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
|
||||||
|
|
||||||
TessResultRenderer renderer = null;
|
ITessAPI.TessResultRenderer renderer = null;
|
||||||
|
|
||||||
for (RenderedFormat format : formats) {
|
for (RenderedFormat format : formats) {
|
||||||
switch (format) {
|
switch (format) {
|
||||||
@ -138,4 +157,12 @@ public class Tesseract2 extends Tesseract1 {
|
|||||||
return renderer;
|
return renderer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void dispose() {
|
||||||
|
|
||||||
|
TessBaseAPIEnd(getHandle());
|
||||||
|
TessBaseAPIDelete(getHandle());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,10 +3,10 @@ import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
|
|||||||
plugins {
|
plugins {
|
||||||
application
|
application
|
||||||
id("com.iqser.red.service.java-conventions")
|
id("com.iqser.red.service.java-conventions")
|
||||||
id("org.springframework.boot") version "3.2.3"
|
id("org.springframework.boot") version "3.1.5"
|
||||||
id("io.spring.dependency-management") version "1.1.3"
|
id("io.spring.dependency-management") version "1.1.3"
|
||||||
id("org.sonarqube") version "4.3.0.3225"
|
id("org.sonarqube") version "4.3.0.3225"
|
||||||
id("io.freefair.lombok") version "8.4"
|
id("io.freefair.lombok") version "8.2.2"
|
||||||
}
|
}
|
||||||
|
|
||||||
configurations {
|
configurations {
|
||||||
@ -17,14 +17,14 @@ configurations {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
val springBootStarterVersion = "3.2.3"
|
val springBootStarterVersion = "3.1.5"
|
||||||
|
|
||||||
dependencies {
|
dependencies {
|
||||||
implementation(project(":ocr-service-processor"))
|
implementation(project(":ocr-service-processor"))
|
||||||
implementation(project(":ocr-service-api"))
|
implementation(project(":ocr-service-api"))
|
||||||
|
|
||||||
implementation("com.knecon.fforesight:tracing-commons:0.7.0")
|
implementation("com.knecon.fforesight:tracing-commons:0.3.0")
|
||||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1")
|
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
||||||
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
||||||
|
|
||||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||||
@ -39,7 +39,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
|||||||
|
|
||||||
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
|
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
|
||||||
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
|
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
|
||||||
environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.55.0/Resource/Init/") // set ghostscript lib path, version in path must match version in Aptfile
|
environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.26/Resource/Init/") // set ghostscript lib path
|
||||||
environment.put("BPE_FONTCONFIG_PATH", "/layers/fagiani_apt/apt/etc/fonts/") // set ghostscript fontconfig path
|
environment.put("BPE_FONTCONFIG_PATH", "/layers/fagiani_apt/apt/etc/fonts/") // set ghostscript fontconfig path
|
||||||
|
|
||||||
var aptfile = layout.projectDirectory.file("src/main/resources/Aptfile").toString()
|
var aptfile = layout.projectDirectory.file("src/main/resources/Aptfile").toString()
|
||||||
@ -53,7 +53,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
|
|||||||
|
|
||||||
buildpacks.set(
|
buildpacks.set(
|
||||||
listOf(
|
listOf(
|
||||||
"ghcr.io/knsita/buildpacks/fagiani_apt@sha256:9771d4d27d8050aee62769490b8882fffc794745c129fb98e1f33196e2c93504",
|
"ghcr.io/fagiani/buildpacks/fagiani_apt@sha256:6471c8c70f32b749e29f65ae562ac0339fecad26aa9217628c00a6c31f197dae",
|
||||||
"ghcr.io/kschuettler/knecon-vcpkg@sha256:ba5e967b124de4865ff7e8f565684f752dd6e97b302e2dcf651283f6a19b98b9",
|
"ghcr.io/kschuettler/knecon-vcpkg@sha256:ba5e967b124de4865ff7e8f565684f752dd6e97b302e2dcf651283f6a19b98b9",
|
||||||
"ghcr.io/kschuettler/knecon-tessdata@sha256:9062f728aa0340ac963bcdd6f5e740d683823a81d3f480db894da15bff72691a",
|
"ghcr.io/kschuettler/knecon-tessdata@sha256:9062f728aa0340ac963bcdd6f5e740d683823a81d3f480db894da15bff72691a",
|
||||||
"urn:cnb:builder:paketo-buildpacks/java"
|
"urn:cnb:builder:paketo-buildpacks/java"
|
||||||
|
|||||||
@ -5,28 +5,27 @@ import org.springframework.boot.actuate.autoconfigure.security.servlet.Managemen
|
|||||||
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
||||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
|
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
|
||||||
|
import org.springframework.cloud.openfeign.EnableFeignClients;
|
||||||
import org.springframework.context.annotation.Bean;
|
import org.springframework.context.annotation.Bean;
|
||||||
import org.springframework.context.annotation.EnableAspectJAutoProxy;
|
|
||||||
import org.springframework.context.annotation.Import;
|
import org.springframework.context.annotation.Import;
|
||||||
import org.springframework.scheduling.annotation.EnableAsync;
|
import org.springframework.scheduling.annotation.EnableAsync;
|
||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||||
import com.knecon.fforesight.lifecyclecommons.LifecycleAutoconfiguration;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
|
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
|
||||||
|
import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||||
import com.knecon.fforesight.service.ocr.v1.server.queue.MessagingConfiguration;
|
import com.knecon.fforesight.service.ocr.v1.server.queue.MessagingConfiguration;
|
||||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||||
import com.knecon.fforesight.tracing.OpenTelemetryConfig;
|
|
||||||
|
|
||||||
import io.micrometer.core.aop.TimedAspect;
|
import io.micrometer.core.aop.TimedAspect;
|
||||||
import io.micrometer.core.instrument.MeterRegistry;
|
import io.micrometer.core.instrument.MeterRegistry;
|
||||||
|
|
||||||
@EnableAsync
|
@EnableAsync
|
||||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class, LifecycleAutoconfiguration.class})
|
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||||
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class, OpenTelemetryConfig.class})
|
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class})
|
||||||
@EnableAspectJAutoProxy
|
@EnableFeignClients(basePackageClasses = FileStatusProcessingUpdateClient.class)
|
||||||
public class Application {
|
public class Application {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -0,0 +1,10 @@
|
|||||||
|
package com.knecon.fforesight.service.ocr.v1.server.client;
|
||||||
|
|
||||||
|
import org.springframework.cloud.openfeign.FeignClient;
|
||||||
|
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.internal.resources.FileStatusProcessingUpdateResource;
|
||||||
|
|
||||||
|
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${persistence-service.url}")
|
||||||
|
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {
|
||||||
|
|
||||||
|
}
|
||||||
@ -11,10 +11,35 @@ import lombok.RequiredArgsConstructor;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class MessagingConfiguration {
|
public class MessagingConfiguration {
|
||||||
|
|
||||||
public static final String OCR_REQUEST_QUEUE = "ocr_request_queue";
|
public static final String OCR_QUEUE = "ocrQueue";
|
||||||
public static final String OCR_RESPONSE_QUEUE = "ocr_response_queue";
|
public static final String OCR_DLQ = "ocrDLQ";
|
||||||
|
|
||||||
|
public static final String X_DEAD_LETTER_EXCHANGE = "x-dead-letter-exchange";
|
||||||
|
public static final String X_DEAD_LETTER_ROUTING_KEY = "x-dead-letter-routing-key";
|
||||||
|
public static final String X_MAX_PRIORITY = "x-max-priority";
|
||||||
|
|
||||||
public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";
|
public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";
|
||||||
|
|
||||||
public static final String X_ERROR_INFO_HEADER = "x-error-message";
|
public static final String X_ERROR_INFO_HEADER = "x-error-message";
|
||||||
public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp";
|
public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp";
|
||||||
|
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
public Queue ocrQueue() {
|
||||||
|
|
||||||
|
return QueueBuilder.durable(OCR_QUEUE)
|
||||||
|
.withArgument(X_DEAD_LETTER_EXCHANGE, "")
|
||||||
|
.withArgument(X_DEAD_LETTER_ROUTING_KEY, OCR_DLQ)
|
||||||
|
.withArgument(X_MAX_PRIORITY, 2)
|
||||||
|
.maxPriority(2)
|
||||||
|
.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
public Queue ocrDeadLetterQueue() {
|
||||||
|
|
||||||
|
return QueueBuilder.durable(OCR_DLQ).build();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,42 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
|
||||||
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|
||||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
|
||||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
@ConditionalOnProperty(value = "ocrService.sendStatusUpdates", havingValue = "false")
|
|
||||||
public class NoStatusUpdateOcrMessageSender implements IOcrMessageSender {
|
|
||||||
|
|
||||||
RabbitTemplate rabbitTemplate;
|
|
||||||
|
|
||||||
|
|
||||||
public void sendOcrFinished(String fileId, int totalImages) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void sendOCRStarted(String fileId) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void sendOcrResponse(String dossierId, String fileId) {
|
|
||||||
|
|
||||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, new DocumentRequest(dossierId, fileId));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,8 +1,8 @@
|
|||||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.time.OffsetDateTime;
|
import java.time.OffsetDateTime;
|
||||||
import java.time.temporal.ChronoUnit;
|
import java.time.temporal.ChronoUnit;
|
||||||
|
|
||||||
@ -10,15 +10,17 @@ import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
|||||||
import org.springframework.amqp.core.Message;
|
import org.springframework.amqp.core.Message;
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||||
|
import org.springframework.http.HttpStatus;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
import org.springframework.util.FileSystemUtils;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
|
||||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo;
|
||||||
|
|
||||||
|
import feign.FeignException;
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
@ -30,49 +32,71 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class OcrMessageReceiver {
|
public class OcrMessageReceiver {
|
||||||
|
|
||||||
FileStorageService fileStorageService;
|
FileStorageService fileStorageService;
|
||||||
ObjectMapper objectMapper;
|
ObjectMapper objectMapper;
|
||||||
OCRService ocrService;
|
FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||||
IOcrMessageSender ocrMessageSender;
|
OCRService ocrService;
|
||||||
|
|
||||||
|
|
||||||
@RabbitHandler
|
@RabbitHandler
|
||||||
@RabbitListener(queues = MessagingConfiguration.OCR_REQUEST_QUEUE, concurrency = "1")
|
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
||||||
public void receiveOcr(Message in) throws IOException {
|
public void receiveOcr(Message in) throws IOException {
|
||||||
|
|
||||||
if (in.getMessageProperties().isRedelivered()) {
|
|
||||||
throw new AmqpRejectAndDontRequeueException("Redelivered OCR Request, aborting...");
|
|
||||||
}
|
|
||||||
|
|
||||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class);
|
DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class);
|
||||||
String dossierId = ocrRequestMessage.getDossierId();
|
log.info("--------------------------------------------------------------------------");
|
||||||
String fileId = ocrRequestMessage.getFileId();
|
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
log.info("--------------------------------------------------------------------------");
|
setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId);
|
|
||||||
|
|
||||||
ocrMessageSender.sendOCRStarted(fileId);
|
if (!fileStorageService.untouchedFileExists(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId())) {
|
||||||
|
byte[] originalFile = fileStorageService.getOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
|
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
|
||||||
|
}
|
||||||
|
|
||||||
tmpDir.toFile().mkdirs();
|
try (var transferStream = new ByteArrayOutputStream()) {
|
||||||
File documentFile = tmpDir.resolve("document.pdf").toFile();
|
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream);
|
||||||
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();
|
try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) {
|
||||||
|
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream);
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
fileStorageService.downloadFiles(dossierId, fileId, documentFile, viewerDocumentFile);
|
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
|
|
||||||
ocrService.runOcrOnDocument(dossierId, fileId, ocrRequestMessage.isRemoveWatermark(), tmpDir, documentFile, viewerDocumentFile);
|
|
||||||
|
|
||||||
fileStorageService.storeFiles(dossierId, fileId, documentFile, viewerDocumentFile);
|
|
||||||
|
|
||||||
ocrMessageSender.sendOcrResponse(dossierId, fileId);
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
|
log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
|
||||||
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage());
|
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage());
|
||||||
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
|
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
} finally {
|
}
|
||||||
FileSystemUtils.deleteRecursively(tmpDir);
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@RabbitHandler
|
||||||
|
@RabbitListener(queues = MessagingConfiguration.OCR_DLQ, concurrency = "1")
|
||||||
|
public void receiveOcrDLQ(Message failedMessage) throws IOException {
|
||||||
|
|
||||||
|
DocumentRequest ocrRequestMessage = objectMapper.readValue(failedMessage.getBody(), DocumentRequest.class);
|
||||||
|
log.info("OCR DQL received: {}", ocrRequestMessage);
|
||||||
|
String errorMessage = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_HEADER);
|
||||||
|
OffsetDateTime timestamp = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER);
|
||||||
|
timestamp = timestamp != null ? timestamp : OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS);
|
||||||
|
fileStatusProcessingUpdateClient.ocrFailed(ocrRequestMessage.getDossierId(),
|
||||||
|
ocrRequestMessage.getFileId(),
|
||||||
|
new FileErrorInfo(errorMessage, MessagingConfiguration.OCR_DLQ, "ocr-service", timestamp));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void setStatusOcrProcessing(String dossierId, String fileId) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
fileStatusProcessingUpdateClient.ocrProcessing(dossierId, fileId);
|
||||||
|
} catch (FeignException e) {
|
||||||
|
if (e.status() == HttpStatus.CONFLICT.value()) {
|
||||||
|
throw new AmqpRejectAndDontRequeueException(e.getMessage());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,23 +1,17 @@
|
|||||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||||
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
||||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
|
||||||
import com.knecon.fforesight.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
import com.knecon.fforesight.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||||
|
|
||||||
import jakarta.annotation.PostConstruct;
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@ConditionalOnProperty(value = "ocrService.sendStatusUpdates", havingValue = "true")
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class OcrMessageSender implements IOcrMessageSender {
|
public class OcrMessageSender implements IOcrMessageSender {
|
||||||
|
|
||||||
@ -31,14 +25,6 @@ public class OcrMessageSender implements IOcrMessageSender {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void sendOCRStarted(String fileId) {
|
|
||||||
|
|
||||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
|
||||||
OCRStatusUpdateResponse.builder().fileId(fileId).ocrStarted(true).build());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
|
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
|
||||||
|
|
||||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||||
@ -46,10 +32,4 @@ public class OcrMessageSender implements IOcrMessageSender {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void sendOcrResponse(String dossierId, String fileId) {
|
|
||||||
|
|
||||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, new DocumentRequest(dossierId, fileId));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
# you can list packages
|
# you can list packages
|
||||||
ghostscript=9.55.0~dfsg1-0ubuntu5.9
|
ghostscript
|
||||||
pkg-config
|
pkg-config
|
||||||
zip
|
zip
|
||||||
unzip
|
unzip
|
||||||
@ -11,7 +11,6 @@ libk5crypto3
|
|||||||
libkrb5support0
|
libkrb5support0
|
||||||
libkeyutils1
|
libkeyutils1
|
||||||
libkrb5-3
|
libkrb5-3
|
||||||
libbrotli1
|
|
||||||
|
|
||||||
# or include links to specific .deb files
|
# or include links to specific .deb files
|
||||||
# http://ftp.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb
|
# http://ftp.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb
|
||||||
|
|||||||
@ -12,9 +12,6 @@ project.version: 1.0-SNAPSHOT
|
|||||||
server:
|
server:
|
||||||
port: 8080
|
port: 8080
|
||||||
|
|
||||||
lifecycle:
|
|
||||||
base-package: com.knecon.fforesight.service.ocr
|
|
||||||
|
|
||||||
spring:
|
spring:
|
||||||
application:
|
application:
|
||||||
name: ocr-service
|
name: ocr-service
|
||||||
@ -60,6 +57,3 @@ management:
|
|||||||
endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces}
|
endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces}
|
||||||
|
|
||||||
pdftron.license: ${PDFTRON_LICENSE}
|
pdftron.license: ${PDFTRON_LICENSE}
|
||||||
|
|
||||||
ocrService:
|
|
||||||
sendStatusUpdates: true
|
|
||||||
|
|||||||
@ -6,7 +6,7 @@
|
|||||||
"overrides": [
|
"overrides": [
|
||||||
{
|
{
|
||||||
"name": "tesseract",
|
"name": "tesseract",
|
||||||
"version": "5.3.3"
|
"version": "5.3.2"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "leptonica",
|
"name": "leptonica",
|
||||||
|
|||||||
@ -24,10 +24,10 @@ import org.springframework.context.annotation.Primary;
|
|||||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||||
|
|
||||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||||
|
import com.knecon.fforesight.service.ocr.processor.initializer.PDFNetInitializer;
|
||||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||||
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
|
|
||||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||||
import com.pdftron.pdf.PDFNet;
|
import com.pdftron.pdf.PDFNet;
|
||||||
|
|
||||||
@ -36,7 +36,7 @@ import lombok.SneakyThrows;
|
|||||||
|
|
||||||
@ExtendWith({SpringExtension.class, MockitoExtension.class})
|
@ExtendWith({SpringExtension.class, MockitoExtension.class})
|
||||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||||
@Import({AbstractTest.TestConfiguration.class, NativeLibrariesInitializer.class})
|
@Import({AbstractTest.TestConfiguration.class, PDFNetInitializer.class})
|
||||||
@AutoConfigureObservability
|
@AutoConfigureObservability
|
||||||
public class AbstractTest {
|
public class AbstractTest {
|
||||||
|
|
||||||
|
|||||||
@ -9,7 +9,6 @@ import java.io.FileInputStream;
|
|||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.StandardCopyOption;
|
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
@ -26,7 +25,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
|
||||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
|
|
||||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||||
@ -50,9 +48,9 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testOCRMetrics() {
|
public void testOCRMetrics() {
|
||||||
|
|
||||||
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
|
testOCR("files/Watermark.pdf");
|
||||||
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
|
testOCR("files/Watermark.pdf");
|
||||||
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
|
testOCR("files/Watermark.pdf");
|
||||||
|
|
||||||
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
|
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
|
||||||
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
|
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
|
||||||
@ -81,7 +79,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testMergeImages() {
|
public void testMergeImages() {
|
||||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||||
String text = testOCR("files/syngenta/CustomerFiles/SinglePages/merge_images - Page241_18 Chlorothalonil RAR 08 Volume 3CA B 6a Oct 2017.pdf");
|
String text = testOCR("files/merge_images.pdf");
|
||||||
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
|
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
|
||||||
"Control",
|
"Control",
|
||||||
"mg/g day",
|
"mg/g day",
|
||||||
@ -101,7 +99,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testOCRWatermark() {
|
public void testOCRWatermark() {
|
||||||
|
|
||||||
assertThat(testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf")).contains("syngenta");
|
assertThat(testOCR("files/Watermark.pdf")).contains("syngenta");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -118,17 +116,18 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
private String testOCR(String fileName) {
|
private String testOCR(String fileName) {
|
||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||||
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(Path.of(fileName).getFileName());
|
var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
|
||||||
tmpDir.toFile().mkdirs();
|
try (var fileStream = pdfFileResource.getInputStream()) {
|
||||||
var documentFile = tmpDir.resolve(Path.of("document.pdf"));
|
storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
|
||||||
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf"));
|
}
|
||||||
Files.copy(pdfFileResource.getFile().toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
Files.copy(pdfFileResource.getFile().toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
|
|
||||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", false, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile());
|
Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(fileName).getFileName());
|
||||||
System.out.println("File:" + documentFile);
|
try (var out = new FileOutputStream(tmpFileName.toFile())) {
|
||||||
|
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
|
||||||
|
System.out.println("File:" + tmpFileName);
|
||||||
|
}
|
||||||
|
|
||||||
try (var fileStream = new FileInputStream(documentFile.toFile())) {
|
try (var fileStream = new FileInputStream(tmpFileName.toFile())) {
|
||||||
return extractAllTextFromDocument(fileStream);
|
return extractAllTextFromDocument(fileStream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -167,18 +166,20 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void testOCRForFile(File file) {
|
private void testOCRForFile(File file) {
|
||||||
|
|
||||||
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(file.toPath().getFileName());
|
var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
|
||||||
tmpDir.toFile().mkdirs();
|
try (var fileStream = new FileInputStream(file)) {
|
||||||
var documentFile = tmpDir.resolve(Path.of("document.pdf"));
|
storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
|
||||||
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf"));
|
}
|
||||||
Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
Files.copy(file.toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
|
|
||||||
|
|
||||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", false, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile());
|
Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(file.getAbsolutePath()).getFileName());
|
||||||
System.out.println("File:" + documentFile);
|
try (var out = new FileOutputStream(tmpFileName.toFile())) {
|
||||||
|
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
|
||||||
|
System.out.println("File:" + tmpFileName);
|
||||||
|
}
|
||||||
System.out.println("\n\n");
|
System.out.println("\n\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -15,10 +15,3 @@ management:
|
|||||||
health.enabled: true
|
health.enabled: true
|
||||||
endpoints.web.exposure.include: prometheus, health, metrics
|
endpoints.web.exposure.include: prometheus, health, metrics
|
||||||
metrics.export.prometheus.enabled: true
|
metrics.export.prometheus.enabled: true
|
||||||
tracing:
|
|
||||||
enabled: ${TRACING_ENABLED:false}
|
|
||||||
sampling:
|
|
||||||
probability: ${TRACING_PROBABILITY:1.0}
|
|
||||||
otlp:
|
|
||||||
tracing:
|
|
||||||
endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces}
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605
|
|
||||||
@ -1 +0,0 @@
|
|||||||
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930
|
|
||||||
@ -1,9 +1,5 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
dir=${PWD##*/}
|
dir=${PWD##*/}
|
||||||
|
|
||||||
gradle assemble
|
gradle assemble
|
||||||
|
|
||||||
# Get the current Git branch
|
# Get the current Git branch
|
||||||
@ -15,32 +11,5 @@ commit_hash=$(git rev-parse --short=5 HEAD)
|
|||||||
# Combine branch and commit hash
|
# Combine branch and commit hash
|
||||||
buildName="${USER}-${branch}-${commit_hash}"
|
buildName="${USER}-${branch}-${commit_hash}"
|
||||||
|
|
||||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
|
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
||||||
|
echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"
|
||||||
newImageName="nexus.knecon.com:5001/ff/ocr-service-server:$buildName"
|
|
||||||
|
|
||||||
echo "full image name:"
|
|
||||||
echo ${newImageName}
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
if [ -z "$1" ]; then
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
namespace=${1}
|
|
||||||
deployment_name="ocr-service-v1"
|
|
||||||
|
|
||||||
echo "deploying to ${namespace}"
|
|
||||||
|
|
||||||
oldImageName=$(rancher kubectl -n ${namespace} get deployment ${deployment_name} -o=jsonpath='{.spec.template.spec.containers[*].image}')
|
|
||||||
|
|
||||||
if [ "${newImageName}" = "${oldImageName}" ]; then
|
|
||||||
echo "Image tag did not change, redeploying..."
|
|
||||||
rancher kubectl rollout restart deployment ${deployment_name} -n ${namespace}
|
|
||||||
else
|
|
||||||
echo "upgrading the image tag..."
|
|
||||||
rancher kubectl set image deployment/${deployment_name} ${deployment_name}=${newImageName} -n ${namespace}
|
|
||||||
fi
|
|
||||||
rancher kubectl rollout status deployment ${deployment_name} -n ${namespace}
|
|
||||||
echo "Built ${deployment_name}:${buildName} and deployed to ${namespace}"
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user