CYB-001: Improve OCR-Module performance
This commit is contained in:
parent
948c4bed79
commit
a82676c36b
47
.gitignore
vendored
47
.gitignore
vendored
@ -9,6 +9,49 @@
|
||||
**/tmp/
|
||||
**/.apt_generated/
|
||||
|
||||
HELP.md
|
||||
target/
|
||||
!.mvn/wrapper/maven-wrapper.jar
|
||||
!**/src/main/**/target/
|
||||
!**/src/test/**/target/
|
||||
|
||||
### maven build ###
|
||||
*.class
|
||||
/out/
|
||||
/build/
|
||||
/target/
|
||||
**/out/
|
||||
**/build/
|
||||
**/target/
|
||||
|
||||
### STS ###
|
||||
.apt_generated
|
||||
.classpath
|
||||
.factorypath
|
||||
.project
|
||||
.settings
|
||||
.springBeans
|
||||
.sts4-cache
|
||||
.gradle
|
||||
|
||||
### IntelliJ IDEA ###
|
||||
.idea
|
||||
*.iws
|
||||
*.iml
|
||||
*.ipr
|
||||
|
||||
### NetBeans ###
|
||||
/nbproject/private/
|
||||
/nbbuild/
|
||||
/dist/
|
||||
/nbdist/
|
||||
/.nb-gradle/
|
||||
build/
|
||||
!**/src/main/**/build/
|
||||
!**/src/test/**/build/
|
||||
|
||||
### VS Code ###
|
||||
.vscode/
|
||||
|
||||
.factorypath
|
||||
.springBeans
|
||||
@ -26,3 +69,7 @@
|
||||
**/.DS_Store
|
||||
**/classpath-data.json
|
||||
**/dependencies-and-licenses-overview.txt
|
||||
gradle.properties
|
||||
gradlew
|
||||
gradlew.bat
|
||||
gradle/
|
||||
|
||||
@ -1,6 +1,21 @@
|
||||
variables:
|
||||
SONAR_PROJECT_KEY: 'LM_license-service'
|
||||
include:
|
||||
- project: 'gitlab/gitlab'
|
||||
ref: 'main'
|
||||
file: 'ci-templates/maven_java.yml'
|
||||
file: 'ci-templates/gradle_java.yml'
|
||||
|
||||
deploy:
|
||||
stage: deploy
|
||||
tags:
|
||||
- dind
|
||||
script:
|
||||
- echo "Building with gradle version ${BUILDVERSION}"
|
||||
- gradle -Pversion=${BUILDVERSION} publish
|
||||
- gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
|
||||
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
|
||||
artifacts:
|
||||
reports:
|
||||
dotenv: version.env
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||
- if: $CI_COMMIT_TAG
|
||||
79
README.md
Normal file
79
README.md
Normal file
@ -0,0 +1,79 @@
|
||||
# OCR Service
|
||||
## Overview
|
||||
The OCR service is a tool designed for extracting text content from PDF files. It utilizes Tesseract, Leptonica, PDFTron, PDFBox, and Ghostscript to perform various tasks, including removing invisible elements and watermarks, extracting images, stitching striped images, binarizing images, running OCR on the processed images, and writing the recognized text back to the original PDF. This service is particularly useful for obtaining machine-readable text from PDF documents.
|
||||
|
||||
## Dependencies
|
||||
[Tesseract](https://github.com/tesseract-ocr/tesseract)
|
||||
[Leptonica](http://leptonica.org/)
|
||||
[PDFTron](https://apryse.com/)
|
||||
[PDFBox](https://pdfbox.apache.org/)
|
||||
[Ghostscript](https://www.ghostscript.com/)
|
||||
## Functionality
|
||||
1. Invisible Element and Watermark Removal
|
||||
The service uses PDFTron to attempt the removal of invisible elements and watermarks from the PDF.
|
||||
2. Image Extraction
|
||||
Extracts all images from the PDF using PDFBox
|
||||
3. Striped Image Detection and Stitching
|
||||
Detects if images are striped and stitches them together using Ghostscript.
|
||||
4. Binarization
|
||||
Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm.
|
||||
5. OCR Processing
|
||||
Runs Tesseract on the images to extract text.
|
||||
6. Text Integration
|
||||
Draws the resulting text onto the original PDF using PDFBox.
|
||||
|
||||
Steps 2–5 run in parallel and communicate via a blocking queue to limit RAM usage.
|
||||
Therefore, choosing your thread counts carefully leads to the best performance.
|
||||
For example, with 18 available cores, I achieved the highest performance with 2 image extraction threads, 2 Ghostscript processes, and 16 OCR threads.
|
||||
|
||||
Setting all threads to basically unlimited (1000+) leads to comparable performance without laborious thread tuning, but at the cost of (potentially a lot) more RAM.
|
||||
|
||||
## Installation
|
||||
To run the OCR service, ensure that the following dependencies are installed:
|
||||
|
||||
1. Ghostscript: Install using apt.
|
||||
```bash
|
||||
sudo apt install ghostscript
|
||||
```
|
||||
2. Tesseract and Leptonica: Install both using [vcpkg](https://github.com/microsoft/vcpkg) with the commands below, and set the environment variable `VCPKG_DYNAMIC_LIB` to your vcpkg lib folder (e.g. `~/vcpkg/installed/x64-linux-dynamic/lib`).
|
||||
```bash
|
||||
vcpkg install tesseract --triplet x64-linux-dynamic
|
||||
```
|
||||
```bash
|
||||
vcpkg install leptonica --triplet x64-linux-dynamic
|
||||
```
|
||||
3. Other dependencies are handled by Gradle build
|
||||
```bash
|
||||
gradle build
|
||||
```
|
||||
|
||||
## Configuration
|
||||
Configuration settings are available in the OcrServiceSettings class.
|
||||
These settings can be overridden using environment variables. e.g.
|
||||
`OCR_SERVICE_OCR_THREAD_COUNT=16`
|
||||
|
||||
Possible configurations and their defaults include:
|
||||
|
||||
```java
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int imageExtractThreadCount = 4; // Number of image extraction threads
|
||||
int gsProcessCount = 4; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
int minImageWidth = 20; // Minimum width for images to be processed
|
||||
boolean debug = false; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean removeWatermark; // If false, watermarks will not be removed
|
||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||
```
|
||||
## Integration
|
||||
|
||||
The OCR-service communicates via RabbitMQ and uses the queues `ocrQueue`, `ocrDLQ`, and `ocr_status_update_response_queue`.
|
||||
|
||||
### ocrQueue
|
||||
This queue is used to start the OCR process; a DocumentRequest must be passed as the message. The service will then download the PDF from the provided cloud storage.
|
||||
### ocr_status_update_response_queue
|
||||
This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total count may change when fewer images are found than initially assumed.
|
||||
This queue is also used to signal the end of processing.
|
||||
### ocrDLQ
|
||||
This queue is used to signal an error has occurred during processing.
|
||||
15
buildSrc/build.gradle.kts
Normal file
15
buildSrc/build.gradle.kts
Normal file
@ -0,0 +1,15 @@
|
||||
/*
|
||||
* This file was generated by the Gradle 'init' task.
|
||||
*
|
||||
* This project uses @Incubating APIs which are subject to change.
|
||||
*/
|
||||
|
||||
plugins {
|
||||
// Support convention plugins written in Kotlin. Convention plugins are build scripts in 'src/main' that automatically become available as plugins in the main build.
|
||||
`kotlin-dsl`
|
||||
}
|
||||
|
||||
repositories {
|
||||
// Use the plugin portal to apply community plugins in convention plugins.
|
||||
gradlePluginPortal()
|
||||
}
|
||||
@ -0,0 +1,64 @@
|
||||
plugins {
|
||||
`java-library`
|
||||
pmd
|
||||
checkstyle
|
||||
jacoco
|
||||
}
|
||||
|
||||
group = "com.knecon.fforesight.service"
|
||||
|
||||
|
||||
|
||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||
java.targetCompatibility = JavaVersion.VERSION_17
|
||||
|
||||
// Configure the rule set on each PMD task itself. Inside a task block, `pmd.ruleSetFiles`
// refers to the project-wide `pmd` extension, so assigning it from both blocks makes the
// two assignments overwrite each other and main/test sources can end up sharing one rule set.
tasks.pmdMain {
    ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}

tasks.pmdTest {
    ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
|
||||
|
||||
tasks.named<Test>("test") {
|
||||
useJUnitPlatform()
|
||||
reports {
|
||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||
}
|
||||
}
|
||||
|
||||
tasks.test {
|
||||
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
|
||||
}
|
||||
|
||||
tasks.jacocoTestReport {
|
||||
dependsOn(tasks.test) // tests are required to run before generating the report
|
||||
reports {
|
||||
xml.required.set(true)
|
||||
csv.required.set(false)
|
||||
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
|
||||
}
|
||||
}
|
||||
|
||||
java {
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
repositories {
|
||||
mavenLocal()
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/gindev/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull()
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull()
|
||||
}
|
||||
}
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/PDFTron/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull()
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull()
|
||||
}
|
||||
}
|
||||
mavenCentral()
|
||||
}
|
||||
39
config/checkstyle/checkstyle.xml
Normal file
39
config/checkstyle/checkstyle.xml
Normal file
@ -0,0 +1,39 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
|
||||
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
|
||||
<module name="Checker">
|
||||
<property
|
||||
name="severity"
|
||||
value="error"/>
|
||||
<module name="TreeWalker">
|
||||
<module name="SuppressWarningsHolder"/>
|
||||
<module name="MissingDeprecated"/>
|
||||
<module name="MissingOverride"/>
|
||||
<module name="AnnotationLocation"/>
|
||||
<module name="JavadocStyle"/>
|
||||
<module name="NonEmptyAtclauseDescription"/>
|
||||
<module name="IllegalImport"/>
|
||||
<module name="RedundantImport"/>
|
||||
<module name="RedundantModifier"/>
|
||||
<module name="EmptyBlock"/>
|
||||
<module name="DefaultComesLast"/>
|
||||
<module name="EmptyStatement"/>
|
||||
<module name="EqualsHashCode"/>
|
||||
<module name="ExplicitInitialization"/>
|
||||
<module name="IllegalInstantiation"/>
|
||||
<module name="ModifiedControlVariable"/>
|
||||
<module name="MultipleVariableDeclarations"/>
|
||||
<module name="PackageDeclaration"/>
|
||||
<module name="ParameterAssignment"/>
|
||||
<module name="SimplifyBooleanExpression"/>
|
||||
<module name="SimplifyBooleanReturn"/>
|
||||
<module name="StringLiteralEquality"/>
|
||||
<module name="OneStatementPerLine"/>
|
||||
<module name="FinalClass"/>
|
||||
<module name="ArrayTypeStyle"/>
|
||||
<module name="UpperEll"/>
|
||||
<module name="OuterTypeFilename"/>
|
||||
</module>
|
||||
<module name="FileTabCharacter"/>
|
||||
<module name="SuppressWarningsFilter"/>
|
||||
</module>
|
||||
20
config/pmd/pmd.xml
Normal file
20
config/pmd/pmd.xml
Normal file
@ -0,0 +1,20 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="DataflowAnomalyAnalysis"/>
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
23
config/pmd/test_pmd.xml
Normal file
23
config/pmd/test_pmd.xml
Normal file
@ -0,0 +1,23 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon test ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="DataflowAnomalyAnalysis"/>
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||
<exclude name="AvoidFieldNameMatchingTypeName"/>
|
||||
<exclude name="TestClassWithoutTestCases"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
1
gradle.properties.kts
Normal file
1
gradle.properties.kts
Normal file
@ -0,0 +1 @@
|
||||
version = 4.0-SNAPSHOT
|
||||
Binary file not shown.
@ -1,118 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>platform-docker-dependency</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<relativePath/>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>ocr-service-image-v1</artifactId>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<version>3.0-SNAPSHOT</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<properties>
|
||||
<service.server>ocr-service-server-v1</service.server>
|
||||
<platform.jar>${service.server}.jar</platform.jar>
|
||||
<docker.skip.push>false</docker.skip.push>
|
||||
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>exec-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>docker-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>download-platform-jar</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>copy</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<artifactItems>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>${service.server}</artifactId>
|
||||
<version>${version}</version>
|
||||
<type>jar</type>
|
||||
<overWrite>true</overWrite>
|
||||
<destFileName>${platform.jar}</destFileName>
|
||||
</dependency>
|
||||
</artifactItems>
|
||||
<outputDirectory>${docker.build.directory}</outputDirectory>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>docker-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<images>
|
||||
<image>
|
||||
<name>${docker.image.name}</name>
|
||||
<build>
|
||||
<dockerFileDir>${docker.build.directory}</dockerFileDir>
|
||||
<args>
|
||||
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
|
||||
</args>
|
||||
<tags>
|
||||
<tag>${docker.image.version}</tag>
|
||||
<tag>latest</tag>
|
||||
</tags>
|
||||
</build>
|
||||
</image>
|
||||
</images>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-resources</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>copy-resources</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${basedir}/target/build/libs/</outputDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>libs</directory>
|
||||
<filtering>false</filtering>
|
||||
</resource>
|
||||
</resources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
</project>
|
||||
@ -1,18 +0,0 @@
|
||||
FROM red/base-image:2.0.2
|
||||
|
||||
COPY "libs/pdftron/OCRModuleLinux.tar.gz" .
|
||||
RUN tar xvzf OCRModuleLinux.tar.gz
|
||||
RUN mkdir /OCRModule
|
||||
RUN mv Lib/* /OCRModule/
|
||||
|
||||
RUN apt-get -y update
|
||||
# Ghostscript somehow improves ocr quality using pdfton, do not remove!
|
||||
RUN apt-get -y install ghostscript
|
||||
|
||||
ARG PLATFORM_JAR
|
||||
|
||||
ENV PLATFORM_JAR ${PLATFORM_JAR}
|
||||
|
||||
ENV USES_ELASTICSEARCH false
|
||||
|
||||
COPY ["${PLATFORM_JAR}", "/"]
|
||||
@ -1,33 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>ocr-service-v1</artifactId>
|
||||
<version>3.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>ocr-service-api-v1</artifactId>
|
||||
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<!-- This dependency contains annotations that are used in specifying REST endpoints. -->
|
||||
<!-- It is optional since not all users of this API might use Feign. -->
|
||||
<groupId>io.github.openfeign</groupId>
|
||||
<artifactId>feign-core</artifactId>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
<!-- spring -->
|
||||
<dependency>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-web</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
</project>
|
||||
@ -1,17 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.api.model;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class IdentityTest {
|
||||
|
||||
@Test
|
||||
public void mockTest() {
|
||||
|
||||
int i = 1;
|
||||
assertThat(i).isEqualTo(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
22
ocr-service-v1/ocr-service-api/build.gradle.kts
Normal file
22
ocr-service-v1/ocr-service-api/build.gradle.kts
Normal file
@ -0,0 +1,22 @@
|
||||
plugins {
|
||||
`maven-publish`
|
||||
id("com.iqser.red.service.java-conventions")
|
||||
id("io.freefair.lombok") version "8.2.2"
|
||||
}
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
create<MavenPublication>(name) {
|
||||
from(components["java"])
|
||||
}
|
||||
}
|
||||
repositories {
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,13 +1,14 @@
|
||||
package com.iqser.red.service.ocr.v1.api.model;
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class DocumentRequest {
|
||||
|
||||
protected String dossierId;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.api.model;
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -6,9 +6,9 @@ import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class OCRStatusUpdateResponse {
|
||||
|
||||
private String fileId;
|
||||
28
ocr-service-v1/ocr-service-processor/build.gradle.kts
Normal file
28
ocr-service-v1/ocr-service-processor/build.gradle.kts
Normal file
@ -0,0 +1,28 @@
|
||||
plugins {
|
||||
id("com.iqser.red.service.java-conventions")
|
||||
id("io.freefair.lombok") version "8.2.2"
|
||||
}
|
||||
|
||||
configurations {
|
||||
all {
|
||||
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
|
||||
api("net.sourceforge.tess4j:tess4j:5.8.0")
|
||||
api("com.iqser.red.commons:metric-commons:2.1.0")
|
||||
api("com.iqser.red.commons:storage-commons:2.45.0")
|
||||
api("com.knecon.fforesight:tenant-commons:0.13.0")
|
||||
api("com.pdftron:PDFNet:10.5.0")
|
||||
api("org.apache.pdfbox:pdfbox:3.0.0")
|
||||
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
||||
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
||||
api("io.github.karols:hocr4j:0.1.2")
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.20.0")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
package com.knecon.fforesight.service.ocr.processor;
|
||||
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
/**
 * Spring configuration entry point of the OCR processor module.
 * It enables component scanning and registers {@link OcrServiceSettings} as a
 * configuration-properties class; it declares no beans of its own.
 */
@Configuration
|
||||
@ComponentScan
|
||||
@EnableConfigurationProperties(OcrServiceSettings.class)
|
||||
public class OcrServiceProcessorConfiguration {
|
||||
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
package com.iqser.red.service.ocr.v1.server.initializer;
|
||||
package com.knecon.fforesight.service.ocr.processor.initializer;
|
||||
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.sun.jna.NativeLibrary;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -15,9 +16,6 @@ public class PDFNetInitializer {
|
||||
@Value("${pdftron.license:}")
|
||||
private String pdftronLicense;
|
||||
|
||||
@Value("${pdftron.ocrmodule.path:/tmp}")
|
||||
private String ocrModulePath;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@PostConstruct
|
||||
@ -25,7 +23,7 @@ public class PDFNetInitializer {
|
||||
public void init() {
|
||||
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.addResourceSearchPath(ocrModulePath);
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,150 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.AlphaComposite;
|
||||
import java.awt.Color;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
@Slf4j
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ExtractedOcrImage implements OcrImage {
|
||||
|
||||
final int pageNumber;
|
||||
final Pix pix;
|
||||
final int originalHeight;
|
||||
final int originalWidth;
|
||||
final int height;
|
||||
final int width;
|
||||
final Matrix ctm;
|
||||
final int numberOnPage;
|
||||
|
||||
@Setter
|
||||
int rotationDegrees;
|
||||
|
||||
@SneakyThrows
|
||||
public ExtractedOcrImage(int pageNumber, int numberOnPage, BufferedImage bufferedImage, Matrix ctm, int targetDpi, boolean isGray) {
|
||||
|
||||
this.pageNumber = pageNumber;
|
||||
this.numberOnPage = numberOnPage;
|
||||
this.ctm = ctm;
|
||||
this.originalHeight = bufferedImage.getHeight();
|
||||
this.originalWidth = bufferedImage.getWidth();
|
||||
float imageDPI = Math.abs(bufferedImage.getWidth() / (ctm.getScalingFactorX() / 72));
|
||||
this.pix = binarize(bufferedImage, imageDPI, targetDpi, isGray);
|
||||
this.height = pix.h;
|
||||
this.width = pix.w;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix binarize(BufferedImage image, float imageDpi, int targetDpi, boolean isGray) {
|
||||
|
||||
setAlphaChannelToWhite(image);
|
||||
Pix grayScale = convertToGrayScale(image, isGray);
|
||||
Pix scaledUp = scaleToTargetDpi(imageDpi, targetDpi, grayScale);
|
||||
Pix despeckled = LeptUtils.despeckle(scaledUp, LeptUtils.SEL_STR3, 3);
|
||||
LeptUtils.disposePix(scaledUp);
|
||||
return despeckled;
|
||||
}
|
||||
|
||||
|
||||
private static Pix scaleToTargetDpi(float imageDpi, int targetDpi, Pix grayScale) {
|
||||
|
||||
Pix scaledUp;
|
||||
float targetFactor = targetDpi / imageDpi;
|
||||
|
||||
if (targetFactor > 3) {
|
||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
} else if (targetFactor > 1.9) {
|
||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
} else {
|
||||
scaledUp = grayScale;
|
||||
}
|
||||
return scaledUp;
|
||||
}
|
||||
|
||||
|
||||
private static Pix convertToGrayScale(BufferedImage image, boolean isGray) throws IOException {
|
||||
|
||||
Pix pix = LeptUtils.convertImageToPix(image);
|
||||
Pix grayScale;
|
||||
if (isGray) {
|
||||
grayScale = pix;
|
||||
} else {
|
||||
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
return grayScale;
|
||||
}
|
||||
|
||||
|
||||
private static void setAlphaChannelToWhite(BufferedImage image) {
|
||||
|
||||
if (image.getTransparency() == Transparency.TRANSLUCENT) {
|
||||
// NOTE: For BITMASK images, the color model is likely IndexColorModel,
|
||||
// and this model will contain the "real" color of the transparent parts
|
||||
// which is likely a better fit than unconditionally setting it to white.
|
||||
|
||||
// Fill background with white
|
||||
Graphics2D graphics = image.createGraphics();
|
||||
try {
|
||||
graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
|
||||
graphics.setPaint(Color.WHITE);
|
||||
graphics.fillRect(0, 0, image.getWidth(), image.getHeight());
|
||||
} finally {
|
||||
graphics.dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
AffineTransform affineTransform = ctm.createAffineTransform();
|
||||
|
||||
affineTransform.scale((double) 1 / getWidth(), (double) 1 / getHeight());
|
||||
|
||||
AffineTransform deRotationMatrix = switch (360 - rotationDegrees) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, getHeight(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, getWidth(), getHeight());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, getWidth() - getHeight(), getHeight()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
affineTransform.concatenate(deRotationMatrix);
|
||||
AffineTransform mirrorTransform = new AffineTransform(1, 0, 0, -1, 0, getHeight());
|
||||
affineTransform.concatenate(mirrorTransform);
|
||||
return affineTransform;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getOptimalPageSegmentationMode() {
|
||||
|
||||
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Immutable holder for three font measurements. Lombok generates the getters and an
 * all-arguments constructor taking the fields in declaration order.
 */
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public final class FontMetrics {
|
||||
|
||||
float descent; // descent is the part of the text which is below the baseline, e.g. the lower curve of a 'g'. https://en.wikipedia.org/wiki/Body_height_(typography)
|
||||
// NOTE(review): unit of fontSize is not visible here (presumably PDF points) — confirm against the producer.
float fontSize;
|
||||
// NOTE(review): presumably a multiplier applied to the glyph height — confirm against usage.
float heightScaling;
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
/**
 * Value pair of a height and a descent.
 *
 * @param height  the height value (unit not visible here — NOTE(review): confirm against usage)
 * @param descent the descent value (presumably the part below the baseline — confirm against usage)
 */
public record HeightAndDescent(float height, float descent) {
|
||||
|
||||
}
|
||||
@ -0,0 +1,144 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
public interface OcrImage {
|
||||
|
||||
/**
|
||||
* Retrieves the page number where the OCR image is located. It uses 1-based-index.
|
||||
*
|
||||
* @return The page number where the OCR image is located.
|
||||
*/
|
||||
int getPageNumber();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the number of this image on the page. For full page images this always returns 0.
|
||||
*
|
||||
* @return The number of this image on the page.
|
||||
*/
|
||||
int getNumberOnPage();
|
||||
|
||||
|
||||
int getHeight();
|
||||
|
||||
|
||||
int getWidth();
|
||||
|
||||
|
||||
/**
|
||||
* Gets the outer boundary of the image in image coordinates. (0,0) is the upper left corner, and the height and width are the image size.
|
||||
*
|
||||
* @return the QuadPoint representing the size of the image
|
||||
*/
|
||||
default QuadPoint getImageBounds() {
|
||||
|
||||
// This cannot be solved with a nice rotation matrix: after rotating, the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
|
||||
// So in order to mimic this behavior we need to start with (0,0) coordinates always.
|
||||
if (getRotationDegrees() == 90 || getRotationDegrees() == 270) {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getWidth()), new Point2D.Double(getHeight(), getWidth()), new Point2D.Double(getHeight(), 0));
|
||||
} else {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getHeight()), new Point2D.Double(getWidth(), getHeight()), new Point2D.Double(getWidth(), 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the image coordinates in the PDF by transforming the image bounds using the current transformation matrix (CTM).
|
||||
*
|
||||
* @return The image coordinates as a QuadPoint object.
|
||||
*/
|
||||
default QuadPoint getImageCoordinatesInInitialUserSpace() {
|
||||
|
||||
return getImageBounds().getTransformed(getImageCTM());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotation degree of the OCR image.
|
||||
*
|
||||
* @return The rotation degree of the OCR image.
|
||||
*/
|
||||
int getRotationDegrees();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the optimal page segmentation mode for the OCR image.
|
||||
*
|
||||
* @return The optimal page segmentation mode.
|
||||
*/
|
||||
int getOptimalPageSegmentationMode(); // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
||||
|
||||
|
||||
/**
|
||||
* Sets the rotation degree of the OCR image. The rotation degree specifies the amount of rotation applied to the image.
|
||||
* Currently only quadrant rotations are supported.
|
||||
* Rotated partial images work, due to the CTM present in the pdf working with any rotation.
|
||||
*
|
||||
* @param rotationDegree The rotation degree of the OCR image.
|
||||
*/
|
||||
void setRotationDegrees(int rotationDegree);
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the Leptonica Pix associated with the OCR image.
|
||||
*
|
||||
* @return The Pix object representing the image.
|
||||
*/
|
||||
Pix getPix();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotated image of the OCR image.
|
||||
*
|
||||
* @return The rotated BufferedImage object of the OCR image.
|
||||
*/
|
||||
default Pix getRotatedPix() {
|
||||
|
||||
return switch (360 - getRotationDegrees()) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(getPix(), 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(getPix(), 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(getPix(), 3);
|
||||
default -> getPix();
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
default int getDpi() {
|
||||
|
||||
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the current transformation matrix (CTM). The CTM may be used to transform the image coordinates to Initial User Space coordinates.
|
||||
*
|
||||
* @return The AffineTransform representing the current transformation matrix.
|
||||
*/
|
||||
AffineTransform getImageCTM();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the size (width * height) of the image.
|
||||
*
|
||||
* @return The size of the image.
|
||||
*/
|
||||
default int getImageSize() {
|
||||
|
||||
return getHeight() * getWidth();
|
||||
}
|
||||
|
||||
|
||||
default void destroyPix() {
|
||||
|
||||
LeptUtils.disposePix(getPix());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,33 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
|
||||
public record OcrResult(Image image, String hOcrPageAbsolutePath) {
|
||||
|
||||
public static OcrResult create(OcrImage image, String tesseractResult) {
|
||||
|
||||
return new OcrResult(Image.fromOcrImage(image), tesseractResult);
|
||||
}
|
||||
|
||||
|
||||
public List<Word> getAllWords() {
|
||||
|
||||
return HOcrPageParser.extractHocrPage(hOcrPageAbsolutePath).getAllWords();
|
||||
}
|
||||
|
||||
|
||||
public record Image(Integer pageNumber, AffineTransform ctm, QuadPoint position) {
|
||||
|
||||
public static Image fromOcrImage(OcrImage image) {
|
||||
|
||||
return new Image(image.getPageNumber(), image.getImageCTM(), image.getImageCoordinatesInInitialUserSpace());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,100 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import io.github.karols.hocr4j.Bounds;
|
||||
|
||||
public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
||||
|
||||
/*
|
||||
B _____ C
|
||||
| |
|
||||
A|_____|D
|
||||
*/
|
||||
|
||||
|
||||
public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {
|
||||
|
||||
return new QuadPoint(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()),
|
||||
new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()));
|
||||
}
|
||||
|
||||
|
||||
public static QuadPoint fromBounds(Bounds bounds) {
|
||||
|
||||
return new QuadPoint(new Point2D.Double(bounds.getLeft(), bounds.getBottom()),
|
||||
new Point2D.Double(bounds.getLeft(), bounds.getTop()),
|
||||
new Point2D.Double(bounds.getRight(), bounds.getTop()),
|
||||
new Point2D.Double(bounds.getRight(), bounds.getBottom()));
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getTransformed(AffineTransform at) {
|
||||
|
||||
return new QuadPoint(at.transform(a, null), at.transform(b, null), at.transform(c, null), at.transform(d, null));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the given QuadPoint aligns with this QuadPoint within a given threshold.
|
||||
* It does os by trying every possible combination of aligning sides. It starts with the most likely combination of ab and cd.
|
||||
*
|
||||
* @param other The QuadPoint to compare with.
|
||||
* @param threshold The maximum distance allowed for alignment.
|
||||
* @return True if the QuadPoints align within the threshold, false otherwise.
|
||||
*/
|
||||
public boolean aligns(QuadPoint other, double threshold) {
|
||||
|
||||
Line2D ab = new Line2D.Double(a, b);
|
||||
Line2D bc = new Line2D.Double(b, c);
|
||||
Line2D cd = new Line2D.Double(c, d);
|
||||
Line2D da = new Line2D.Double(d, a);
|
||||
|
||||
Line2D ab2 = new Line2D.Double(other.a, other.b);
|
||||
Line2D bc2 = new Line2D.Double(other.b, other.c);
|
||||
Line2D cd2 = new Line2D.Double(other.c, other.d);
|
||||
Line2D da2 = new Line2D.Double(other.d, other.a);
|
||||
|
||||
List<Line2D> lines = List.of(ab, cd, bc, da);
|
||||
List<Line2D> lines2 = List.of(cd2, ab2, bc2, da2);
|
||||
return lines.stream().anyMatch(line -> lines2.stream().anyMatch(line2 -> aligns(line, line2, threshold)));
|
||||
}
|
||||
|
||||
|
||||
private static boolean aligns(Line2D a, Line2D b, double threshold) {
|
||||
|
||||
return aligns(a.getP1(), a.getP2(), b.getP1(), b.getP2(), threshold);
|
||||
}
|
||||
|
||||
|
||||
private static boolean aligns(Point2D a, Point2D b, Point2D a2, Point2D b2, double threshold) {
|
||||
|
||||
if (a.distance(a2) < threshold && b.distance(b2) < threshold) {
|
||||
return true;
|
||||
}
|
||||
return a.distance(b2) < threshold && b.distance(a2) < threshold;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("A:(%.2f, %.2f) | B:(%.2f, %.2f) | C:(%.2f, %.2f) | D:(%.2f, %.2f)",
|
||||
a().getX(),
|
||||
a().getY(),
|
||||
b().getX(),
|
||||
b().getY(),
|
||||
c().getX(),
|
||||
c().getY(),
|
||||
d().getX(),
|
||||
d().getY());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
/**
 * Reference to the image file of a rendered page.
 *
 * @param pageNumber       the number of the rendered page
 * @param absoluteFilePath the absolute path of the image file
 */
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) {}
|
||||
@ -0,0 +1,129 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class RenderedPageOcrImage implements OcrImage {
|
||||
|
||||
final String absoluteImagePath;
|
||||
final int height;
|
||||
final int width;
|
||||
final PageInformation pageInformation;
|
||||
final Pix pix;
|
||||
@Setter
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public RenderedPageOcrImage(RenderedPageImageFile renderedPageImageFile, PDDocument document) {
|
||||
|
||||
this.pageInformation = PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1));
|
||||
this.absoluteImagePath = renderedPageImageFile.absoluteFilePath();
|
||||
this.pix = Leptonica1.pixRead(absoluteImagePath);
|
||||
this.height = getPix().h;
|
||||
this.width = getPix().w;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getOptimalPageSegmentationMode() {
|
||||
|
||||
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
double scalingFactor = calculateScalingFactor();
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
AffineTransform rotationMatrix = switch (calculateTotalRotation()) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||
// scaling -> mirror -> rotation
|
||||
AffineTransform resultMatrix = new AffineTransform();
|
||||
|
||||
resultMatrix.concatenate(rotationMatrix);
|
||||
resultMatrix.concatenate(mirrorMatrix);
|
||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||
return resultMatrix;
|
||||
}
|
||||
|
||||
|
||||
private int calculateTotalRotation() {
|
||||
|
||||
return (pageInformation.rotationDegrees() + (360 - rotationDegrees)) % 360;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public QuadPoint getImageBounds() {
|
||||
|
||||
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, width), new Point2D.Double(height, width), new Point2D.Double(height, 0));
|
||||
} else {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, height), new Point2D.Double(width, height), new Point2D.Double(width, 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPageNumber() {
|
||||
|
||||
return pageInformation.number();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNumberOnPage() {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
private double calculateScalingFactor() {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees == 90 || pageInformation.rotationDegrees == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / width;
|
||||
}
|
||||
|
||||
|
||||
private record PageInformation(int height, int width, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
return new PageInformation((int) page.getCropBox().getHeight(), (int) page.getCropBox().getWidth(), pageNum, page.getRotation());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,122 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
|
||||
import io.github.karols.hocr4j.Bounds;
|
||||
import io.github.karols.hocr4j.Word;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class TextPositionInImage {
|
||||
|
||||
QuadPoint position;
|
||||
String text;
|
||||
AffineTransform imageCTM;
|
||||
FontMetricsFactory fontMetricsFactory;
|
||||
|
||||
|
||||
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
this.position = QuadPoint.fromBounds(word.getBounds());
|
||||
this.text = word.getText();
|
||||
this.imageCTM = imageCTM;
|
||||
this.fontMetricsFactory = fontMetricsFactory;
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getTransformedTextBBox() {
|
||||
|
||||
return position.getTransformed(imageCTM);
|
||||
}
|
||||
|
||||
|
||||
public PDFont getFont() {
|
||||
|
||||
return fontMetricsFactory.getFont();
|
||||
}
|
||||
|
||||
|
||||
public Matrix getTextMatrix() {
|
||||
|
||||
FontMetrics metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
|
||||
|
||||
// Matrix multiplication is from right to left:
|
||||
// convert to image coords -> subtract descent -> scale height -> reverse imageCTM scaling -> translate to coordinates in image -> convert to pdf coords
|
||||
// width must not be set, since it is scaled with the fontsize attribute
|
||||
|
||||
AffineTransform ctm = new AffineTransform();
|
||||
ctm.concatenate(imageCTM);
|
||||
ctm.translate(position.a().getX(), position.a().getY());
|
||||
ctm.scale(getWidth() / getTransformedWidth(),
|
||||
getHeight() / getTransformedHeight()); // scale with transformation coefficient, such that fontsize may be set with transformed width.
|
||||
ctm.scale(1, metrics.getHeightScaling());
|
||||
ctm.translate(0, metrics.getDescent());
|
||||
ctm.concatenate(new AffineTransform(1, 0, 0, -1, 0, 0)); // start in image coordinates, with (0,0) being top left and negative height.
|
||||
|
||||
return new Matrix(ctm);
|
||||
}
|
||||
|
||||
|
||||
public double getFontSize() {
|
||||
|
||||
return fontMetricsFactory.calculateFontSize(text, getTransformedWidth());
|
||||
}
|
||||
|
||||
|
||||
public double getTransformedWidth() {
|
||||
|
||||
return transformedA().distance(transformedD());
|
||||
}
|
||||
|
||||
|
||||
public double getTransformedHeight() {
|
||||
|
||||
return transformedA().distance(transformedB());
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return position.a().distance(position.d());
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return position.a().distance(position.b());
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedA() {
|
||||
|
||||
return imageCTM.transform(position.a(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedB() {
|
||||
|
||||
return imageCTM.transform(position.b(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedC() {
|
||||
|
||||
return imageCTM.transform(position.c(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedD() {
|
||||
|
||||
return imageCTM.transform(position.d(), null);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
@ -10,7 +10,6 @@ import java.nio.file.StandardOpenOption;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
@ -36,10 +35,9 @@ public class FileStorageService {
|
||||
@SneakyThrows
|
||||
public byte[] getOriginalFile(String dossierId, String fileId) {
|
||||
|
||||
InputStream inputStream = getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN));
|
||||
byte[] bytes = IOUtils.toByteArray(inputStream);
|
||||
inputStream.close();
|
||||
return bytes;
|
||||
try (InputStream inputStream = getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN))) {
|
||||
return IOUtils.toByteArray(inputStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -68,12 +66,6 @@ public class FileStorageService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ImageServiceResponse getImageServiceResponse(String dossierId, String fileId) {
|
||||
|
||||
return storageService.readJSONObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.IMAGE_INFO), ImageServiceResponse.class);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private InputStream getInputStream(String storageId) {
|
||||
|
||||
@ -0,0 +1,157 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
|
||||
public class GhostScriptService {
|
||||
|
||||
static String FORMAT = ".tiff";
|
||||
static String DEVICE = "tiffgray";
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void renderPagesAsImagesBatchedAndAddToQueue(List<Integer> stitchedPageNumbers,
|
||||
String documentAbsolutePath,
|
||||
Path tmpImageDir,
|
||||
PDDocument document,
|
||||
BlockingQueue<OcrImage> imageOutputQueue,
|
||||
Statistics stats) {
|
||||
|
||||
int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
|
||||
numOfProcesses,
|
||||
2 * settings.getOcrThreadCount()); // use 2 times the thread count as batch size, such that GS generates the rendered pages as needed by the OCR Threads
|
||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||
long timestamp = System.currentTimeMillis();
|
||||
List<RenderedPageImageFile> renderedPageImageFiles = Collections.synchronizedList(new LinkedList<>());
|
||||
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
|
||||
|
||||
log.info("Batch {}: Running {} gs processes with ({}) pages each",
|
||||
batchIdx,
|
||||
processInfos.size(),
|
||||
processInfos.stream().map(info -> info.stitchedPageNumbers().size()).map(String::valueOf).collect(Collectors.joining(", ")));
|
||||
|
||||
int finalBatchIdx = batchIdx;
|
||||
List<Process> processes = processInfos.stream()
|
||||
.parallel()
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath, renderedPageImageFiles))
|
||||
.peek(s -> log.debug(String.join(" ", s)))
|
||||
.map(this::executeProcess)
|
||||
.toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
for (Process process : processes) {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx);
|
||||
for (RenderedPageImageFile renderedPageImageFile : renderedPageImageFiles) {
|
||||
OcrImage image = new RenderedPageOcrImage(renderedPageImageFile, document);
|
||||
imageOutputQueue.put(image);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {
|
||||
|
||||
// GhostScript command line can only handle so many page numbers at once, so we split it into batches
|
||||
int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);
|
||||
|
||||
log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
|
||||
List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream().sorted().toList(), processCount, batchCount);
|
||||
|
||||
for (var batch : batchedBalancedSublist) {
|
||||
List<ProcessInfo> processInfos = new ArrayList<>(processCount);
|
||||
for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
|
||||
List<Integer> balancedPageNumbersSubList = batch.get(threadIdx);
|
||||
processInfos.add(new ProcessInfo(threadIdx, balancedPageNumbersSubList));
|
||||
}
|
||||
processInfoBatches.add(processInfos);
|
||||
}
|
||||
return processInfoBatches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private String[] buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath,
|
||||
List<RenderedPageImageFile> fullPageImages) {
|
||||
|
||||
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
|
||||
|
||||
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
|
||||
Integer pageNumber = stitchedImagePageIndices.get(i);
|
||||
fullPageImages.add(new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
}
|
||||
|
||||
StringBuilder sPageList = new StringBuilder();
|
||||
int i = 1;
|
||||
for (Integer integer : stitchedImagePageIndices) {
|
||||
sPageList.append(integer);
|
||||
if (i < stitchedImagePageIndices.size()) {
|
||||
sPageList.append(",");
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Process executeProcess(String[] cmdArgs) {
|
||||
|
||||
Process p = Runtime.getRuntime().exec(cmdArgs);
|
||||
InputStream stdOut = p.getInputStream();
|
||||
ProcessIOLogger stdOutLogger = new ProcessIOLogger(stdOut, "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
InputStream stdError = p.getErrorStream();
|
||||
ProcessIOLogger stdErrorLogger = new ProcessIOLogger(stdError, "GS", ProcessIOLogger.Type.ERROR);
|
||||
|
||||
stdOutLogger.start();
|
||||
stdErrorLogger.start();
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.util.List;
|
||||
|
||||
import io.github.karols.hocr4j.Page;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class HOcrPageParser {
|
||||
|
||||
@SneakyThrows
|
||||
public Page extractHocrPage(String tesseractOutputFileName) {
|
||||
|
||||
String hOcrString;
|
||||
try (var hocrIn = new FileInputStream(tesseractOutputFileName + ".hocr")) {
|
||||
hOcrString = new String(hocrIn.readAllBytes());
|
||||
}
|
||||
return Page.fromHocr(List.of(hOcrString)).get(0);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
public interface IOcrMessageSender {
|
||||
|
||||
void sendUpdate(String fileId, int finishedImages, int totalImages);
|
||||
|
||||
|
||||
void sendOcrFinished(String fileId, int totalImages);
|
||||
|
||||
}
|
||||
@ -0,0 +1,107 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Graphics;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Save;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Getter
|
||||
public class ImageStreamEngine extends PDFStreamEngine {
|
||||
|
||||
private ExtractedOcrImage currentImageOnPage;
|
||||
private List<ExtractedOcrImage> imagesOnCurrentPage;
|
||||
private OcrServiceSettings settings;
|
||||
private int pageNum;
|
||||
|
||||
|
||||
public ImageStreamEngine(OcrServiceSettings settings) {
|
||||
|
||||
this.settings = settings;
|
||||
// preparing PDFStreamEngine
|
||||
addOperator(new Concatenate(this));
|
||||
addOperator(new DrawObject(this));
|
||||
addOperator(new SetGraphicsStateParameters(this));
|
||||
addOperator(new Save(this));
|
||||
addOperator(new Restore(this));
|
||||
addOperator(new SetMatrix(this));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
|
||||
|
||||
String operation = operator.getName();
|
||||
if ("Do".equals(operation)) {
|
||||
COSName objectName = (COSName) operands.get(0);
|
||||
// get the PDF object
|
||||
PDXObject xobject = getResources().getXObject(objectName);
|
||||
|
||||
// check if the object is an image object
|
||||
if (xobject instanceof PDImageXObject imageXObject) {
|
||||
|
||||
if (imageXObject.getWidth() < settings.getMinImageWidth() || imageXObject.getHeight() < settings.getMinImageHeight()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
||||
if (imageXObject.getColorSpace() instanceof PDDeviceRGB) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), false);
|
||||
} else if (imageXObject.getColorSpace() instanceof PDDeviceGray) {
|
||||
BufferedImage image = imageXObject.getImage();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true);
|
||||
} else {
|
||||
BufferedImage pdfImage = imageXObject.getImage();
|
||||
BufferedImage image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(),
|
||||
BufferedImage.TYPE_BYTE_GRAY);
|
||||
Graphics g = image.getGraphics();
|
||||
g.drawImage(pdfImage, 0, 0, null);
|
||||
g.dispose();
|
||||
this.currentImageOnPage = new ExtractedOcrImage(pageNum, imagesOnCurrentPage.size(), image, imageCTM, settings.getDpi(), true);
|
||||
}
|
||||
this.imagesOnCurrentPage.add(this.currentImageOnPage);
|
||||
//imagesOnPages.add(this.currentImageOnPage);
|
||||
} else if (xobject instanceof PDFormXObject) {
|
||||
PDFormXObject form = (PDFormXObject) xobject;
|
||||
showForm(form);
|
||||
}
|
||||
} else {
|
||||
super.processOperator(operator, operands);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void processPage(int pageNum, PDPage page) {
|
||||
|
||||
this.pageNum = pageNum;
|
||||
this.imagesOnCurrentPage = new LinkedList<>();
|
||||
super.processPage(page);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,152 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OCRService {
|
||||
|
||||
FileStorageService fileStorageService;
|
||||
OcrServiceSettings settings;
|
||||
IOcrMessageSender ocrMessageSender;
|
||||
WatermarkRemovalService watermarkRemovalService;
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
OcrResultWriter ocrResultWriter;
|
||||
GhostScriptService ghostScriptService;
|
||||
|
||||
|
||||
/**
|
||||
* Starts the OCR-Process: Collecting images (via threads),
|
||||
* looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
|
||||
* perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
|
||||
*
|
||||
* @param dossierId Id of dossier
|
||||
* @param fileId Id of file
|
||||
* @param out OutputStream where to write to
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) {
|
||||
|
||||
try (InputStream fileStream = removeWatermarkIfEnabled(dossierId, fileId); ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||
|
||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||
|
||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||
log.info("Starting OCR for file {}", fileId);
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
Statistics stats = runOcr(transferInputStream, out, fileId, dossierId);
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||
log.info("Runtime breakdown: {}", stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private InputStream removeWatermarkIfEnabled(String dossierId, String fileId) throws IOException {
|
||||
|
||||
if (settings.isRemoveWatermark()) {
|
||||
try (var in = fileStorageService.getOriginalFileAsStream(dossierId, fileId); var transferOutputStream = new ByteArrayOutputStream()) {
|
||||
watermarkRemovalService.removeWatermarks(in, transferOutputStream);
|
||||
return new ByteArrayInputStream(transferOutputStream.toByteArray());
|
||||
}
|
||||
}
|
||||
return fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Statistics runOcr(InputStream in, OutputStream out, String fileId, String dossierId) {
|
||||
|
||||
long timestamp;
|
||||
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId);
|
||||
Path tmpImageDir = tmpDir.resolve("images");
|
||||
Path tesseractOutputDir = tmpDir.resolve("tesseract_output");
|
||||
|
||||
tesseractOutputDir.toFile().mkdirs();
|
||||
tmpImageDir.toFile().mkdirs();
|
||||
|
||||
File documentFile = OsUtils.writeFileToTmpFolder(in, tmpDir);
|
||||
|
||||
Statistics stats;
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) {
|
||||
OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);
|
||||
|
||||
int numberOfExtractThreads = Math.min(settings.getImageExtractThreadCount(), document.getNumberOfPages());
|
||||
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
|
||||
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
|
||||
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>(numberOfOcrThreads);
|
||||
|
||||
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
|
||||
documentFile,
|
||||
tmpImageDir,
|
||||
numberOfExtractThreads,
|
||||
ghostScriptService,
|
||||
ocrImageQueue,
|
||||
logger,
|
||||
settings,
|
||||
stats);
|
||||
ocrImageFactory.start();
|
||||
|
||||
List<OcrResult> ocrResults = new LinkedList<>();
|
||||
List<OCRThread> ocrThreads = IntStream.range(0, numberOfOcrThreads)
|
||||
.boxed()
|
||||
.map(id -> new OCRThread(id, ocrImageQueue, tesseractOutputDir, ocrResults, logger, stats, settings))
|
||||
.peek(Thread::start)
|
||||
.toList();
|
||||
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
|
||||
ocrImageFactory.join();
|
||||
log.info("Extracted all images, interrupting ocr threads");
|
||||
|
||||
ocrThreads.forEach(Thread::interrupt);
|
||||
for (OCRThread ocrThread : ocrThreads) {
|
||||
ocrThread.join();
|
||||
}
|
||||
|
||||
log.info("OCR processing has finished, writing results");
|
||||
timestamp = System.currentTimeMillis();
|
||||
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, ocrResults);
|
||||
log.info("Saving document");
|
||||
document.saveIncremental(out, dictionariesToUpdate);
|
||||
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
FileSystemUtils.deleteRecursively(tmpDir);
|
||||
logger.sendFinished();
|
||||
return stats;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,89 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrImageFactory {
|
||||
|
||||
PDDocument document;
|
||||
File documentFile;
|
||||
Path tmpImageDir;
|
||||
GhostScriptService ghostScriptService;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
List<ImageExtractionThread> imageExtractionThreads;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
Statistics stats;
|
||||
|
||||
|
||||
public OcrImageFactory(PDDocument document,
|
||||
File documentFile,
|
||||
Path tmpImageDir,
|
||||
int numberOfThreads,
|
||||
GhostScriptService ghostScriptService,
|
||||
BlockingQueue<OcrImage> imageOutputQueue,
|
||||
OcrProgressLogger logger,
|
||||
OcrServiceSettings settings,
|
||||
Statistics stats) {
|
||||
|
||||
this.document = document;
|
||||
this.documentFile = documentFile;
|
||||
this.tmpImageDir = tmpImageDir;
|
||||
this.ghostScriptService = ghostScriptService;
|
||||
this.imageOutputQueue = imageOutputQueue;
|
||||
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
|
||||
this.stats = stats;
|
||||
|
||||
this.imageExtractionThreads = new ArrayList<>(numberOfThreads);
|
||||
|
||||
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
|
||||
for (int i = 0; i < balancedPageNumbers.size(); i++) {
|
||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageOutputQueue, stitchedPageNumbers));
|
||||
}
|
||||
log.info("Started {} image extraction threads, with ({}) pages each",
|
||||
imageExtractionThreads.size(),
|
||||
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
|
||||
}
|
||||
|
||||
|
||||
public void start() {
|
||||
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.start();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void join() {
|
||||
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.join();
|
||||
}
|
||||
if (stitchedPageNumbers.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageOutputQueue, stats);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,91 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrProgressLogger {
|
||||
|
||||
Set<ImageNumberWithPageNumber> imagesToProcess;
|
||||
Set<ImageNumberWithPageNumber> processedImages;
|
||||
IOcrMessageSender ocrMessageSender;
|
||||
|
||||
String fileId;
|
||||
|
||||
|
||||
public OcrProgressLogger(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId) {
|
||||
|
||||
this.ocrMessageSender = ocrMessageSender;
|
||||
this.fileId = fileId;
|
||||
this.imagesToProcess = Collections.synchronizedSet(new HashSet<>(totalPageCount));
|
||||
for (int i = 0; i < totalPageCount; i++) {
|
||||
imagesToProcess.add(new ImageNumberWithPageNumber(0, i + 1));
|
||||
}
|
||||
this.processedImages = Collections.synchronizedSet(new HashSet<>(totalPageCount));
|
||||
}
|
||||
|
||||
|
||||
public void logImageFinished(OcrImage image, int psm) {
|
||||
|
||||
this.processedImages.add(new ImageNumberWithPageNumber(image.getNumberOnPage(), image.getPageNumber()));
|
||||
|
||||
if (image instanceof ExtractedOcrImage) {
|
||||
log.info("{}/{}: Finished image {} on page {} with rotation {}, used PSM {}, quad-point: {}",
|
||||
processedImages.size(),
|
||||
imagesToProcess.size(),
|
||||
image.getNumberOnPage(),
|
||||
image.getPageNumber(),
|
||||
image.getRotationDegrees(),
|
||||
psm,
|
||||
image.getImageCoordinatesInInitialUserSpace());
|
||||
} else {
|
||||
log.info("{}/{}: Finished page {} as fully rendered page with rotation {}, used PSM {}",
|
||||
processedImages.size(),
|
||||
imagesToProcess.size(),
|
||||
image.getPageNumber(),
|
||||
image.getRotationDegrees(),
|
||||
psm);
|
||||
|
||||
}
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedImages.size(), this.imagesToProcess.size());
|
||||
}
|
||||
|
||||
|
||||
public void logPageSkipped(Integer pageIndex) {
|
||||
|
||||
var pageDummy = new ImageNumberWithPageNumber(0, pageIndex);
|
||||
this.imagesToProcess.remove(pageDummy);
|
||||
log.debug("{}/{}: No images to ocr on page {}", processedImages.size(), imagesToProcess.size(), pageIndex);
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedImages.size(), imagesToProcess.size());
|
||||
}
|
||||
|
||||
|
||||
public void addImagesToProcess(int pageNumber, int imageNumber) {
|
||||
|
||||
this.imagesToProcess.add(new ImageNumberWithPageNumber(imageNumber, pageNumber));
|
||||
}
|
||||
|
||||
|
||||
public void sendFinished() {
|
||||
|
||||
log.info("{}/{}: Finished OCR on all images", processedImages.size(), imagesToProcess.size());
|
||||
ocrMessageSender.sendOcrFinished(fileId, imagesToProcess.size());
|
||||
}
|
||||
|
||||
|
||||
private record ImageNumberWithPageNumber(int imageNumber, int pageNumber) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,259 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrResultWriter {
|
||||
|
||||
static String ocrLayerName = "knecon OCR";
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, List<OcrResult> ocrResults) {
|
||||
|
||||
FontMetricsFactory fontMetricsFactory = new Type0FontMetricsFactory(document);
|
||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||
Map<Integer, List<OcrResult>> resultsPerPage = ocrResults.stream().collect(Collectors.groupingBy(result -> result.image().pageNumber()));
|
||||
resultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, resultsPerPage, dictionariesToUpdate, fontMetricsFactory));
|
||||
dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
|
||||
return dictionariesToUpdate;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawResultsPerPage(PDDocument document, Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, Set<COSDictionary> dictionariesToUpdate, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
|
||||
PDOptionalContentGroup textDebugLayer = new PDOptionalContentGroup(ocrLayerName);
|
||||
PDOptionalContentGroup bBoxDebugLayer = new PDOptionalContentGroup(ocrLayerName + "BBox");
|
||||
if (settings.isDebug()) {
|
||||
textDebugLayer = addOptionalGroup(ocrLayerName, document, pdPage, dictionariesToUpdate);
|
||||
bBoxDebugLayer = addOptionalGroup(ocrLayerName + " BBox", document, pdPage, dictionariesToUpdate);
|
||||
}
|
||||
|
||||
escapeContentStreams(document, pdPage);
|
||||
|
||||
List<TextPositionInImage> words = buildTextPositionsOnPage(pageNumber, resultsPerPage, fontMetricsFactory);
|
||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
// write invisible ocr text inside tagged content
|
||||
contentStream.beginMarkedContent(settings.getOcrMarkedContentTag());
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setNonStrokingColor(Color.BLUE);
|
||||
contentStream.setStrokingColor(Color.BLUE);
|
||||
contentStream.setLineWidth(1);
|
||||
words.forEach(word -> drawInvisibleWord(word, contentStream));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
|
||||
if (settings.isDebug()) { // must not be written, as it will interfere with layout parsing
|
||||
// write visible ocr text inside optional group
|
||||
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setNonStrokingColor(Color.BLUE);
|
||||
words.forEach(word -> drawVisibleWord(word, contentStream));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
|
||||
// write word bounding boxes (tesseract output) inside optional group
|
||||
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
resultsPerPage.get(pageNumber).stream().map(OcrResult::image).forEach(image -> drawGrid(contentStream, image.position()));
|
||||
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
}
|
||||
dictionariesToUpdate.add(pdPage.getCOSObject());
|
||||
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionInImage> buildTextPositionsOnPage(Integer pageNumber, Map<Integer, List<OcrResult>> resultsPerPage, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
return resultsPerPage.get(pageNumber)
|
||||
.stream()
|
||||
.flatMap(result -> result.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> new TextPositionInImage(word, result.image().ctm(), fontMetricsFactory)))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
|
||||
// We need to append to the contentstream, otherwise the content could be overlapped by images
|
||||
// But we also need to save the graphics state before, such that our appended content cannot be affected by previous contentstreams with side-effects, such as not escaped matrix transformations
|
||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
|
||||
contentStream.saveGraphicsState();
|
||||
}
|
||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
|
||||
contentStream.restoreGraphicsState();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private PDOptionalContentGroup addOptionalGroup(String ocrLayerName, PDDocument document, PDPage pdPage, Set<COSDictionary> dictionariesToUpdate) {
|
||||
|
||||
PDDocumentCatalog catalog = document.getDocumentCatalog();
|
||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||
if (ocprops == null) {
|
||||
ocprops = new PDOptionalContentProperties();
|
||||
catalog.setOCProperties(ocprops);
|
||||
}
|
||||
PDOptionalContentGroup layer = null;
|
||||
if (ocprops.hasGroup(ocrLayerName)) {
|
||||
layer = ocprops.getGroup(ocrLayerName);
|
||||
} else {
|
||||
layer = new PDOptionalContentGroup(ocrLayerName);
|
||||
ocprops.addGroup(layer);
|
||||
}
|
||||
|
||||
// enable debug layers by default only when DEBUG flag is set.
|
||||
ocprops.setGroupEnabled(layer, settings.isDebug());
|
||||
PDResources resources = pdPage.getResources();
|
||||
if (resources == null) {
|
||||
resources = new PDResources();
|
||||
pdPage.setResources(resources);
|
||||
}
|
||||
dictionariesToUpdate.add(catalog.getCOSObject());
|
||||
return layer;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawRectangle(PDPageContentStream contentStream, QuadPoint rect) {
|
||||
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setLineWidth(1);
|
||||
contentStream.moveTo((float) rect.a().getX(), (float) rect.a().getY());
|
||||
contentStream.lineTo((float) rect.b().getX(), (float) rect.b().getY());
|
||||
contentStream.setStrokingColor(Color.ORANGE);
|
||||
contentStream.stroke();
|
||||
contentStream.moveTo((float) rect.b().getX(), (float) rect.b().getY());
|
||||
contentStream.lineTo((float) rect.c().getX(), (float) rect.c().getY());
|
||||
contentStream.setStrokingColor(Color.BLUE);
|
||||
contentStream.stroke();
|
||||
contentStream.moveTo((float) rect.c().getX(), (float) rect.c().getY());
|
||||
contentStream.lineTo((float) rect.d().getX(), (float) rect.d().getY());
|
||||
contentStream.setStrokingColor(Color.GREEN);
|
||||
contentStream.stroke();
|
||||
contentStream.moveTo((float) rect.d().getX(), (float) rect.d().getY());
|
||||
contentStream.lineTo((float) rect.a().getX(), (float) rect.a().getY());
|
||||
contentStream.setStrokingColor(Color.MAGENTA);
|
||||
contentStream.stroke();
|
||||
contentStream.restoreGraphicsState();
|
||||
}
|
||||
|
||||
|
||||
private void drawInvisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
|
||||
|
||||
drawWord(word, contentStream, RenderingMode.NEITHER);
|
||||
}
|
||||
|
||||
|
||||
private void drawVisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
|
||||
|
||||
drawWord(word, contentStream, RenderingMode.FILL);
|
||||
}
|
||||
|
||||
|
||||
// @SneakyThrows
|
||||
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
|
||||
|
||||
try {
|
||||
contentStream.beginText();
|
||||
contentStream.setRenderingMode(renderingMode);
|
||||
contentStream.setFont(position.getFont(), (float) position.getFontSize());
|
||||
contentStream.setTextMatrix(position.getTextMatrix());
|
||||
contentStream.showText(position.getText());
|
||||
contentStream.endText();
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("Failed to write text {}", position.getText());
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawGrid(PDPageContentStream contentStream, QuadPoint rect) {
|
||||
|
||||
drawRectangle(contentStream, rect);
|
||||
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setStrokingColor(Color.BLACK);
|
||||
contentStream.setLineWidth(0.2F);
|
||||
int nRows = 8;
|
||||
int nCols = 8;
|
||||
|
||||
Point2D abStep = new Point2D.Double((rect.b().getX() - rect.a().getX()) / (nRows + 1), (rect.b().getY() - rect.a().getY()) / (nRows + 1));
|
||||
Point2D start = add(rect.a(), abStep);
|
||||
Point2D end = add(rect.d(), abStep);
|
||||
for (int row = 0; row < nRows; ++row) {
|
||||
drawLine(start, end, contentStream);
|
||||
start = add(start, abStep);
|
||||
end = add(end, abStep);
|
||||
}
|
||||
Point2D adStep = new Point2D.Double((rect.d().getX() - rect.a().getX()) / (nCols + 1), (rect.d().getY() - rect.a().getY()) / (nCols + 1));
|
||||
start = add(rect.a(), adStep);
|
||||
end = add(rect.b(), adStep);
|
||||
for (int col = 0; col < nCols; ++col) {
|
||||
drawLine(start, end, contentStream);
|
||||
start = add(start, adStep);
|
||||
end = add(end, adStep);
|
||||
}
|
||||
contentStream.restoreGraphicsState();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawLine(Point2D a, Point2D b, PDPageContentStream contentStream) {
|
||||
|
||||
contentStream.moveTo((float) a.getX(), (float) a.getY());
|
||||
contentStream.lineTo((float) b.getX(), (float) b.getY());
|
||||
contentStream.stroke();
|
||||
}
|
||||
|
||||
|
||||
private Point2D add(Point2D a, Point2D b) {
|
||||
|
||||
return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,7 +1,10 @@
|
||||
package com.iqser.red.service.ocr.v1.server.utils;
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
@ -24,6 +27,14 @@ public final class OsUtils {
|
||||
return addBackSlashAtEnd(getTemporaryDirectory()) + addBackSlashAtEnd(SERVICE_NAME) + addBackSlashAtEnd(suffix) + addBackSlashAtEnd(fileId);
|
||||
}
|
||||
|
||||
public static File writeFileToTmpFolder(InputStream in, Path tmpDir) throws IOException {
|
||||
|
||||
File pdfFile = tmpDir.resolve("document.pdf").toFile();
|
||||
try (var fileOut = new FileOutputStream(pdfFile)) {
|
||||
fileOut.write(in.readAllBytes());
|
||||
}
|
||||
return pdfFile;
|
||||
}
|
||||
|
||||
private static boolean isWindows() {
|
||||
|
||||
@ -58,8 +69,10 @@ public final class OsUtils {
|
||||
return "/tmp";
|
||||
}
|
||||
|
||||
|
||||
public static String createTmpFileName(String filename, String suffix) {
|
||||
|
||||
return Path.of(OsUtils.getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf");
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,67 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class Statistics {
|
||||
|
||||
List<Long> imageExtraction;
|
||||
List<Long> tesseractDuration;
|
||||
AtomicLong pdf2ImgDuration;
|
||||
AtomicLong writingTextDuration;
|
||||
|
||||
|
||||
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
||||
|
||||
this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L)));
|
||||
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
||||
this.pdf2ImgDuration = new AtomicLong(0);
|
||||
this.writingTextDuration = new AtomicLong(0);
|
||||
}
|
||||
|
||||
|
||||
public void increaseImageExtraction(int threadId, long duration) {
|
||||
|
||||
imageExtraction.set(threadId, imageExtraction.get(threadId) + duration);
|
||||
}
|
||||
|
||||
|
||||
public void increaseTesseractDuration(int threadId, long duration) {
|
||||
|
||||
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
|
||||
}
|
||||
|
||||
|
||||
public void increasePDF2ImgDuration(long duration) {
|
||||
|
||||
pdf2ImgDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
public void increaseWritingTextDuration(long duration) {
|
||||
|
||||
writingTextDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, PDF2Img=%.2f s, writingText=%.2f s",
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
(float) pdf2ImgDuration.get() / 1000,
|
||||
(float) writingTextDuration.get() / 1000);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,41 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
public interface FontMetricsFactory {
|
||||
|
||||
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {
|
||||
|
||||
HeightAndDescent heightAndDescent = calculateHeightAndDescent(text);
|
||||
float fontSize = calculateFontSize(text, textWidth);
|
||||
float heightScaling = (float) ((textHeight / (heightAndDescent.height() - heightAndDescent.descent())) * 1000) / fontSize;
|
||||
|
||||
return new FontMetrics((heightAndDescent.descent() / 1000) * fontSize, fontSize, heightScaling);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default float calculateFontSize(String text, double textWidth) {
|
||||
|
||||
float width;
|
||||
try {
|
||||
width = getFont().getStringWidth(text);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// this means, the font has no glyph for this character
|
||||
width = getFont().getAverageFontWidth() * text.length();
|
||||
}
|
||||
return (float) (textWidth / width) * 1000;
|
||||
}
|
||||
|
||||
|
||||
PDFont getFont();
|
||||
|
||||
HeightAndDescent calculateHeightAndDescent(String text);
|
||||
|
||||
}
|
||||
@ -0,0 +1,77 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
||||
import org.apache.fontbox.ttf.GlyphData;
|
||||
import org.apache.fontbox.ttf.TTFParser;
|
||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||
import org.apache.pdfbox.io.RandomAccessReadBuffer;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||
|
||||
private final PDType0Font type0Font;
|
||||
private final TrueTypeFont trueTypeFont;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Type0FontMetricsFactory(PDDocument document) {
|
||||
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream("fonts/cmu-regular.ttf"); var buffer = new RandomAccessReadBuffer(in)) {
|
||||
this.trueTypeFont = new TTFParser().parse(buffer); // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||
this.type0Font = PDType0Font.load(document, this.trueTypeFont, false); // use Type0Font for unicode support
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public HeightAndDescent calculateHeightAndDescent(String text) {
|
||||
|
||||
byte[] bytes;
|
||||
try {
|
||||
bytes = type0Font.encode(text);
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.warn("The string {} could not be parsed, using average height and descent", text);
|
||||
return new HeightAndDescent(800, -50);
|
||||
}
|
||||
|
||||
ByteArrayInputStream in = new ByteArrayInputStream(bytes);
|
||||
|
||||
float descent = 0;
|
||||
float height = 0;
|
||||
while (in.available() > 0) {
|
||||
try {
|
||||
int code = type0Font.readCode(in);
|
||||
int glyphId = type0Font.codeToGID(code);
|
||||
GlyphData glyph = trueTypeFont.getGlyph().getGlyph(glyphId);
|
||||
if (glyph == null || glyph.getBoundingBox() == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
descent = Math.min(descent, glyph.getYMinimum());
|
||||
height = Math.max(height, glyph.getYMaximum());
|
||||
} catch (Exception e) {
|
||||
log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);
|
||||
}
|
||||
}
|
||||
// some characters like comma or minus return very small height values, while tesseract still returns a normal-sized bounding box and therefore exploding the height scaling factors,
|
||||
// so we need a minimum value. Here, 500 seems optimal for the characters "-", ",", "_"
|
||||
return new HeightAndDescent(Math.max(height, 500), descent);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PDFont getFont() {
|
||||
|
||||
return type0Font;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,106 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageExtractionThread extends Thread {
|
||||
|
||||
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
|
||||
|
||||
int id;
|
||||
@Getter
|
||||
List<Integer> pageIndices;
|
||||
File documentFile;
|
||||
OcrProgressLogger logger;
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
|
||||
// output is written to these lists
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
long timestamp;
|
||||
for (Integer pageIndex : pageIndices) {
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
|
||||
timestamp = System.currentTimeMillis();
|
||||
List<ExtractedOcrImage> extractedOcrImages = getExtractedOcrImages(pageIndex, document);
|
||||
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
|
||||
if (extractedOcrImages.isEmpty()) {
|
||||
logger.logPageSkipped(pageIndex);
|
||||
}
|
||||
|
||||
if (checkForStitchedImages(extractedOcrImages)) {
|
||||
stitchedPageNumbers.add(pageIndex);
|
||||
logger.addImagesToProcess(pageIndex, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (ExtractedOcrImage image : extractedOcrImages) {
|
||||
imageOutputQueue.put(image);
|
||||
logger.addImagesToProcess(image.getPageNumber(), image.getNumberOnPage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<ExtractedOcrImage> getExtractedOcrImages(Integer pageIndex, PDDocument document) {
|
||||
|
||||
PDPage page = document.getPage(pageIndex - 1);
|
||||
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
|
||||
imageStreamEngine.processPage(pageIndex, page);
|
||||
return imageStreamEngine.getImagesOnCurrentPage();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean checkForStitchedImages(List<ExtractedOcrImage> imagesOnCurrentPage) {
|
||||
|
||||
if (imagesOnCurrentPage.size() <= 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
//checking for intersections or direct alignment of images
|
||||
ExtractedOcrImage[] imageOnPagesArray = new ExtractedOcrImage[imagesOnCurrentPage.size()];
|
||||
int index = 0;
|
||||
for (ExtractedOcrImage imageOnPage : imagesOnCurrentPage) {
|
||||
imageOnPagesArray[index] = imageOnPage;
|
||||
index++;
|
||||
}
|
||||
for (int j = 0; j < imageOnPagesArray.length; j++) {
|
||||
for (int i = j + 1; i < imageOnPagesArray.length; i++) {
|
||||
if (imageOnPagesArray[j].getImageCoordinatesInInitialUserSpace().aligns(imageOnPagesArray[i].getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
|
||||
// TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,172 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
import net.sourceforge.tess4j.ITesseract;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OCRThread extends Thread {
|
||||
|
||||
int id;
|
||||
BlockingQueue<OcrImage> imageInputQueue;
|
||||
Path tesseractOutputDir;
|
||||
List<OcrResult> results;
|
||||
OcrProgressLogger logger;
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
Tesseract2 instance;
|
||||
ITessAPI.TessBaseAPI detectionScriptHandle;
|
||||
|
||||
|
||||
public OCRThread(int id,
|
||||
BlockingQueue<OcrImage> imageInputQueue,
|
||||
Path tesseractOutputDir,
|
||||
List<OcrResult> results,
|
||||
OcrProgressLogger logger,
|
||||
Statistics stats,
|
||||
OcrServiceSettings settings) {
|
||||
|
||||
this.id = id;
|
||||
this.imageInputQueue = imageInputQueue;
|
||||
this.tesseractOutputDir = tesseractOutputDir;
|
||||
this.results = results;
|
||||
this.logger = logger;
|
||||
this.stats = stats;
|
||||
this.settings = settings;
|
||||
this.instance = createInstance(settings);
|
||||
this.detectionScriptHandle = initDetectionScriptHandle();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (!isInterrupted()) {
|
||||
try {
|
||||
final OcrImage image = imageInputQueue.take();
|
||||
this.process(image);
|
||||
} catch (InterruptedException e) {
|
||||
// set isInterrupted to true (This exception may only happen during active waiting for queue, and then isInterrupted will not be set!)
|
||||
interrupt();
|
||||
}
|
||||
}
|
||||
// empty the queue
|
||||
try {
|
||||
while (true) {
|
||||
final OcrImage image = imageInputQueue.remove();
|
||||
this.process(image);
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("Processed all Images, finishing.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void process(OcrImage image) {
|
||||
|
||||
long timestamp = System.currentTimeMillis();
|
||||
String tmpOutputFileName = String.format("output_%04d_%04d", image.getPageNumber(), image.getNumberOnPage());
|
||||
String tesseractOutputFileName = tesseractOutputDir.resolve(tmpOutputFileName).toFile().toString();
|
||||
|
||||
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
|
||||
|
||||
int orientDegree = detectOrientation(image);
|
||||
image.setRotationDegrees(orientDegree);
|
||||
Pix rotatedPix = image.getRotatedPix();
|
||||
executeTesseract(psm, image.getDpi(), rotatedPix, tesseractOutputFileName);
|
||||
image.destroyPix();
|
||||
LeptUtils.disposePix(rotatedPix);
|
||||
|
||||
results.add(OcrResult.create(image, tesseractOutputFileName));
|
||||
logger.logImageFinished(image, psm);
|
||||
stats.increaseTesseractDuration(id, System.currentTimeMillis() - timestamp);
|
||||
}
|
||||
|
||||
|
||||
public int detectOrientation(OcrImage image) {
|
||||
|
||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, image.getPix());
|
||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, image.getDpi());
|
||||
|
||||
IntBuffer orient_degB = IntBuffer.allocate(1);
|
||||
FloatBuffer orient_confB = FloatBuffer.allocate(1);
|
||||
PointerByReference script_nameB = new PointerByReference();
|
||||
FloatBuffer script_confB = FloatBuffer.allocate(1);
|
||||
|
||||
int orient_deg = 0;
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle, orient_degB, orient_confB, script_nameB, script_confB);
|
||||
if (result == TRUE) {
|
||||
orient_deg = orient_degB.get();
|
||||
}
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
|
||||
return orient_deg;
|
||||
}
|
||||
|
||||
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
|
||||
return handle;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
||||
|
||||
if (settings.isDebug()) {
|
||||
String[] a = tesseractOutputFileName.split("/");
|
||||
String folder = "/tmp/pixs/" + a[a.length - 3];
|
||||
new File(folder).mkdirs();
|
||||
Leptonica1.pixWrite(folder + "/pix_" + a[a.length - 1] + ".png", pix, 3);
|
||||
}
|
||||
|
||||
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
||||
instance.setPageSegMode(psm);
|
||||
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
||||
}
|
||||
|
||||
|
||||
private static Tesseract2 createInstance(OcrServiceSettings settings) {
|
||||
|
||||
Tesseract2 instance = new Tesseract2();
|
||||
instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
|
||||
instance.setOcrEngineMode(1); // set to LSTM based Engine
|
||||
instance.setLanguage(settings.getLanguages());
|
||||
return instance;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,55 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ProcessIOLogger extends Thread {
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
|
||||
InputStream is;
|
||||
String processName;
|
||||
Type type;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
while (true) {
|
||||
line = br.readLine();
|
||||
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error(processName + "_" + type.name() + ">" + line);
|
||||
} else {
|
||||
log.debug(processName + "_" + type.name() + ">" + line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
}
|
||||
|
||||
|
||||
public enum Type {
|
||||
ERROR,
|
||||
STD_OUT
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,27 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.settings;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@ConfigurationProperties("ocr-service")
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class OcrServiceSettings {
|
||||
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||
int gsProcessCount = 2; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
int minImageWidth = 20; // Minimum width for images to be processed
|
||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean removeWatermark; // If true, watermarks will be removed
|
||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||
|
||||
}
|
||||
@ -0,0 +1,64 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
/**
 * Static helpers that split a list of entries into evenly sized, contiguous sublists, e.g. to distribute pages
 * over worker threads.
 * <p>
 * Written as an explicit utility class (final, private constructor, static methods), so callers use it exactly
 * like before: {@code ListSplittingUtils.buildBalancedSublist(...)}.
 */
public final class ListSplittingUtils {

    private ListSplittingUtils() {

        throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
    }


    /**
     * Splits the numbers {@code 1..totalNumberOfEntries} into {@code threadCount} contiguous sublists.
     *
     * @param totalNumberOfEntries highest number to distribute; values below 1 yield only empty sublists
     * @param threadCount          number of sublists to create
     * @return {@code threadCount} sublists whose sizes differ by at most one
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static List<List<Integer>> buildBalancedContinuousSublist(Integer totalNumberOfEntries, int threadCount) {

        return buildBalancedSublist(IntStream.rangeClosed(1, totalNumberOfEntries).boxed().toList(), threadCount);
    }


    /**
     * Splits {@code entries} into {@code threadCount} contiguous sublists, keeping the original order. The
     * returned sublists are views created with {@link List#subList(int, int)}.
     *
     * @param entries     the entries to distribute
     * @param threadCount number of sublists to create
     * @param <T>         entry type
     * @return {@code threadCount} sublists whose sizes differ by at most one; the larger ones come first
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static <T> List<List<T>> buildBalancedSublist(List<T> entries, int threadCount) {

        List<Integer> balancedEntryCounts = buildBalancedEntryCounts(entries.size(), threadCount);
        List<List<T>> balancedSublist = new ArrayList<>(threadCount);
        int startIdx = 0;
        for (int numberOfEntriesPerThread : balancedEntryCounts) {
            balancedSublist.add(entries.subList(startIdx, startIdx + numberOfEntriesPerThread));
            startIdx += numberOfEntriesPerThread;
        }
        return balancedSublist;
    }


    /**
     * Splits {@code entries} first per thread and then every thread's share into batches, and returns the result
     * indexed as batches -> threads -> entries.
     *
     * @param entries     the entries to distribute
     * @param threadCount number of threads
     * @param batchSize   number of batches every thread's share is divided into (despite its name this is a batch
     *                    count, not a number of entries per batch)
     * @param <T>         entry type
     * @return a list with {@code batchSize} elements, each holding one sublist per thread
     * @throws IllegalArgumentException if a count is negative, or zero while there are entries to distribute
     */
    public static <T> List<List<List<T>>> buildBatchedBalancedSublist(List<T> entries, int threadCount, int batchSize) {

        // threads -> batches -> entries
        List<List<List<T>>> threadsWithBatches = buildBalancedSublist(entries, threadCount).stream().map(list -> buildBalancedSublist(list, batchSize)).toList();

        // swap first two dimensions: batches -> threads -> entries
        List<List<List<T>>> batchedBalancedSubList = new ArrayList<>(batchSize);
        for (int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
            List<List<T>> threadEntriesPerBatch = new ArrayList<>(threadCount);
            for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
                threadEntriesPerBatch.add(threadsWithBatches.get(threadIdx).get(batchIdx));
            }
            batchedBalancedSubList.add(threadEntriesPerBatch);
        }
        return batchedBalancedSubList;
    }


    /**
     * Calculates how many entries every thread receives when {@code totalNumberOfEntries} are distributed
     * round-robin: every thread gets {@code total / threadCount} entries and the first {@code total % threadCount}
     * threads get one more.
     *
     * @param totalNumberOfEntries number of entries to distribute; negative values are treated like zero
     * @param threadCount          number of threads
     * @return a mutable list with one count per thread
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static List<Integer> buildBalancedEntryCounts(int totalNumberOfEntries, int threadCount) {

        if (threadCount < 0 || (threadCount == 0 && totalNumberOfEntries > 0)) {
            throw new IllegalArgumentException("Cannot distribute " + totalNumberOfEntries + " entries over " + threadCount + " threads");
        }

        List<Integer> numberOfPagesPerThread = new ArrayList<>(threadCount);
        if (threadCount == 0) {
            return numberOfPagesPerThread;
        }

        int entries = Math.max(totalNumberOfEntries, 0);
        int baseCount = entries / threadCount;
        int threadsWithExtraEntry = entries % threadCount;
        for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
            numberOfPagesPerThread.add(threadIdx < threadsWithExtraEntry ? baseCount + 1 : baseCount);
        }
        return numberOfPagesPerThread;
    }

}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PdfDpiCalculator {
|
||||
|
||||
public int calculateDpi(QuadPoint imageBounds, AffineTransform imageCTM, double width) {
|
||||
|
||||
QuadPoint transformedImageBounds = imageBounds.getTransformed(imageCTM);
|
||||
double transformedWidth = transformedImageBounds.a().distance(transformedImageBounds.d());
|
||||
double widthInInches = transformedWidth * 1 / 72;
|
||||
return (int) Math.ceil(width / widthInInches);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.server.utils;
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
@ -0,0 +1,138 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.sun.jna.Pointer;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.OCRResult;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
import net.sourceforge.tess4j.Tesseract1;
|
||||
import net.sourceforge.tess4j.TesseractException;
|
||||
import net.sourceforge.tess4j.Word;
|
||||
|
||||
@Slf4j
|
||||
public class Tesseract2 extends Tesseract1 {
|
||||
|
||||
|
||||
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
|
||||
|
||||
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
|
||||
TessResultRendererBeginDocument(renderer, title);
|
||||
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
|
||||
TessResultRendererEndDocument(renderer);
|
||||
|
||||
// if (result == ITessAPI.FALSE) {
|
||||
// throw new TesseractException("Error during processing page.");
|
||||
// }
|
||||
|
||||
return TessBaseAPIMeanTextConf(getHandle());
|
||||
}
|
||||
|
||||
|
||||
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
|
||||
|
||||
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
|
||||
if (!results.isEmpty()) {
|
||||
return results.get(0);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
|
||||
|
||||
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
|
||||
throw new RuntimeException("The three arrays must match in length.");
|
||||
}
|
||||
|
||||
init();
|
||||
setVariables();
|
||||
|
||||
List<OCRResult> results = new ArrayList<OCRResult>();
|
||||
|
||||
try {
|
||||
for (int i = 0; i < pixs.length; i++) {
|
||||
try {
|
||||
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
|
||||
TessDeleteResultRenderer(renderer);
|
||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
|
||||
results.add(new OCRResult(meanTextConfidence, words));
|
||||
} catch (Exception e) {
|
||||
// skip the problematic image file
|
||||
log.warn(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
dispose();
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
private List<Word> getRecognizedWords(int pageIteratorLevel) {
|
||||
|
||||
List<Word> words = new ArrayList<>();
|
||||
|
||||
try {
|
||||
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
|
||||
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||
TessPageIteratorBegin(pi);
|
||||
|
||||
do {
|
||||
Pointer ptr = TessResultIteratorGetUTF8Text(ri, pageIteratorLevel);
|
||||
if (ptr == null) {
|
||||
continue;
|
||||
}
|
||||
String text = ptr.getString(0);
|
||||
TessAPI1.TessDeleteText(ptr);
|
||||
float confidence = TessResultIteratorConfidence(ri, pageIteratorLevel);
|
||||
IntBuffer leftB = IntBuffer.allocate(1);
|
||||
IntBuffer topB = IntBuffer.allocate(1);
|
||||
IntBuffer rightB = IntBuffer.allocate(1);
|
||||
IntBuffer bottomB = IntBuffer.allocate(1);
|
||||
TessPageIteratorBoundingBox(pi, pageIteratorLevel, leftB, topB, rightB, bottomB);
|
||||
int left = leftB.get();
|
||||
int top = topB.get();
|
||||
int right = rightB.get();
|
||||
int bottom = bottomB.get();
|
||||
Word word = new Word(text, confidence, new Rectangle(left, top, right - left, bottom - top));
|
||||
words.add(word);
|
||||
} while (TessPageIteratorNext(pi, pageIteratorLevel) == TRUE);
|
||||
// TessPageIteratorDelete(pi);
|
||||
TessResultIteratorDelete(ri);
|
||||
} catch (Exception e) {
|
||||
log.warn(e.getMessage(), e);
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
|
||||
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
|
||||
|
||||
TessResultRenderer renderer = null;
|
||||
|
||||
for (RenderedFormat format : formats) {
|
||||
switch (format) {
|
||||
|
||||
case HOCR:
|
||||
if (renderer == null) {
|
||||
renderer = TessHOcrRendererCreate(outputbase);
|
||||
} else {
|
||||
TessResultRendererInsert(renderer, TessHOcrRendererCreate(outputbase));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return renderer;
|
||||
}
|
||||
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,29 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@SuppressWarnings("PMD")
|
||||
class Type0FontMetricsFactoryTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testStringWidth() {
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
|
||||
Type0FontMetricsFactory metricsFactory = new Type0FontMetricsFactory(document);
|
||||
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class ListSplittingUtilsTest {
|
||||
|
||||
@Test
|
||||
public void testBalancedListSplitting() {
|
||||
|
||||
int threadCount = 18;
|
||||
int numberOfPages = 48;
|
||||
var balancedList = ListSplittingUtils.buildBalancedContinuousSublist(numberOfPages, threadCount);
|
||||
assertEquals(threadCount, balancedList.size());
|
||||
assertEquals(numberOfPages, balancedList.stream().mapToLong(Collection::size).sum());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,109 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ProcessIOLogger;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
// YOU NEED GHOSTSCRIPT INSTALLED TO RUN THIS TEST!!!!
|
||||
@Disabled
|
||||
public class Pdf2ImgTest {
|
||||
|
||||
private static final int DPI = 150;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@Disabled
|
||||
public void testPDFBox() {
|
||||
|
||||
String outputDir = OsUtils.getTemporaryDirectory("imageOutput", "");
|
||||
new File(outputDir).mkdirs();
|
||||
ClassPathResource resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
try (PDDocument document = Loader.loadPDF(resource.getFile())) {
|
||||
PDFRenderer renderer = new PDFRenderer(document);
|
||||
for (int pageNumber = 0; pageNumber < document.getNumberOfPages(); pageNumber++) {
|
||||
BufferedImage image = renderer.renderImageWithDPI(pageNumber, DPI);
|
||||
boolean written = ImageIOUtil.writeImage(image, "tif", new File(outputDir + String.format("page%04d", pageNumber)).getAbsolutePath(), DPI);
|
||||
System.out.printf("%d: %s%n", pageNumber, written);
|
||||
}
|
||||
}
|
||||
FileSystemUtils.deleteRecursively(new File(outputDir));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGhostScript() {
|
||||
|
||||
String outputDir = "/tmp/ghostscript_out/";
|
||||
new File(outputDir).mkdirs();
|
||||
ClassPathResource resource = new ClassPathResource("files/Cyberport__SD-Faktura-Kopie_(ZRG2)_-_31.08.2020.pdf");
|
||||
|
||||
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiff24nc", "-r" + DPI, "-sOutputFile=" + outputDir + "page%04d", resource.getFile().toString(), "-c", "quit"};
|
||||
Process p = Runtime.getRuntime().exec(cmdArgs);
|
||||
ProcessIOLogger logger = new ProcessIOLogger(p.getInputStream(), "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
logger.start();
|
||||
ProcessIOLogger errorLogger = new ProcessIOLogger(p.getErrorStream(), "GS", ProcessIOLogger.Type.STD_OUT);
|
||||
errorLogger.start();
|
||||
int exitcode = p.waitFor();
|
||||
logger.join();
|
||||
errorLogger.join();
|
||||
System.out.println("Ghostscript finished with exit code " + exitcode);
|
||||
FileSystemUtils.deleteRecursively(new File(outputDir));
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGhostScriptParallel() {
|
||||
|
||||
int numOfProcesses = 5;
|
||||
ClassPathResource resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
String outputDir = "/tmp/ghostscript_out/";
|
||||
List<Process> processes = IntStream.range(0, numOfProcesses).boxed().parallel().map(i -> buildCmdArgs(i, outputDir, resource)).map(Pdf2ImgTest::executeProcess).toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
for (Process process : processes) {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
System.out.println("Ghostscripts finished with exit codes " + processExitCodes);
|
||||
FileSystemUtils.deleteRecursively(new File(outputDir));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static Process executeProcess(String[] cmdArgs) {
|
||||
|
||||
return Runtime.getRuntime().exec(cmdArgs);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static String[] buildCmdArgs(Integer i, String outputDir, ClassPathResource resource) {
|
||||
|
||||
String outDir = outputDir + "/" + i + "/";
|
||||
new File(outDir).mkdirs();
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiffgray", "-r" + DPI, "-sOutputFile=" + outDir + "page%04d", resource.getFile().toString(), "-c", "quit"};
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,163 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>ocr-service-v1</artifactId>
|
||||
<version>3.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>ocr-service-server-v1</artifactId>
|
||||
|
||||
<properties>
|
||||
<tennat-commons.version>0.14.0</tennat-commons.version>
|
||||
<persistence-service.version>2.118.0</persistence-service.version>
|
||||
<pdftron-logic-commons.version>2.21.0</pdftron-logic-commons.version>
|
||||
<storage-commons.version>2.45.0</storage-commons.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>tenant-commons</artifactId>
|
||||
<version>${tennat-commons.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>persistence-service-internal-api-v1</artifactId>
|
||||
<version>${persistence-service.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>ocr-service-api-v1</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>storage-commons</artifactId>
|
||||
<version>${storage-commons.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>pdftron-logic-commons</artifactId>
|
||||
<version>${pdftron-logic-commons.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>spring-commons</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>metric-commons</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.cloud</groupId>
|
||||
<artifactId>spring-cloud-starter-openfeign</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.pdftron</groupId>
|
||||
<artifactId>PDFNet</artifactId>
|
||||
<version>10.1.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-amqp</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.amazonaws</groupId>
|
||||
<artifactId>aws-java-sdk-kms</artifactId>
|
||||
<version>1.12.440</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Test -->
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>test-commons</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.amqp</groupId>
|
||||
<artifactId>spring-rabbit-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-tomcat</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<!-- generate git.properties for exposure in /info -->
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>revision</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<generateGitPropertiesFile>true</generateGitPropertiesFile>
|
||||
<gitDescribe>
|
||||
<tags>true</tags>
|
||||
</gitDescribe>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<annotationProcessors>
|
||||
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
|
||||
</annotationProcessors>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<!-- repackages the generated jar into a runnable fat-jar and makes it
|
||||
executable -->
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>repackage</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<executable>true</executable>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>pdftron</id>
|
||||
<name>PDFNet Maven</name>
|
||||
<url>https://pdftron.com/maven/release</url>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
|
||||
</project>
|
||||
@ -1,10 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.client;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.internal.resources.DossierResource;
|
||||
|
||||
@FeignClient(name = "DossierResource", url = "${persistence-service.url}")
|
||||
public interface DossierClient extends DossierResource {
|
||||
|
||||
}
|
||||
@ -1,10 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.client;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.internal.resources.DossierTemplateResource;
|
||||
|
||||
@FeignClient(name = "DossierTemplateResource", url = "${persistence-service.url}")
|
||||
public interface DossierTemplateClient extends DossierTemplateResource {
|
||||
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model;
|
||||
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class ImagePosition {
|
||||
|
||||
private Rectangle rectangle;
|
||||
private boolean hasTransparency;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Classification {
|
||||
|
||||
private Map<String, Float> probabilities = new HashMap<>();
|
||||
private String label;
|
||||
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class FilterGeometry {
|
||||
|
||||
private ImageSize imageSize;
|
||||
private ImageFormat imageFormat;
|
||||
|
||||
}
|
||||
@ -1,12 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Filters {
|
||||
|
||||
private FilterGeometry geometry;
|
||||
private Probability probability;
|
||||
private boolean allPassed;
|
||||
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Geometry {
|
||||
|
||||
private float width;
|
||||
private float height;
|
||||
|
||||
}
|
||||
@ -1,12 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageFormat {
|
||||
|
||||
private float quotient;
|
||||
private boolean tooTall;
|
||||
private boolean tooWide;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageMetadata {
|
||||
|
||||
private Classification classification;
|
||||
private Position position;
|
||||
private Geometry geometry;
|
||||
private Filters filters;
|
||||
private boolean alpha;
|
||||
|
||||
}
|
||||
@ -1,26 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonAlias;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageServiceResponse {
|
||||
|
||||
private String dossierId;
|
||||
private String fileId;
|
||||
|
||||
@JsonProperty(value = "imageMetadata")
|
||||
@JsonAlias("data")
|
||||
private List<ImageMetadata> data = new ArrayList<>();
|
||||
|
||||
|
||||
@JsonProperty(value = "imageMetadata")
|
||||
@JsonAlias("data")
|
||||
public void setData(List<ImageMetadata> data) {this.data = data;}
|
||||
|
||||
}
|
||||
@ -1,12 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageSize {
|
||||
|
||||
private float quotient;
|
||||
private boolean tooLarge;
|
||||
private boolean tooSmall;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Position {
|
||||
|
||||
private float x1;
|
||||
private float x2;
|
||||
private float y1;
|
||||
private float y2;
|
||||
private int pageNumber;
|
||||
|
||||
}
|
||||
@ -1,10 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Probability {
|
||||
|
||||
private boolean unconfident;
|
||||
|
||||
}
|
||||
@ -1,36 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.client.DossierClient;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.Dossier;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class DossierService {
|
||||
|
||||
DossierClient dossierClient;
|
||||
|
||||
public Dossier getDossier(String dossierId) {
|
||||
|
||||
try {
|
||||
return dossierClient.getDossierById(dossierId, true, false);
|
||||
} catch (FeignException e) {
|
||||
if (e.status() == HttpStatus.NOT_FOUND.value()) {
|
||||
return null;
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,36 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.client.DossierTemplateClient;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.DossierTemplate;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class DossierTemplateService {
|
||||
|
||||
DossierTemplateClient dossierTemplateClient;
|
||||
|
||||
public DossierTemplate getDossierTemplate(String dossierTemplateId) {
|
||||
|
||||
try {
|
||||
return dossierTemplateClient.getDossierTemplateById(dossierTemplateId);
|
||||
} catch (FeignException e) {
|
||||
if (e.status() == HttpStatus.NOT_FOUND.value()) {
|
||||
return null;
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,196 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class ImagePositionRetrievalService {
|
||||
|
||||
private static final double TOLERANCE = 1e-1;
|
||||
|
||||
// any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf
|
||||
private static final int PIXEL_THRESHOLD = 10;
|
||||
|
||||
|
||||
/**
|
||||
* Iterates over all elements in a PDF Document and retrieves the bounding box for each image, that is larger than the pixel threshold of 10 in either dimension.
|
||||
* Then it adjusts the bounding boxes for the page rotation.
|
||||
* If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule.
|
||||
*
|
||||
* @param pdfDoc a PDF File as PDFTron PDFDoc class
|
||||
* @param mirrorY if this flag is set, all coordinates are calculated with upper left corner as (0,0), else initial user space
|
||||
* @return a map with the page indices as keys and the image bounding boxes on that page as a RectCollection
|
||||
*/
|
||||
@SneakyThrows
|
||||
public Map<Integer, RectCollection> getImagePositionPerPage(PDFDoc pdfDoc, boolean mirrorY) {
|
||||
|
||||
Map<Integer, RectCollection> pageIdToImagePositions = new HashMap<>();
|
||||
ElementReader reader = new ElementReader();
|
||||
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
|
||||
RectCollection imagePositions = new RectCollection();
|
||||
|
||||
reader.begin(pdfDoc.getPage(pageId));
|
||||
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
||||
imagePositions = mergeOverlappingRects(imagePositions);
|
||||
reader.end();
|
||||
|
||||
if (imagePositions.getNumRects() > 0) {
|
||||
pageIdToImagePositions.put(pageId, imagePositions);
|
||||
}
|
||||
}
|
||||
reader.destroy();
|
||||
return pageIdToImagePositions;
|
||||
}
|
||||
|
||||
|
||||
private void findImagePositionsOnPage(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
Element element;
|
||||
while ((element = reader.next()) != null) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> {
|
||||
// see everyPointInDashedLineIsImage.pdf TestFile
|
||||
if (element.getImageHeight() > PIXEL_THRESHOLD || element.getImageWidth() > PIXEL_THRESHOLD) {
|
||||
imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
|
||||
}
|
||||
}
|
||||
case Element.e_form -> {
|
||||
reader.formBegin();
|
||||
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
|
||||
|
||||
if (imagePositions.getNumRects() < 2) {
|
||||
return imagePositions;
|
||||
}
|
||||
|
||||
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
|
||||
|
||||
mergeRectangleList(rectangleList);
|
||||
|
||||
return toRectCollection(rectangleList);
|
||||
}
|
||||
|
||||
|
||||
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
|
||||
private void mergeRectangleList(List<Rectangle2D> rectangleList) {
|
||||
|
||||
for (int idx = 0; rectangleList.size() >= idx + 2; ) {
|
||||
|
||||
var rect1 = rectangleList.get(idx);
|
||||
var rect2 = rectangleList.get(idx + 1);
|
||||
|
||||
if (intersects(rect1, rect2) && isAlignedXOrY(rect1, rect2)) {
|
||||
rectangleList.remove(idx + 1);
|
||||
rectangleList.remove(idx);
|
||||
rectangleList.add(idx, rect1.createUnion(rect2));
|
||||
} else {
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean intersects(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
return rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
||||
}
|
||||
|
||||
|
||||
private boolean isAlignedXOrY(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
||||
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
||||
|
||||
return isAlignedX || isAlignedY;
|
||||
}
|
||||
|
||||
|
||||
private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
int rotation = page.getRotation();
|
||||
double height = page.getPageHeight();
|
||||
double width = page.getPageWidth();
|
||||
|
||||
// Even though PDFTron almost always has the origin in the lower left corner, for some reason, the OCRModule's addTextZonesForPage() uses the upper left corner as origin...
|
||||
Matrix2D mirrorMatrix;
|
||||
if (mirrorY) {
|
||||
mirrorMatrix = new Matrix2D(1, 0, 0, -1, 0, height);
|
||||
} else {
|
||||
mirrorMatrix = new Matrix2D();
|
||||
}
|
||||
|
||||
// We need to rotate the rects to fit to the page rotation
|
||||
Matrix2D rotationMatrix = switch (rotation) {
|
||||
case 1 -> new Matrix2D(0, -1, 1, 0, 0, height);
|
||||
case 2 -> new Matrix2D(-1, 0, 0, -1, width, height);
|
||||
case 3 -> new Matrix2D(0, 1, -1, 0, width, 0);
|
||||
default -> new Matrix2D();
|
||||
};
|
||||
|
||||
Matrix2D finalMatrix = mirrorMatrix.multiply(rotationMatrix);
|
||||
|
||||
Point2D.Double p1 = finalMatrix.multPoint(bbox.getX1(), bbox.getY1());
|
||||
Point2D.Double p2 = finalMatrix.multPoint(bbox.getX2(), bbox.getY2());
|
||||
|
||||
// PDFTron Rect *needs* lower left and upper right coordinates to calculate width and height correctly, even though the documentation states otherwise
|
||||
Point2D.Double lowerLeft = new Point2D.Double(Math.min(p1.x, p2.x), Math.min(p1.y, p2.y));
|
||||
Point2D.Double upperRight = new Point2D.Double(Math.max(p1.x, p2.x), Math.max(p1.y, p2.y));
|
||||
|
||||
return new Rect(lowerLeft.x, lowerLeft.y, upperRight.x, upperRight.y);
|
||||
}
|
||||
|
||||
|
||||
private RectCollection toRectCollection(List<Rectangle2D> rectangleList) {
|
||||
|
||||
RectCollection rectCollection = new RectCollection();
|
||||
rectangleList.forEach(r -> {
|
||||
try {
|
||||
rectCollection.addRect(new Rect(r.getMinX(), r.getMinY(), r.getMaxX(), r.getMaxY()));
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
return rectCollection;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<Rectangle2D> toSortedRectangleList(RectCollection rectCollection) {
|
||||
|
||||
List<Rectangle2D> list = new LinkedList<>();
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
Rect r = rectCollection.getRectAt(i);
|
||||
list.add(new Rectangle2D.Double(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()));
|
||||
}
|
||||
list.sort(Comparator.comparingDouble(RectangularShape::getMinY).thenComparing(RectangularShape::getMinX));
|
||||
return list;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,205 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.DossierTemplate;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.Dossier;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import io.micrometer.core.annotation.Timed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class OCRService {
|
||||
|
||||
public static final String ENGLISH = "eng";
|
||||
|
||||
private final FileStorageService fileStorageService;
|
||||
private final OcrServiceSettings settings;
|
||||
|
||||
private final RabbitTemplate rabbitTemplate;
|
||||
|
||||
private final WatermarkRemovalService watermarkRemovalService;
|
||||
|
||||
private final InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
private final ImagePositionRetrievalService imagePositionRetrievalService;
|
||||
|
||||
private final DossierService dossierService;
|
||||
|
||||
private final DossierTemplateService dossierTemplateService;
|
||||
|
||||
|
||||
/**
|
||||
* First loads the PDF Document from storage.
|
||||
* Then removes all invisible Elements from the PDF, check InvisibleElementRemovalService for details.
|
||||
* Then gets Image Position Information, check ImagePositionRetrievalService for details.
|
||||
* Then runs OCR page by page, exclusively on pages which have images on them. It does so, by creating a new PDFDoc and inserting a single page at a time.
|
||||
* This is because PDFTron OCROptions overlays all regions where OCR should not be run with white images. It does not check for empty pages.
|
||||
* For Documents with many pages but few Images this results in major performance improvements.
|
||||
* It then re-adds the OCRed Pages to the original document and saves it.
|
||||
*
|
||||
* @param dossierId The dossier id
|
||||
* @param fileId The file id
|
||||
* @param out OutputStream to write the file to
|
||||
*/
|
||||
@Timed("redactmanager_runOcrOnDocument")
|
||||
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) throws IOException {
|
||||
|
||||
ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream();
|
||||
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
|
||||
try {
|
||||
Dossier dossier = dossierService.getDossier(dossierId);
|
||||
DossierTemplate dossierTemplate = dossierTemplateService.getDossierTemplate(dossier.getDossierTemplateId());
|
||||
|
||||
if (dossierTemplate.isRemoveWatermark()) {
|
||||
watermarkRemovalService.removeWatermarks(fileStream, transferOutputStream);
|
||||
fileStream.close();
|
||||
fileStream = new ByteArrayInputStream(transferOutputStream.toByteArray());
|
||||
transferOutputStream.close();
|
||||
transferOutputStream = new ByteArrayOutputStream();
|
||||
}
|
||||
|
||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||
|
||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
runOcr(transferInputStream, out, fileId);
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||
}
|
||||
|
||||
} finally {
|
||||
fileStream.close();
|
||||
transferOutputStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void runOcr(InputStream fileStream, OutputStream out, String fileId) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).build());
|
||||
|
||||
// Optimization:
|
||||
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
|
||||
// So, we need to remove pages without images.
|
||||
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
|
||||
// Therefore, we create a new Document with a single page for every page that contains an image.
|
||||
// For some reason, if we insert the OCR singlePageDoc into the original PDFDoc inside the loop, for some documents, the RAM Usage increases exponentially with every page.
|
||||
// This is why, we replace the OCRed Pages outside the main loop.
|
||||
int numProcessedPages = 0;
|
||||
Map<Integer, PDFDoc> pageIdToSinglePagePdfDoc = new HashMap<>();
|
||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||
try {
|
||||
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
|
||||
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
|
||||
|
||||
log.info("{}/{} Page {} done, OCR regions {}",
|
||||
numProcessedPages,
|
||||
pageIdToRectCollection.size(),
|
||||
pageId,
|
||||
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
||||
|
||||
pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc);
|
||||
++numProcessedPages;
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).numberOfOCRedPages(numProcessedPages).build());
|
||||
|
||||
} catch (PDFNetException e) {
|
||||
log.error("Failed to process page {}", pageId);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size());
|
||||
pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc));
|
||||
Optimizer.optimize(pdfDoc);
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).numberOfOCRedPages(numProcessedPages).ocrFinished(true).build());
|
||||
|
||||
try {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
log.error("Processed File with fileId {} could not be saved", fileId);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
pdfDoc.close();
|
||||
}
|
||||
|
||||
|
||||
private void processOcr(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId, PDFDoc singlePagePdfDoc) throws PDFNetException {
|
||||
|
||||
OCROptions options = new OCROptions();
|
||||
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
|
||||
options.addLang(ENGLISH);
|
||||
options.addDPI(settings.getOcrDPI());
|
||||
|
||||
OCRModule.processPDF(singlePagePdfDoc, options);
|
||||
}
|
||||
|
||||
|
||||
private static PDFDoc extractSinglePagePdfDoc(PDFDoc pdfDoc, Integer pageId) throws PDFNetException {
|
||||
|
||||
PDFDoc singlePagePdfDoc = new PDFDoc();
|
||||
Page page = pdfDoc.getPage(pageId);
|
||||
page.setMediaBox(page.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron, see TestFile MediaBoxBiggerThanCropBox.pdf
|
||||
singlePagePdfDoc.pagePushBack(page);
|
||||
return singlePagePdfDoc;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc singlePagePdfDoc) {
|
||||
|
||||
Page ocrPage = singlePagePdfDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
|
||||
singlePagePdfDoc.close();
|
||||
}
|
||||
|
||||
|
||||
private static StringBuilder getAllOcrTextZonesAsString(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId) throws PDFNetException {
|
||||
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
|
||||
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
|
||||
zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
return zonesString;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,13 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.settings;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@ConfigurationProperties("ocr-service")
|
||||
public class OcrServiceSettings {
|
||||
|
||||
private int ocrDPI = 300;
|
||||
|
||||
}
|
||||
@ -1,165 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawGrid;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawRectCollection;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
|
||||
class ImagePositionRetrievalServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private ImagePositionRetrievalService imagePositionRetrievalService;
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImagePositionRetrievalForRotateTestFileWithImages() {
|
||||
|
||||
String fileName = "RotateTestFileWithImages";
|
||||
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
|
||||
assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
|
||||
new int[]{54, 279, 301, 428},
|
||||
new int[]{360, 173, 509, 419},
|
||||
new int[]{362, 522, 511, 768},
|
||||
new int[]{459, 354, 608, 600},
|
||||
new int[]{145, 404, 392, 553},
|
||||
new int[]{151, 111, 398, 260},
|
||||
new int[]{457, 5, 606, 251},
|
||||
new int[]{395, 480, 545, 726},
|
||||
new int[]{393, 130, 542, 377},
|
||||
new int[]{88, 236, 334, 386},
|
||||
new int[]{82, 530, 328, 679},
|
||||
new int[]{465, 11, 614, 257},
|
||||
new int[]{159, 117, 406, 266},
|
||||
new int[]{467, 360, 617, 607},
|
||||
new int[]{153, 410, 400, 559});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImagePositionRetrievalForRotateTestFileWithImagesExtremeCropbox() {
|
||||
|
||||
String fileName = "RotateTestFileWithImagesExtremeCropbox";
|
||||
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
|
||||
new int[]{362, 522, 511, 768},
|
||||
new int[]{360, 173, 509, 419},
|
||||
new int[]{54, 279, 301, 428},
|
||||
new int[]{145, 192, 392, 341},
|
||||
new int[]{459, 142, 608, 388},
|
||||
new int[]{457, -207, 606, 39},
|
||||
new int[]{151, -101, 398, 48},
|
||||
new int[]{-30, 238, 216, 387},
|
||||
new int[]{283, 188, 433, 434},
|
||||
new int[]{281, -162, 430, 85},
|
||||
new int[]{-24, -56, 222, 94},
|
||||
new int[]{-39, 410, 208, 559},
|
||||
new int[]{275, 360, 425, 607},
|
||||
new int[]{273, 11, 422, 257},
|
||||
new int[]{-33, 117, 214, 266});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMergeImages() {
|
||||
|
||||
String fileName = "merge_images";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords).contains(new int[]{90, 284, 398, 770});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testStitchedImagesMultiPage() {
|
||||
|
||||
String fileName = "StitchedImagesMultiPage";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords.size()).isEqualTo(48);
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testEveryPointInDashedLineIsImage() {
|
||||
String fileName = "everyPointInDashedLineIsImage";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords.size()).isEqualTo(0);
|
||||
}
|
||||
|
||||
|
||||
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
||||
|
||||
try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
|
||||
try {
|
||||
writer.begin(pdfDoc.getPage(pageId));
|
||||
drawRectCollection(writer, rectCollection);
|
||||
drawGrid(writer, pdfDoc.getPage(pageId));
|
||||
writer.end();
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
|
||||
var r = rectCollection.getRectAt(j);
|
||||
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
|
||||
// Check visually for red Rectangles to match images in the saved pdf file
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
|
||||
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||
}
|
||||
pdfDoc.close();
|
||||
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
|
||||
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
|
||||
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<int[]> toRoundedCoordinateArrayList(RectCollection rectCollection) {
|
||||
|
||||
List<int[]> coords = new ArrayList<>(rectCollection.getNumRects());
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
var r = rectCollection.getRectAt(i);
|
||||
coords.add(new int[]{(int) Math.round(r.getX1()), (int) Math.round(r.getY1()), (int) Math.round(r.getX2()), (int) Math.round(r.getY2())});
|
||||
}
|
||||
return coords;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,51 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testRemoveInvisibleText() {
|
||||
|
||||
String fileName = "InvisibleText";
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
|
||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false);
|
||||
}
|
||||
|
||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true);
|
||||
}
|
||||
|
||||
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
|
||||
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
|
||||
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
String[] text = extractAllTextFromDocument(fileStream).split("\n");
|
||||
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
@ -1,31 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.OsUtils;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class WatermarkRemovalServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private WatermarkRemovalService watermarkRemovalService;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void removeWatermarksTest() {
|
||||
|
||||
String filename = "files/Watermark.pdf";
|
||||
try (var in = new ClassPathResource(filename).getInputStream(); var out = new FileOutputStream(OsUtils.createTmpFileName(filename, "WATERMARK_REMOVED"))) {
|
||||
watermarkRemovalService.removeWatermarks(in, out);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
75
ocr-service-v1/ocr-service-server/build.gradle.kts
Normal file
75
ocr-service-v1/ocr-service-server/build.gradle.kts
Normal file
@ -0,0 +1,75 @@
|
||||
import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
|
||||
|
||||
// Build plugins: project conventions, Spring Boot packaging and dependency management,
// Sonar analysis and Lombok annotation processing.
plugins {
    application
    id("com.iqser.red.service.java-conventions")
    id("org.springframework.boot") version "3.1.3"
    id("io.spring.dependency-management") version "1.1.3"
    id("org.sonarqube") version "4.3.0.3225"
    id("io.freefair.lombok") version "8.2.2"
}

// Exclude Spring Boot's default logging starter from every configuration.
configurations.all {
    exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
}

dependencies {
    // Sibling modules of this build
    implementation(project(":ocr-service-processor"))
    implementation(project(":ocr-service-api"))

    // Feign client and RabbitMQ messaging
    implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
    implementation("org.springframework.boot:spring-boot-starter-amqp:3.1.4")

    // Test-only dependencies
    testImplementation("org.springframework.boot:spring-boot-starter-test:3.1.4")
    testImplementation("com.iqser.red.commons:test-commons:2.1.0")
    testImplementation("org.springframework.amqp:spring-rabbit-test:3.0.2")
}
|
||||
|
||||
// Configures the OCI image build: extra buildpacks provide apt packages (Ghostscript),
// vcpkg-built Tesseract/Leptonica and the Tesseract language data on top of the Paketo Java buildpack.
tasks.named<BootBuildImage>("bootBuildImage") {

    // BPE_* entries follow the Paketo environment-variables buildpack convention: they declare
    // environment variables that are set in the running image rather than during the build.
    environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
    environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
    environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.26/Resource/Init/") // set ghostscript lib path
    environment.put("BPE_FONTCONFIG_PATH", "/layers/fagiani_apt/apt/etc/fonts/") // set ghostscript fontconfig path

    // Mount the buildpack input files read-only into the builder workspace.
    val aptfile = layout.projectDirectory.file("src/main/resources/Aptfile").toString()
    bindings.add("${aptfile}:/workspace/Aptfile:ro")

    val vcpkgFile = layout.projectDirectory.file("src/main/resources/vcpkg.json").toString()
    bindings.add("${vcpkgFile}:/workspace/vcpkg.json:ro")

    val languagesFile = layout.projectDirectory.file("src/main/resources/tesseract_languages").toString()
    bindings.add("${languagesFile}:/workspace/tesseract_languages:ro")

    // Buildpacks are pinned by digest; the Paketo Java buildpack is listed last.
    buildpacks.set(
        listOf(
            "ghcr.io/fagiani/buildpacks/fagiani_apt@sha256:6471c8c70f32b749e29f65ae562ac0339fecad26aa9217628c00a6c31f197dae",
            "ghcr.io/kschuettler/knecon-vcpkg@sha256:ba5e967b124de4865ff7e8f565684f752dd6e97b302e2dcf651283f6a19b98b9",
            "ghcr.io/kschuettler/knecon-tessdata@sha256:9062f728aa0340ac963bcdd6f5e740d683823a81d3f480db894da15bff72691a",
            "urn:cnb:builder:paketo-buildpacks/java"
        )
    )
    imageName.set("nexus.knecon.com:5001/ff/${project.name}") // must build image with same name always, otherwise the builder will not know which image to use as cache. DO NOT CHANGE!

    // Opt-in switch (-PbuildbootDockerHostNetwork=...) for builds that need the Docker host network.
    val useHostNetwork = project.hasProperty("buildbootDockerHostNetwork")
    if (useHostNetwork) {
        network.set("host")
    }

    // verboseLogging and tags are properties of the task itself, so they are configured here
    // rather than inside the docker { } block, where they only resolved through the outer receiver.
    verboseLogging.set(true)
    tags.set(listOf("nexus.knecon.com:5001/ff/${project.name}:${project.version}"))

    docker {
        if (useHostNetwork) {
            bindHostToBuilder.set(true)
        }

        // Registry credentials come from Gradle properties; getOrNull() leaves them unset when absent.
        publishRegistry {
            username.set(providers.gradleProperty("mavenUser").getOrNull())
            password.set(providers.gradleProperty("mavenPassword").getOrNull())
            email.set(providers.gradleProperty("mavenEmail").getOrNull())
            url.set("https://nexus.knecon.com:5001/")
        }
    }
}
|
||||
@ -1,11 +1,10 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
|
||||
import org.springframework.boot.SpringApplication;
|
||||
import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.cloud.openfeign.EnableFeignClients;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Import;
|
||||
@ -13,9 +12,9 @@ import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.queue.MessagingConfiguration;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||
|
||||
@ -24,9 +23,8 @@ import io.micrometer.core.instrument.MeterRegistry;
|
||||
|
||||
@EnableAsync
|
||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||
@EnableConfigurationProperties(OcrServiceSettings.class)
|
||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class})
|
||||
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class})
|
||||
@EnableFeignClients(basePackageClasses = FileStatusProcessingUpdateClient.class)
|
||||
public class Application {
|
||||
|
||||
@ -36,7 +34,7 @@ public class Application {
|
||||
* @param args Any command line parameter given upon startup.
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
|
||||
System.setProperty("org.apache.pdfbox.rendering.UsePureJavaCMYKConversion", "true");
|
||||
SpringApplication.run(Application.class, args);
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.server.client;
|
||||
package com.knecon.fforesight.service.ocr.v1.server.client;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.server.configuration;
|
||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import org.springframework.amqp.core.Queue;
|
||||
import org.springframework.amqp.core.QueueBuilder;
|
||||
@ -1,7 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration.X_ERROR_INFO_HEADER;
|
||||
import static com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER;
|
||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
@ -17,24 +14,28 @@ import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrMessageReceiver {
|
||||
|
||||
private final FileStorageService fileStorageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||
private final OCRService ocrService;
|
||||
FileStorageService fileStorageService;
|
||||
ObjectMapper objectMapper;
|
||||
FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||
OCRService ocrService;
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@ -42,7 +43,7 @@ public class OcrMessageReceiver {
|
||||
public void receiveOcr(Message in) throws IOException {
|
||||
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class);
|
||||
|
||||
log.info("--------------------------------------------------------------------------");
|
||||
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
|
||||
try {
|
||||
@ -66,21 +67,21 @@ public class OcrMessageReceiver {
|
||||
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
} catch (Exception e) {
|
||||
log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(X_ERROR_INFO_HEADER, e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
|
||||
throw e;
|
||||
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = MessagingConfiguration.OCR_DLQ, concurrency = "1")
|
||||
public void receiveOcrDQL(Message failedMessage) throws IOException {
|
||||
public void receiveOcrDLQ(Message failedMessage) throws IOException {
|
||||
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(failedMessage.getBody(), DocumentRequest.class);
|
||||
log.info("OCR DQL received: {}", ocrRequestMessage);
|
||||
String errorMessage = failedMessage.getMessageProperties().getHeader(X_ERROR_INFO_HEADER);
|
||||
OffsetDateTime timestamp = failedMessage.getMessageProperties().getHeader(X_ERROR_INFO_TIMESTAMP_HEADER);
|
||||
String errorMessage = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_HEADER);
|
||||
OffsetDateTime timestamp = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER);
|
||||
timestamp = timestamp != null ? timestamp : OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS);
|
||||
fileStatusProcessingUpdateClient.ocrFailed(ocrRequestMessage.getDossierId(),
|
||||
ocrRequestMessage.getFileId(),
|
||||
@ -0,0 +1,35 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrMessageSender implements IOcrMessageSender {
|
||||
|
||||
RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
public void sendOcrFinished(String fileId, int totalImages) {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(totalImages).ocrFinished(true).build());
|
||||
}
|
||||
|
||||
|
||||
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(finishedImages).build());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
19
ocr-service-v1/ocr-service-server/src/main/resources/Aptfile
Normal file
19
ocr-service-v1/ocr-service-server/src/main/resources/Aptfile
Normal file
@ -0,0 +1,19 @@
|
||||
# you can list packages
|
||||
ghostscript
|
||||
pkg-config
|
||||
zip
|
||||
unzip
|
||||
curl
|
||||
|
||||
# Ghostscript dependencies which are Ubuntu defaults and therefore not normally installed via apt
|
||||
libgssapi-krb5-2
|
||||
libk5crypto3
|
||||
libkrb5support0
|
||||
libkeyutils1
|
||||
libkrb5-3
|
||||
|
||||
# or include links to specific .deb files
|
||||
# http://ftp.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb
|
||||
|
||||
# or add custom apt repos (only required if using packages outside of the standard Ubuntu APT repositories)
|
||||
# :repo:deb http://cz.archive.ubuntu.com/ubuntu artful main universe
|
||||
@ -28,6 +28,11 @@ spring:
|
||||
max-interval: 15000
|
||||
prefetch: 1
|
||||
|
||||
fforesight:
|
||||
keycloak:
|
||||
ignored-endpoints: [ '/actuator/health', '/actuator/health/**' ]
|
||||
enabled: true
|
||||
|
||||
|
||||
management:
|
||||
endpoint:
|
||||
@ -0,0 +1,3 @@
|
||||
osd
|
||||
eng
|
||||
deu
|
||||
@ -0,0 +1,17 @@
|
||||
{
|
||||
"dependencies": [
|
||||
"tesseract",
|
||||
"leptonica"
|
||||
],
|
||||
"overrides": [
|
||||
{
|
||||
"name": "tesseract",
|
||||
"version": "5.3.2"
|
||||
},
|
||||
{
|
||||
"name": "leptonica",
|
||||
"version": "1.83.1"
|
||||
}
|
||||
],
|
||||
"builtin-baseline": "3715d743ac08146d9b7714085c1babdba9f262d5"
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
@ -24,9 +24,7 @@ import org.springframework.context.annotation.Primary;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
|
||||
import com.iqser.red.service.ocr.v1.server.service.DossierService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.DossierTemplateService;
|
||||
import com.knecon.fforesight.service.ocr.processor.initializer.PDFNetInitializer;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||
@ -47,10 +45,6 @@ public class AbstractTest {
|
||||
|
||||
@MockBean
|
||||
private TenantsClient tenantsClient;
|
||||
@MockBean
|
||||
protected DossierService dossierService;
|
||||
@MockBean
|
||||
protected DossierTemplateService dossierTemplateService;
|
||||
|
||||
@Autowired
|
||||
protected StorageService storageService;
|
||||
@ -1,12 +1,12 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static com.knecon.fforesight.service.ocr.processor.service.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@ -17,20 +17,17 @@ import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.DossierTemplate;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.Dossier;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
|
||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
||||
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
|
||||
@SpringBootTest()
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
@ -43,28 +40,12 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
private PrometheusMeterRegistry registry;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
@SneakyThrows
|
||||
public void assertOCRModuleIsLoaded() {
|
||||
|
||||
assert OCRModule.isModuleAvailable();
|
||||
}
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void stubMethods() {
|
||||
|
||||
when(dossierService.getDossier(TEST_DOSSIER_ID)).thenReturn(Dossier.builder().dossierTemplateId(TEST_DOSSIER_TEMPLATE_ID).build());
|
||||
when(dossierTemplateService.getDossierTemplate(TEST_DOSSIER_TEMPLATE_ID)).thenReturn(DossierTemplate.builder().removeWatermark(false).build());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOCRMetrics() {
|
||||
|
||||
testOCR("Watermark");
|
||||
testOCR("Watermark");
|
||||
testOCR("Watermark");
|
||||
testOCR("files/Watermark.pdf");
|
||||
testOCR("files/Watermark.pdf");
|
||||
testOCR("files/Watermark.pdf");
|
||||
|
||||
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
|
||||
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
|
||||
@ -75,16 +56,17 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testOcr() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("StitchedImagesMultiPage");
|
||||
|
||||
String text = testOCR("files/2009-1048395_50pages_tables.pdf");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testManyRotatedImages() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("manyRotatedImages");
|
||||
String text = testOCR("files/manyRotatedImages.pdf");
|
||||
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist");
|
||||
}
|
||||
|
||||
@ -92,7 +74,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@Test
|
||||
public void testMergeImages() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("merge_images");
|
||||
String text = testOCR("files/merge_images.pdf");
|
||||
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
|
||||
"Control",
|
||||
"mg/g day",
|
||||
@ -112,14 +94,14 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@Test
|
||||
public void testOCRWatermark() {
|
||||
|
||||
assertThat(testOCR("Watermark")).contains("syngenta");
|
||||
assertThat(testOCR("files/Watermark.pdf")).contains("syngenta");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOCRInvisibleText() {
|
||||
|
||||
String text = testOCR("InvisibleText");
|
||||
String text = testOCR("files/InvisibleText.pdf");
|
||||
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE");
|
||||
assertThat(text).doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance");
|
||||
}
|
||||
@ -128,17 +110,19 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
private String testOCR(String fileName) {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
|
||||
try (var fileStream = pdfFileResource.getInputStream()) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
|
||||
}
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
|
||||
Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(fileName).getFileName());
|
||||
try (var out = new FileOutputStream(tmpFileName.toFile())) {
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
System.out.println("File:" + tmpFileName);
|
||||
}
|
||||
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
try (var fileStream = new FileInputStream(tmpFileName.toFile())) {
|
||||
return extractAllTextFromDocument(fileStream);
|
||||
}
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user