Compare commits

..

3 Commits

Author SHA1 Message Date
Kilian Schuettler
63a06625f6 RED-6019 InvisibleTex wip 2023-01-30 15:25:49 +01:00
deiflaender
f69681133c RED-6019 InvisibleText 1 2023-01-23 09:20:35 +01:00
deiflaender
579e6a5c67 RED-6019 InvisibleText 2023-01-19 12:04:12 +01:00
168 changed files with 2704 additions and 5141 deletions

47
.gitignore vendored
View File

@ -9,49 +9,6 @@
**/tmp/ **/tmp/
**/.apt_generated/ **/.apt_generated/
HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### maven build ###
*.class
/out/
/build/
/target/
**/out/
**/build/
**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
.gradle
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
.factorypath .factorypath
.springBeans .springBeans
@ -69,7 +26,3 @@ build/
**/.DS_Store **/.DS_Store
**/classpath-data.json **/classpath-data.json
**/dependencies-and-licenses-overview.txt **/dependencies-and-licenses-overview.txt
gradle.properties
gradlew
gradlew.bat
gradle/

View File

@ -1,25 +0,0 @@
variables:
# SONAR_PROJECT_KEY: 'ocr-service:ocr-service-server'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

8
.gitmodules vendored
View File

@ -1,8 +0,0 @@
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
update = merge
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/basf"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/basf
url = https://gitlab.knecon.com/fforesight/documents/basf.git
update = merge

View File

@ -1,87 +0,0 @@
# OCR Service
## Overview
The OCR service is a tool designed for extracting text content from PDF files. It utilizes Tesseract, Leptonica, PDFTron, PDFBox, and Ghostscript to perform various tasks, including removing invisible elements and watermarks, extracting images, stitching striped images, binarizing images, running OCR on the processed images, and writing the recognized text back to the original PDF. This service is particularly useful for obtaining machine-readable text from PDF documents.
## Dependencies
[Tesseract](https://github.com/tesseract-ocr/tesseract)
[Leptonica](http://leptonica.org/)
[PDFTron](https://apryse.com/)
[PDFBox](https://pdfbox.apache.org/)
[Ghostscript](https://www.ghostscript.com/)
## Functionality
1. Invisible Element and Watermark Removal
The service uses PDFTron to attempt the removal of invisible elements and watermarks from the PDF.
2. Image Extraction
Extracts all images from the PDF using PDFBox
3. Striped Image Detection and Stitching
Detects if images are striped and stitches them together using Ghostscript.
4. Image Processing
- Convert to grayscale
- Upscale to target DPI
- Filter using Gauss kernel
- Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm.
- Despeckle using various morphological operations
5. OCR Processing
Runs Tesseract on the images to extract text.
6. Font style detection
Detection of bold text using stroke width estimation
7. Text Integration
Draws the resulting text onto the original PDF using PDFBox.
Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage.
Therefore, choosing your thread counts carefully leads to optimal performance.
For example, with 18 available cores, the highest performance was achieved with 2 image extraction threads, 2 Ghostscript processes, and 16 OCR threads.
Setting all threads to basically unlimited (1000+) leads to comparable performance without laborious thread tuning, but at the cost of (potentially a lot) more RAM.
## Installation
To run the OCR service, ensure that the following dependencies are installed:
1. Ghostscript: Install using apt.
```bash
sudo apt install ghostscript
```
2. Tesseract and Leptonica: Install using [vcpkg](https://github.com/microsoft/vcpkg) with the commands below, and set the environment variable `VCPKG_DYNAMIC_LIB` to your vcpkg lib folder (e.g. `~/vcpkg/installed/x64-linux-dynamic/lib`).
```bash
vcpkg install tesseract --triplet x64-linux-dynamic
```
```bash
vcpkg install leptonica --triplet x64-linux-dynamic
```
3. Other dependencies are handled by Gradle build
```bash
gradle build
```
## Configuration
Configuration settings are available in the OcrServiceSettings class.
These settings can be overridden using environment variables, e.g.:
`OCR_SERVICE_OCR_THREAD_COUNT=16`
Possible configurations and their defaults include:
```java
int ocrThreadCount = 4; // Number of OCR threads
int imageExtractThreadCount = 4; // Number of image extraction threads
int gsProcessCount = 4; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
boolean debug = false; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean removeWatermark; // If false, watermarks will not be removed
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
```
## Integration
The OCR-service communicates via RabbitMQ and uses the queues `ocr_request_queue`, `ocr_response_queue`,
`ocr_dead_letter_queue`, and `ocr_status_update_response_queue`.
### ocr_request_queue
This queue is used to start the OCR process; a `DocumentRequest` must be passed as the message. The service will then download the PDF from the provided cloud storage.
### ocr_response_queue
This queue is used to signal the end of processing.
### ocr_dead_letter_queue
This queue is used to signal an error has occurred during processing.
### ocr_status_update_response_queue
This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total number may change when fewer images are found than initially assumed.

37
bamboo-specs/pom.xml Normal file
View File

@ -0,0 +1,37 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>8.1.3</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

View File

@ -0,0 +1,125 @@
package buildjob;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
import java.time.LocalTime;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
import com.atlassian.bamboo.specs.util.BambooServer;
/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 *
 * <p>Declares two plans in the {@code RED} project: the regular build plan
 * ({@link #createPlan()}) and a security analysis plan that is triggered once a day
 * ({@link #createSecBuild()}).
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "ocr-service";

    // Locale.ROOT keeps the key independent of the JVM default locale: under a Turkish
    // locale a plain toUpperCase() maps 'i' to a dotted capital 'İ', which would change the key.
    // replace() is used because "-" is a literal, not a regular expression.
    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase(java.util.Locale.ROOT).replace("-", "");


    /**
     * Run main to publish plan on Bamboo
     *
     * <p>Publishes the build plan and the security plan, each followed by its permissions.
     *
     * @param args unused
     * @throws Exception if publishing to the Bamboo server fails
     */
    public static void main(final String[] args) throws Exception {
        //By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");
        // PlanSpec holds no instance state, so a single instance serves all four calls.
        PlanSpec planSpec = new PlanSpec();

        Plan plan = planSpec.createPlan();
        bambooServer.publish(plan);
        PlanPermissions planPermission = planSpec.createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);

        Plan secPlan = planSpec.createSecBuild();
        bambooServer.publish(secPlan);
        PlanPermissions secPlanPermission = planSpec.createPlanPermission(secPlan.getIdentifier());
        bambooServer.publish(secPlanPermission);
    }


    /**
     * Builds the permission set applied to a plan: full rights for the {@code atlbamboo} user,
     * edit/view/clone/build for the {@code Development} and {@code devplant} groups, and
     * view access for logged-in and anonymous users.
     *
     * @param planIdentifier identifier of the plan the permissions are attached to
     * @return the permissions bound to the plan's project key and plan key
     */
    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
                .userPermissions("atlbamboo",
                        PermissionType.EDIT,
                        PermissionType.VIEW,
                        PermissionType.ADMIN,
                        PermissionType.CLONE,
                        PermissionType.BUILD)
                .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("devplant", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .loggedInUserPermissions(PermissionType.VIEW)
                .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }


    /** Returns the Bamboo project ({@code RED}) that owns both plans. */
    private Project project() {
        return new Project().name("RED").key(new BambooKey("RED"));
    }


    /** Returns the inline script task, shared by both plans, that empties the working directory. */
    private ScriptTask cleanWorkspaceTask() {
        return new ScriptTask().description("Clean")
                .inlineBody("#!/bin/bash\n" + "set -e\n" + "rm -rf ./*");
    }


    /**
     * Creates the main build plan: clean, checkout, run {@code build-java.sh}, parse test
     * reports, then inject the {@code git.tag} file and tag the repository with its value.
     * Triggered by Bitbucket Server; plan branches are created per VCS branch and removed
     * after 14 days of inactivity.
     *
     * @return the build plan definition
     */
    public Plan createPlan() {
        Job defaultJob = new Job("Default Job", new BambooKey("JOB1"))
                .tasks(cleanWorkspaceTask(),
                        new VcsCheckoutTask().description("Checkout Default Repository").cleanCheckout(true).checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask().description("Build").location(Location.FILE).fileFromPath("bamboo-specs/src/main/resources/scripts/build-java.sh").argument(SERVICE_NAME),
                        createJUnitParserTask().description("Resultparser")
                                .resultDirectories("**/test-reports/*.xml, **/target/surefire-reports/*.xml, **/target/failsafe-reports/*.xml")
                                .enabled(true),
                        new InjectVariablesTask().description("Inject git Tag").path("git.tag").namespace("g").scope(InjectVariablesScope.LOCAL),
                        new VcsTagTask().description("${bamboo.g.gitTag}").tagName("${bamboo.g.gitTag}").defaultRepository())
                .dockerConfiguration(new DockerConfiguration().image("nexus.iqser.com:5001/infra/maven:3.8.4-openjdk-17-slim")
                        .volume("/etc/maven/settings.xml", "/usr/share/maven/conf/settings.xml")
                        .volume("/var/run/docker.sock", "/var/run/docker.sock"));

        return new Plan(project(), SERVICE_NAME, new BambooKey(SERVICE_KEY)).description("Plan created from (enter repository url of your plan)")
                .variables(new Variable("maven_add_param", ""))
                .stages(new Stage("Default Stage").jobs(defaultJob))
                .linkedRepositories("RED / " + SERVICE_NAME)
                .triggers(new BitbucketServerTrigger())
                .planBranchManagement(new PlanBranchManagement().createForVcsBranch()
                        .delete(new BranchCleanup().whenInactiveInRepositoryAfterDays(14))
                        .notificationForCommitters());
    }


    /**
     * Creates the security analysis plan: clean, checkout and run {@code sonar-java.sh}.
     * Scheduled once daily at 23:00; plan branches are created only for branches matching
     * {@code release.*}.
     *
     * @return the security analysis plan definition
     */
    public Plan createSecBuild() {
        // NOTE(review): this job uses a JDK 13 Maven image and mounts settings.xml under
        // /usr/share/maven/ref, unlike the build plan (JDK 17, /usr/share/maven/conf) - confirm intended.
        Job defaultJob = new Job("Default Job", new BambooKey("JOB1"))
                .tasks(cleanWorkspaceTask(),
                        new VcsCheckoutTask().description("Checkout Default Repository").checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask().description("Sonar").location(Location.FILE).fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-java.sh").argument(SERVICE_NAME))
                .dockerConfiguration(new DockerConfiguration().image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                        .dockerRunArguments("--net=host")
                        .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                        .volume("/var/run/docker.sock", "/var/run/docker.sock"));

        return new Plan(project(), SERVICE_NAME + "-Sec", new BambooKey(SERVICE_KEY + "SEC")).description("Security Analysis Plan")
                .stages(new Stage("Default Stage").jobs(defaultJob))
                .linkedRepositories("RED / " + SERVICE_NAME)
                .triggers(new ScheduledTrigger().scheduleOnceDaily(LocalTime.of(23, 0)))
                .planBranchManagement(new PlanBranchManagement().createForVcsBranchMatching("release.*").notificationForCommitters());
    }

}

View File

@ -0,0 +1,62 @@
#!/bin/bash
# Bamboo build script.
#
# Usage: build-java.sh <service-name>
#
# Picks the version to build from the branch name, writes it to git.tag as
# "gitTag=<value>", sets that version on the service and image poms, deploys the
# service and pushes the docker image. On any other branch with
# bamboo_version_tag == "dev" it only runs "clean install" and exits.
set -e

SERVICE_NAME=$1

if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
    echo "building on master branch"
    branchVersion=$(cat pom.xml | grep -Eo "<version>.*" | sed -s 's|<version>\(.*\)\..*\(-*.*\)</version>|\1|')
    echo "branch version is : $branchVersion"
    # NOTE(review): the tag pattern is unquoted, so the shell may expand the wildcard
    # against files in the working directory before git sees it - left as is to keep behaviour.
    latestVersion=$(semver $( git tag -l $branchVersion.* ) | tail -n1)
    echo "latestVersion is : $latestVersion"
    newVersion="$(semver $latestVersion -p -i minor)"
    echo "newVersion is : $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
    # release/<major>.<minor>.x -> <major>.<minor>
    branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
    latestVersion=$(semver $( git tag -l $branchVersion.* ) | tail -n1)
    newVersion="$(semver $latestVersion -p -i patch)"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
    newVersion="${bamboo_version_tag}"
else
    # Dev build: compile and test only, nothing is versioned or deployed.
    # bamboo_maven_add_param stays unquoted on purpose: it may be empty or hold several arguments.
    # The entropy source was misspelled "urandomelse" (a stray "else" glued onto the path).
    mvn -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
        --no-transfer-progress \
        ${bamboo_maven_add_param} \
        clean install \
        -Djava.security.egd=file:/dev/./urandom
    echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
    exit 0
fi

echo "gitTag=${newVersion}" > git.tag

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
    versions:set \
    "-DnewVersion=${newVersion}"

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-image-v1/pom.xml" \
    versions:set \
    "-DnewVersion=${newVersion}"

# NOTE(review): the maven.wagon.http.ssl.* flags disable TLS certificate validation for the
# deploy - confirm this is still required for the Nexus endpoint.
mvn -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
    --no-transfer-progress \
    clean deploy \
    ${bamboo_maven_add_param} \
    -e \
    -DdeployAtEnd=true \
    -Dmaven.wagon.http.ssl.insecure=true \
    -Dmaven.wagon.http.ssl.allowall=true \
    -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
    -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/red-platform-releases

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-image-v1/pom.xml" \
    package

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-image-v1/pom.xml" \
    docker:push

View File

@ -0,0 +1,44 @@
#!/bin/bash
# Bamboo security-analysis script.
#
# Usage: sonar-java.sh <service-name>
#
# Builds the service, runs the OWASP dependency-check aggregate goal and then a
# SonarQube scan. The scan runs in branch mode when bamboo_repository_pr_key is
# empty and in pull-request mode otherwise.
set -e

SERVICE_NAME=$1

echo "build jar binaries"
# The entropy source was misspelled "urandomelse" (a stray "else" glued onto the path).
mvn -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
    --no-transfer-progress \
    clean install \
    -Djava.security.egd=file:/dev/./urandom

echo "dependency-check:aggregate"
mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
    org.owasp:dependency-check-maven:aggregate

if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    mvn --no-transfer-progress \
        -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
        sonar:sonar \
        "-Dsonar.projectKey=RED_$SERVICE_NAME" \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        "-Dsonar.login=${bamboo_sonarqube_api_token_secret}" \
        "-Dsonar.branch.name=${bamboo_planRepository_1_branch}" \
        -Dsonar.dependencyCheck.jsonReportPath=target/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=target/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=target/dependency-check-report.html
else
    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
    mvn --no-transfer-progress \
        -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
        sonar:sonar \
        "-Dsonar.projectKey=RED_$SERVICE_NAME" \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        "-Dsonar.login=${bamboo_sonarqube_api_token_secret}" \
        "-Dsonar.pullrequest.key=${bamboo_repository_pr_key}" \
        "-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch}" \
        "-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch}" \
        -Dsonar.dependencyCheck.jsonReportPath=target/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=target/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=target/dependency-check-report.html
fi

View File

@ -0,0 +1,22 @@
package buildjob;
import org.junit.Test;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
/**
 * Offline validation of the Bamboo plan definitions; no Bamboo server is contacted.
 */
public class PlanSpecTest {

    /**
     * Builds the entity properties of the build plan and then of the security plan.
     *
     * @throws PropertiesValidationException if either plan definition is invalid
     */
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        final PlanSpec spec = new PlanSpec();

        // Validate the regular build plan first, then the security plan.
        EntityPropertiesBuilders.build(spec.createPlan());
        EntityPropertiesBuilders.build(spec.createSecBuild());
    }

}

View File

@ -1,15 +0,0 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
plugins {
// Support convention plugins written in Kotlin. Convention plugins are build scripts in 'src/main' that automatically become available as plugins in the main build.
`kotlin-dsl`
}
repositories {
// Use the plugin portal to apply community plugins in convention plugins.
gradlePluginPortal()
}

View File

@ -1,70 +0,0 @@
// Convention script for Java modules: applies the java-library, PMD, Checkstyle and
// JaCoCo plugins, targets Java 17 and declares the artifact repositories.
plugins {
`java-library`
pmd
checkstyle
jacoco
}
group = "com.knecon.fforesight.service"
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
// Print PMD findings to the console.
pmd {
isConsoleOutput = true
}
// Main sources are checked against config/pmd/pmd.xml.
tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
// Test sources are checked against config/pmd/test_pmd.xml.
tasks.pmdTest {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
// Tests run on the JUnit Platform, write JUnit XML to <buildDir>/reports/junit and
// get a heap between 512m and 8192m.
tasks.named<Test>("test") {
useJUnitPlatform()
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
minHeapSize = "512m"
maxHeapSize = "8192m"
}
tasks.test {
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
}
// Coverage report: XML enabled, CSV disabled, HTML written to <buildDir>/jacocoHtml.
tasks.jacocoTestReport {
dependsOn(tasks.test) // tests are required to run before generating the report
reports {
xml.required.set(true)
csv.required.set(false)
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
}
}
// Also build a Javadoc jar.
java {
withJavadocJar()
}
// Repositories are declared in this order: local Maven repository, the two Nexus
// repositories, Maven Central. Nexus credentials are read from the Gradle properties
// "mavenUser" / "mavenPassword" and are null when a property is not set.
repositories {
mavenLocal()
maven {
url = uri("https://nexus.knecon.com/repository/gindev/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull()
password = providers.gradleProperty("mavenPassword").getOrNull()
}
}
maven {
url = uri("https://nexus.knecon.com/repository/PDFTron/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull()
password = providers.gradleProperty("mavenPassword").getOrNull()
}
}
mavenCentral()
}

View File

@ -1,39 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="JavadocStyle"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

View File

@ -1,20 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1,22 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1 +0,0 @@
version = 4.0-SNAPSHOT

View File

@ -0,0 +1,118 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>com.iqser.red</groupId>
<artifactId>platform-docker-dependency</artifactId>
<version>1.2.0</version>
<relativePath/>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>ocr-service-image-v1</artifactId>
<groupId>com.iqser.red.service</groupId>
<version>1.0-SNAPSHOT</version>
<packaging>pom</packaging>
<properties>
<service.server>ocr-service-server-v1</service.server>
<platform.jar>${service.server}.jar</platform.jar>
<docker.skip.push>false</docker.skip.push>
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-dependency-plugin</artifactId>
    <executions>
        <!-- Copies the service jar into the docker build directory before packaging. -->
        <execution>
            <id>download-platform-jar</id>
            <phase>prepare-package</phase>
            <goals>
                <goal>copy</goal>
            </goals>
            <configuration>
                <artifactItems>
                    <dependency>
                        <groupId>${project.groupId}</groupId>
                        <artifactId>${service.server}</artifactId>
                        <!-- ${project.version} replaces the deprecated bare ${version} expression. -->
                        <version>${project.version}</version>
                        <type>jar</type>
                        <overWrite>true</overWrite>
                        <destFileName>${platform.jar}</destFileName>
                    </dependency>
                </artifactItems>
                <outputDirectory>${docker.build.directory}</outputDirectory>
            </configuration>
        </execution>
    </executions>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
<configuration>
<images>
<image>
<name>${docker.image.name}</name>
<build>
<dockerFileDir>${docker.build.directory}</dockerFileDir>
<args>
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
</args>
<tags>
<tag>${docker.image.version}</tag>
<tag>latest</tag>
</tags>
</build>
</image>
</images>
</configuration>
</plugin>
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<executions>
<execution>
<id>copy-resources</id>
<phase>prepare-package</phase>
<goals>
<goal>copy-resources</goal>
</goals>
<configuration>
<outputDirectory>${basedir}/target/build/libs/</outputDirectory>
<resources>
<resource>
<directory>libs</directory>
<filtering>false</filtering>
</resource>
</resources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -0,0 +1,18 @@
FROM red/base-image:2.0.0

# Unpack the PDFTron OCR module archive and move its Lib/ contents to /OCRModule.
COPY "libs/pdftron/OCRModuleLinux.tar.gz" .
RUN tar xvzf OCRModuleLinux.tar.gz \
    && mkdir /OCRModule \
    && mv Lib/* /OCRModule/

# Ghostscript somehow improves ocr quality using pdftron, do not remove!
# update and install share one layer so a cached, stale package index is never reused
# for the install; the index is removed afterwards because it is not needed at runtime.
RUN apt-get -y update \
    && apt-get -y install ghostscript \
    && rm -rf /var/lib/apt/lists/*

# Name of the service jar, passed in as a build argument and kept as an environment variable.
ARG PLATFORM_JAR
ENV PLATFORM_JAR=${PLATFORM_JAR}
ENV USES_ELASTICSEARCH=false

COPY ["${PLATFORM_JAR}", "/"]

View File

@ -0,0 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.iqser.red.service</groupId>
<artifactId>ocr-service-v1</artifactId>
<version>1.0-SNAPSHOT</version>
</parent>
<artifactId>ocr-service-api-v1</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<persistence-service.version>1.269.0</persistence-service.version>
<redaction-service.version>3.155.0</redaction-service.version>
<dsljson.version>1.9.9</dsljson.version>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/com.dslplatform/dsl-json-java8 -->
<dependency>
<groupId>com.dslplatform</groupId>
<artifactId>dsl-json-java8</artifactId>
<version>${dsljson.version}</version>
</dependency>
<dependency>
<!-- This dependency contains annotations that are used in specifying REST endpoints. -->
<!-- It is optional since not all users of this API might use Feign. -->
<groupId>io.github.openfeign</groupId>
<artifactId>feign-core</artifactId>
<optional>true</optional>
</dependency>
<!-- spring -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-web</artifactId>
</dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-api-v1</artifactId>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
</exclusion>
</exclusions>
<version>${persistence-service.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-api-v1</artifactId>
</exclusion>
</exclusions>
<version>${redaction-service.version}</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,16 @@
package com.iqser.red.service.ocr.v1.api.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
/**
 * Request naming a document by dossier id and file id.
 * Lombok generates the accessors, {@code equals}/{@code hashCode}/{@code toString},
 * a no-args constructor and an all-args constructor.
 */
@AllArgsConstructor
@NoArgsConstructor
@Data
public class DocumentRequest {

    /** Id of the dossier; presumably the dossier that contains the file - confirm with callers. */
    protected String dossierId;

    /** Id of the file. */
    protected String fileId;

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.ocr.v1.api.model; package com.iqser.red.service.ocr.v1.api.model;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
@ -6,15 +6,14 @@ import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
@Data @Data
@Builder
@AllArgsConstructor
@NoArgsConstructor @NoArgsConstructor
@AllArgsConstructor
@Builder
public class OCRStatusUpdateResponse { public class OCRStatusUpdateResponse {
private String fileId; private String fileId;
private int numberOfPagesToOCR; private int numberOfPagesToOCR;
private int numberOfOCRedPages; private int numberOfOCRedPages;
private boolean ocrFinished; private boolean ocrFinished;
private boolean ocrStarted;
} }

View File

@ -1,22 +0,0 @@
// Publishes this module's Java component to the Nexus "red-platform-releases" repository.
plugins {
    `maven-publish`
    id("com.iqser.red.service.java-conventions")
    id("io.freefair.lombok") version "8.4"
}

publishing {
    publications {
        // The publication is named after the project.
        create<MavenPublication>(name) {
            from(components["java"])
        }
    }

    repositories {
        maven {
            url = uri("https://nexus.knecon.com/repository/red-platform-releases/")

            // Credentials come from the Gradle properties below; null when a property is not set.
            credentials {
                username = providers.gradleProperty("mavenUser").orNull
                password = providers.gradleProperty("mavenPassword").orNull
            }
        }
    }
}

View File

@ -1,24 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request naming a document by dossier id and file id, with a flag controlling
 * watermark removal. Lombok generates the accessors, a builder, a no-args constructor
 * and an all-args constructor.
 */
@NoArgsConstructor
@AllArgsConstructor
@Builder
@Data
public class DocumentRequest {

    /** Id of the dossier; presumably the dossier that contains the file - confirm with callers. */
    protected String dossierId;

    /** Id of the file. */
    protected String fileId;

    /** Whether watermarks should be removed; {@code false} unless set explicitly. */
    protected boolean removeWatermark;


    /**
     * Creates a request with {@code removeWatermark} left at {@code false}.
     *
     * @param dossierId id of the dossier
     * @param fileId id of the file
     */
    public DocumentRequest(String dossierId, String fileId) {

        this.fileId = fileId;
        this.dossierId = dossierId;
    }

}

View File

@ -1,31 +0,0 @@
// Build script of the processor module: applies the shared Java conventions and Lombok,
// and declares the module's dependencies.
plugins {
id("com.iqser.red.service.java-conventions")
id("io.freefair.lombok") version "8.4"
}
// Exclude spring-boot-starter-logging from every configuration.
configurations {
all {
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
}
}
// All libraries are declared with "api", so they are also exposed to consumers of this module.
dependencies {
api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
api("net.sourceforge.tess4j:tess4j:5.8.0")
api("com.iqser.red.commons:metric-commons:2.1.0")
api("com.iqser.red.commons:storage-commons:2.45.0")
api("com.knecon.fforesight:tenant-commons:0.21.0")
api("com.knecon.fforesight:lifecycle-commons:0.6.0")
api("com.pdftron:PDFNet:10.5.0")
api("org.apache.pdfbox:pdfbox:3.0.0")
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
api("org.apache.commons:commons-math3:3.6.1")
api("io.github.karols:hocr4j:0.2.0")
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
api("com.knecon.fforesight:viewer-doc-processor:0.125.0")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
}

View File

@ -1,26 +0,0 @@
package com.knecon.fforesight.service.ocr.processor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
/**
 * Spring configuration of the OCR processor: enables component scanning for this package,
 * binds {@link OcrServiceSettings} and registers the {@link ViewerDocumentService} bean.
 */
@Configuration
@ComponentScan
@EnableConfigurationProperties(OcrServiceSettings.class)
public class OcrServiceProcessorConfiguration {

    /**
     * Creates the {@link ViewerDocumentService} bean.
     * The parameters of a {@code @Bean} method are resolved by the container, so no additional
     * {@code @Autowired} is required on the factory method.
     *
     * @param registry the observation registry handed to the service
     * @return a new {@link ViewerDocumentService}
     */
    @Bean
    public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {

        return new ViewerDocumentService(registry);
    }

}

View File

@ -1,49 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.initializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.pdftron.pdf.PDFNet;
import com.sun.jna.NativeLibrary;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Initializes the native dependencies (PDFNet, Leptonica, Tesseract) when the bean is constructed.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class NativeLibrariesInitializer {

    @Value("${pdftron.license:}")
    private String pdftronLicense;


    /**
     * Initializes PDFNet, points JNA to the directory given by the {@code VCPKG_DYNAMIC_LIB} environment variable
     * and verifies that the Leptonica and Tesseract libraries can be loaded.
     *
     * @throws IllegalStateException if {@code VCPKG_DYNAMIC_LIB} is not set
     */
    @SneakyThrows
    @PostConstruct
    // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
    public void init() {

        log.info("Initializing Native Libraries");

        // The license key is a secret: only report whether one is configured, never its value.
        boolean licenseConfigured = pdftronLicense != null && !pdftronLicense.isEmpty();
        log.info("Setting pdftron license: {}", licenseConfigured ? "<redacted>" : "<not configured>");
        PDFNet.setTempPath("/tmp/pdftron");
        PDFNet.initialize(pdftronLicense);

        // System.setProperty rejects null values with a bare NullPointerException, so fail with a clear message instead.
        String jnaLibraryPath = System.getenv("VCPKG_DYNAMIC_LIB");
        if (jnaLibraryPath == null) {
            throw new IllegalStateException("Environment variable VCPKG_DYNAMIC_LIB is not set, cannot configure jna.library.path");
        }
        log.info("Setting jna.library.path: {}", jnaLibraryPath);
        System.setProperty("jna.library.path", jnaLibraryPath);

        log.info("Asserting Native Libraries loaded");

        try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
            assert leptonicaLib != null;
            log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
        }

        try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) {
            assert tesseractLib != null;
            log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath());
        }
    }

}

View File

@ -1,36 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
public record ExtractedImage(
int pageNumber, QuadPoint position, int height, int width, BufferedImage image, Matrix ctm, int numberOnPage, PDColorSpace colorSpace) implements UnprocessedImage {
@SneakyThrows
public Pix asPix() {
BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this);
ImageProcessingUtils.setAlphaChannelToWhite(image);
return LeptUtils.convertImageToPix(image);
}
public QuadPoint getImageCoordinatesInInitialUserSpace() {
return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform());
}
}

View File

@ -1,61 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.Graphics;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
/**
 * OCR image that originates from an image embedded in a PDF page.
 */
@Slf4j
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ExtractedOcrImage implements OcrImage {

    int pageNumber;
    int numberOnPage;
    int originalHeight;
    int originalWidth;
    Matrix ctm;
    Pix pix;
    int height;
    int width;
    int rotationDegrees;


    /**
     * Builds the transform from image coordinates to PDF coordinates.
     * Starting from the CTM, it appends a scaling by 1/width and 1/height, the de-rotation and a vertical mirroring.
     *
     * @return the combined transform
     */
    @Override
    public AffineTransform getImageCTM() {

        AffineTransform imageToUserSpace = ctm.createAffineTransform();
        imageToUserSpace.scale(1.0 / getWidth(), 1.0 / getHeight());
        imageToUserSpace.concatenate(buildDeRotationMatrix());

        // flips the y-axis within the image height
        imageToUserSpace.concatenate(new AffineTransform(1, 0, 0, -1, 0, getHeight()));
        return imageToUserSpace;
    }


    /**
     * Selects the matrix for the rotation (360 - rotationDegrees); any value other than 90, 180 or 270 yields the identity.
     */
    private AffineTransform buildDeRotationMatrix() {

        int deRotationDegrees = 360 - rotationDegrees;
        return switch (deRotationDegrees) {
            case 90 -> new AffineTransform(0, 1, -1, 0, getHeight(), 0);
            case 180 -> new AffineTransform(-1, 0, 0, -1, getWidth(), getHeight());
            case 270 -> new AffineTransform(0, -1, 1, 0, getWidth() - getHeight(), getHeight()); // results from 90 + 180 rotations
            default -> new AffineTransform();
        };
    }

}

View File

@ -1,17 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Immutable holder for the metrics of a piece of text: descent, font size and height scaling.
 */
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@AllArgsConstructor
public final class FontMetrics {

    /**
     * The descent is the part of the text which is below the baseline, e.g. the lower curve of a 'g'.
     * See https://en.wikipedia.org/wiki/Body_height_(typography)
     */
    float descent;

    float fontSize;

    float heightScaling;

}

View File

@ -1,5 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
/**
 * Immutable pair of a height and a descent value.
 *
 * @param height  the height
 * @param descent the descent
 */
public record HeightAndDescent(float height, float descent) {

}

View File

@ -1,127 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
import lombok.SneakyThrows;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
/**
 * An image that is handed to OCR, together with everything needed to map results back into the PDF.
 */
public interface OcrImage {

    /**
     * Returns the 1-based number of the page on which this image is located.
     *
     * @return the page number of this image
     */
    int getPageNumber();


    /**
     * Returns the number of this image on its page. Full page images always return 0.
     *
     * @return the number of this image on the page
     */
    int getNumberOnPage();


    /**
     * Returns the height of the original image (not necessarily in pdf coordinates).
     *
     * @return the image height
     */
    int getHeight();


    /**
     * Returns the width of the original image (not necessarily in pdf coordinates).
     *
     * @return the image width
     */
    int getWidth();


    /**
     * Returns the outer boundary of the image in image coordinates, where (0,0) is the upper left corner.
     * For rotations of 90 or 270 degrees width and height are swapped.
     *
     * @return the QuadPoint spanning the image
     */
    default QuadPoint getImageBounds() {

        // cannot be solved with a nice rotation matrix. After rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
        // So in order to mimic this behavior we need to start with (0,0) coordinates always.
        boolean quarterTurn = getRotationDegrees() == 90 || getRotationDegrees() == 270;
        double boundsWidth = quarterTurn ? getHeight() : getWidth();
        double boundsHeight = quarterTurn ? getWidth() : getHeight();
        return new QuadPoint(new Point2D.Double(0, 0),
                             new Point2D.Double(0, boundsHeight),
                             new Point2D.Double(boundsWidth, boundsHeight),
                             new Point2D.Double(boundsWidth, 0));
    }


    /**
     * Returns the image coordinates in the PDF, obtained by transforming the image bounds with the image CTM.
     *
     * @return the transformed image bounds
     */
    default QuadPoint getImageCoordinatesInInitialUserSpace() {

        QuadPoint boundsInImage = getImageBounds();
        return boundsInImage.getTransformed(getImageCTM());
    }


    /**
     * Returns the rotation of the OCR image in degrees.
     *
     * @return the rotation in degrees
     */
    int getRotationDegrees();


    /**
     * Returns the page segmentation mode to use for this image: single block if either side is shorter than 200,
     * automatic otherwise.
     *
     * @return the page segmentation mode
     */
    default int getOptimalPageSegmentationMode() {

        boolean smallImage = getWidth() < 200 || getHeight() < 200;
        return smallImage ? ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK : ITessAPI.TessPageSegMode.PSM_AUTO;
    } // TODO: evaluate if PSM can be dynamically chosen to increase performance


    /**
     * Returns the Leptonica image associated with this OCR image.
     *
     * @return the Pix representing the image
     */
    Pix getPix();


    /**
     * Calculates the dpi of this image from its bounds, its CTM and its width.
     *
     * @return the dpi as calculated by {@link PdfDpiCalculator}
     */
    default int getDpi() {

        return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());
    }


    /**
     * Returns the current transformation matrix (CTM), which may be used to transform image coordinates to
     * Initial User Space coordinates.
     *
     * @return the CTM as AffineTransform
     */
    AffineTransform getImageCTM();


    /**
     * Disposes the Pix returned by {@link #getPix()}.
     */
    default void destroyPix() {

        LeptUtils.disposePix(getPix());
    }

}

View File

@ -1,23 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.util.List;
import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser;
import io.github.karols.hocr4j.Word;
/**
 * Links an OCR image to the path of the tesseract output that was produced for it.
 *
 * @param image                   the image OCR was run on
 * @param tesseractOutputFilePath path of the tesseract output file
 */
public record OcrResult(OcrImage image, String tesseractOutputFilePath) {

    /**
     * Factory method, equivalent to calling the canonical constructor.
     *
     * @param image           the image OCR was run on
     * @param tesseractResult stored as {@code tesseractOutputFilePath}
     * @return the new result
     */
    public static OcrResult create(OcrImage image, String tesseractResult) {

        return new OcrResult(image, tesseractResult);
    }


    /**
     * Parses the hOCR page at {@code tesseractOutputFilePath} and returns all of its words.
     *
     * @return all words of the parsed page
     */
    public List<Word> getAllWords() {

        var hocrPage = HOcrPageParser.extractHocrPage(tesseractOutputFilePath);
        return hocrPage.getAllWords();
    }

}

View File

@ -1,42 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, QuadPoint imageBoundingBox) {
public static OcrResultToWrite fromFontStyleDetectionModel(FontStyleDetectionModel fontStyleDetectionModel) {
return new OcrResultToWrite(fontStyleDetectionModel.getTextPositionInImages(), fontStyleDetectionModel.getImageBounds());
}
public static Map<Integer, List<OcrResultToWrite>> buildOcrResultsToWrite(List<OcrResult> ocrResults, FontMetricsFactory fontMetricsFactory) {
return ocrResults.stream()
.collect(Collectors.groupingBy(ocrResult -> ocrResult.image().getPageNumber()))
.entrySet()
.stream()
.collect(Collectors.toMap(Map.Entry::getKey,
entry -> entry.getValue()
.stream()
.map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
.toList()));
}
private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
return ocrResult.getAllWords()
.stream()
.filter(word -> !word.isBlank())
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
.toList();
}
}

View File

@ -1,42 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
/**
 * Media box, page number and rotation of a PDF page.
 *
 * @param mediabox        the media box of the page
 * @param number          the page number
 * @param rotationDegrees the page rotation in degrees
 */
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {

    /**
     * Reads media box and rotation from the given page.
     *
     * @param pageNum the number to store for the page
     * @param page    the PDFBox page
     * @return the extracted page information
     */
    public static PageInformation fromPDPage(int pageNum, PDPage page) {

        PDRectangle box = page.getMediaBox();
        Rectangle2D boxAsRectangle = new Rectangle2D.Double(box.getLowerLeftX(), box.getLowerLeftY(), box.getWidth(), box.getHeight());
        return new PageInformation(boxAsRectangle, pageNum, page.getRotation());
    }


    /** @return the height of the media box */
    public double height() {

        return mediabox.getHeight();
    }


    /** @return the width of the media box */
    public double width() {

        return mediabox.getWidth();
    }


    /** @return the x coordinate of the media box */
    public double minX() {

        return mediabox.getX();
    }


    /** @return the y coordinate of the media box */
    public double minY() {

        return mediabox.getY();
    }

}

View File

@ -1,117 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.Rectangle;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.List;
import io.github.karols.hocr4j.Bounds;
/**
 * A quadrilateral described by its four corner points.
 */
public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {

    /*
    B _____ C
     |     |
    A|_____|D
    */


    /**
     * Creates a QuadPoint from the corners of a rectangle: a = (x, y), b = (x, maxY), c = (maxX, maxY), d = (maxX, y).
     */
    public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {

        double x = rectangle2D.getX();
        double y = rectangle2D.getY();
        double maxX = rectangle2D.getMaxX();
        double maxY = rectangle2D.getMaxY();
        return new QuadPoint(new Point2D.Double(x, y), new Point2D.Double(x, maxY), new Point2D.Double(maxX, maxY), new Point2D.Double(maxX, y));
    }


    /**
     * Creates a QuadPoint from hOCR bounds: a = (left, bottom), b = (left, top), c = (right, top), d = (right, bottom).
     */
    public static QuadPoint fromBounds(Bounds bounds) {

        double left = bounds.getLeft();
        double right = bounds.getRight();
        double top = bounds.getTop();
        double bottom = bounds.getBottom();
        return new QuadPoint(new Point2D.Double(left, bottom), new Point2D.Double(left, top), new Point2D.Double(right, top), new Point2D.Double(right, bottom));
    }


    /**
     * Returns the axis-aligned rectangle that encloses all four points.
     */
    public Rectangle2D getBounds2D() {

        double minX = min(a.getX(), b.getX(), c.getX(), d.getX());
        double minY = min(a.getY(), b.getY(), c.getY(), d.getY());
        double maxX = max(a.getX(), b.getX(), c.getX(), d.getX());
        double maxY = max(a.getY(), b.getY(), c.getY(), d.getY());
        return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
    }


    private static double min(double first, double second, double third, double fourth) {

        return Math.min(Math.min(Math.min(first, second), third), fourth);
    }


    private static double max(double first, double second, double third, double fourth) {

        return Math.max(Math.max(Math.max(first, second), third), fourth);
    }


    /**
     * Returns a new QuadPoint whose four points are transformed with the given transform.
     */
    public QuadPoint getTransformed(AffineTransform at) {

        Point2D newA = at.transform(a, null);
        Point2D newB = at.transform(b, null);
        Point2D newC = at.transform(c, null);
        Point2D newD = at.transform(d, null);
        return new QuadPoint(newA, newB, newC, newD);
    }


    /**
     * Determines if the given QuadPoint aligns with this QuadPoint within a given threshold.
     * It does so by trying every possible combination of aligning sides. It starts with the most likely combination of ab and cd.
     *
     * @param other     The QuadPoint to compare with.
     * @param threshold The maximum distance allowed for alignment.
     * @return True if the QuadPoints align within the threshold, false otherwise.
     */
    public boolean aligns(QuadPoint other, double threshold) {

        List<Line2D> ownSides = List.of(new Line2D.Double(a, b), new Line2D.Double(c, d), new Line2D.Double(b, c), new Line2D.Double(d, a));
        List<Line2D> otherSides = List.of(new Line2D.Double(other.c, other.d),
                                          new Line2D.Double(other.a, other.b),
                                          new Line2D.Double(other.b, other.c),
                                          new Line2D.Double(other.d, other.a));

        for (Line2D ownSide : ownSides) {
            for (Line2D otherSide : otherSides) {
                if (aligns(ownSide, otherSide, threshold)) {
                    return true;
                }
            }
        }
        return false;
    }


    private static boolean aligns(Line2D a, Line2D b, double threshold) {

        return aligns(a.getP1(), a.getP2(), b.getP1(), b.getP2(), threshold);
    }


    /**
     * Two sides align if their end points lie pairwise closer than the threshold, in either direction.
     */
    private static boolean aligns(Point2D a, Point2D b, Point2D a2, Point2D b2, double threshold) {

        boolean sameDirection = a.distance(a2) < threshold && b.distance(b2) < threshold;
        return sameDirection || (a.distance(b2) < threshold && b.distance(a2) < threshold);
    }


    @Override
    public String toString() {

        return String.format("A:(%.2f, %.2f) | B:(%.2f, %.2f) | C:(%.2f, %.2f) | D:(%.2f, %.2f)", a().getX(), a().getY(), b().getX(), b().getY(), c().getX(), c().getY(), d().getX(), d().getY());
    }


    /**
     * Returns the product of the lengths of the sides ab and ad.
     */
    public double size() {

        double lengthAb = a().distance(b());
        double lengthAd = a().distance(d());
        return lengthAb * lengthAd;
    }

}

View File

@ -1,14 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * A rendered page image that is stored as a file.
 *
 * @param pageNumber       the number of the rendered page
 * @param absoluteFilePath the absolute path of the image file
 */
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage {

    /**
     * Reads the image file with Leptonica.
     *
     * @return the result of {@code Leptonica1.pixRead} for the file path
     */
    @Override
    public Pix asPix() {

        return Leptonica1.pixRead(absoluteFilePath());
    }

}

View File

@ -1,82 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Pix;
/**
 * OCR image that was created by rendering a whole page.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class RenderedPageOcrImage implements OcrImage {

    int height;
    int width;
    PageInformation pageInformation;
    Pix pix;
    int rotationDegrees;


    /**
     * Builds the transform from image coordinates to page coordinates.
     * Applied to a point, the image is first scaled to the page size, then mirrored vertically and finally rotated.
     *
     * @return the combined transform
     */
    @Override
    public AffineTransform getImageCTM() {

        double scale = calculateScalingFactor();
        AffineTransform scaling = new AffineTransform(scale, 0, 0, scale, 0, 0);
        AffineTransform mirror = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());

        // matrix multiplication is performed from right to left, so the order is reversed.
        // scaling -> mirror -> rotation
        AffineTransform combined = new AffineTransform();
        combined.concatenate(buildRotationMatrix());
        combined.concatenate(mirror);
        combined.concatenate(scaling);
        return combined;
    }


    /**
     * Selects the rotation matrix for the total rotation; any value other than 90, 180 or 270 yields the identity.
     */
    private AffineTransform buildRotationMatrix() {

        double pageWidth = pageInformation.width();
        double pageHeight = pageInformation.height();
        return switch (calculateTotalRotation()) {
            case 90 -> new AffineTransform(0, 1, -1, 0, pageHeight, 0);
            case 180 -> new AffineTransform(-1, 0, 0, -1, pageWidth, pageHeight);
            case 270 -> new AffineTransform(0, -1, 1, 0, pageWidth - pageHeight, pageHeight); // results from 90 + 180 rotations
            default -> new AffineTransform();
        };
    }


    /**
     * Adds the page rotation and the reversed image rotation, modulo 360.
     */
    private int calculateTotalRotation() {

        int reversedImageRotation = 360 - rotationDegrees;
        return (pageInformation.rotationDegrees() + reversedImageRotation) % 360;
    }


    @Override
    public int getPageNumber() {

        return pageInformation.number();
    }


    /**
     * @return always 0
     */
    @Override
    public int getNumberOnPage() {

        return 0;
    }


    /**
     * Divides the page width by the image width; for pages rotated by 90 or 270 degrees the page height is used as page width.
     */
    private double calculateScalingFactor() {

        // PDFBox always returns page height and width based on rotation
        int pageRotation = pageInformation.rotationDegrees();
        boolean sidesSwapped = pageRotation == 90 || pageRotation == 270;
        double pageWidth = sidesSwapped ? pageInformation.height() : pageInformation.width();
        return pageWidth / width;
    }

}

View File

@ -1,135 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import io.github.karols.hocr4j.Word;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * The text of a recognized word and its position in image coordinates, plus the CTM of the image.
 */
@Getter
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TextPositionInImage {

    final QuadPoint position;
    final String text;
    final AffineTransform imageCTM;

    @Setter
    FontMetricsFactory fontMetricsFactory;
    @Setter
    FontStyle fontStyle;


    /**
     * Takes position and text from the given word.
     *
     * @param word               the recognized word
     * @param imageCTM           the CTM of the image the word was found in
     * @param fontMetricsFactory the factory used for font and metrics calculations
     * @param fontStyle          the font style of the text
     */
    public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory, FontStyle fontStyle) {

        this.text = word.getText();
        this.position = QuadPoint.fromBounds(word.getBounds());
        this.imageCTM = imageCTM;
        this.fontStyle = fontStyle;
        this.fontMetricsFactory = fontMetricsFactory;
    }


    /**
     * @return the position transformed with the image CTM
     */
    public QuadPoint getTransformedTextBBox() {

        return position.getTransformed(imageCTM);
    }


    public PDFont getFont() {

        return fontMetricsFactory.getFont();
    }


    /**
     * Builds the text matrix for this text from the image CTM, the position in the image and the font metrics.
     *
     * @return the text matrix
     */
    public Matrix getTextMatrix() {

        FontMetrics fontMetrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());

        // scale with transformation coefficient, such that fontsize may be set with transformed width.
        double horizontalRatio = getWidth() / getTransformedWidth();
        double verticalRatio = getHeight() / getTransformedHeight();
        Point2D anchor = position.a();

        // Matrix multiplication is from right to left:
        // convert to image coords -> subtract descent -> scale height -> reverse imageCTM scaling -> translate to coordinates in image -> convert to pdf coords
        // width must not be set, since it is scaled with the fontsize attribute
        AffineTransform textTransform = new AffineTransform();
        textTransform.concatenate(imageCTM);
        textTransform.translate(anchor.getX(), anchor.getY());
        textTransform.scale(horizontalRatio, verticalRatio);
        textTransform.scale(1, fontMetrics.getHeightScaling());
        textTransform.translate(0, fontMetrics.getDescent());
        // start in image coordinates, with (0,0) being top left and negative height.
        textTransform.concatenate(new AffineTransform(1, 0, 0, -1, 0, 0));

        return new Matrix(textTransform);
    }


    public double getFontSize() {

        return fontMetricsFactory.calculateFontSize(text, getTransformedWidth());
    }


    /**
     * @return the distance between the transformed points a and d
     */
    public double getTransformedWidth() {

        return transformedA().distance(transformedD());
    }


    /**
     * @return the distance between the transformed points a and b
     */
    public double getTransformedHeight() {

        return transformedA().distance(transformedB());
    }


    /**
     * @return the distance between the points a and d in image coordinates
     */
    public double getWidth() {

        return position.a().distance(position.d());
    }


    /**
     * @return the calculated font size multiplied with the height scaling of the calculated metrics
     */
    public double getTextHeight() {

        FontMetrics fontMetrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
        return fontMetricsFactory.calculateFontSize(text, getTransformedWidth()) * fontMetrics.getHeightScaling();
    }


    /**
     * @return the distance between the points a and b in image coordinates
     */
    public double getHeight() {

        return position.a().distance(position.b());
    }


    public Point2D transformedA() {

        return toTransformed(position.a());
    }


    public Point2D transformedB() {

        return toTransformed(position.b());
    }


    public Point2D transformedC() {

        return toTransformed(position.c());
    }


    public Point2D transformedD() {

        return toTransformed(position.d());
    }


    /**
     * Transforms a point with the image CTM into a new point.
     */
    private Point2D toTransformed(Point2D pointInImage) {

        return imageCTM.transform(pointInImage, null);
    }

}

View File

@ -1,9 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import net.sourceforge.lept4j.Pix;
/**
 * An image that can be provided as a Leptonica {@link Pix}.
 */
public interface UnprocessedImage {

    /**
     * @return this image as a Pix
     */
    Pix asPix();

}

View File

@ -1,58 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import java.util.List;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Bundles the image of an OCR result with the text positions and word images of its words.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public final class FontStyleDetectionModel {

    QuadPoint imageBounds;
    Pix image;
    List<TextPositionAndWordImage> textPositionsAndWordImages;


    /**
     * Reads the image stored at the tesseract output path with the suffix ".tiff" and creates a
     * {@link TextPositionAndWordImage} for every non-blank word of the OCR result.
     *
     * @param ocrResult          the OCR result to build the model from
     * @param fontMetricsFactory the factory passed on to every word
     * @param settings           the settings passed on to every word
     * @return the model
     */
    public static FontStyleDetectionModel fromOcrResult(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory, OcrServiceSettings settings) {

        Pix pageImage = Leptonica1.pixRead(ocrResult.tesseractOutputFilePath() + ".tiff");
        List<TextPositionAndWordImage> words = ocrResult.getAllWords()
                .stream()
                .filter(candidate -> !candidate.isBlank())
                .map(candidate -> TextPositionAndWordImage.create(ocrResult.image().getImageCTM(), candidate, pageImage, settings, fontMetricsFactory))
                .toList();
        QuadPoint bounds = ocrResult.image().getImageCoordinatesInInitialUserSpace();
        return new FontStyleDetectionModel(bounds, pageImage, words);
    }


    /**
     * @return the text positions of all words
     */
    public List<TextPositionInImage> getTextPositionInImages() {

        return textPositionsAndWordImages.stream().map(entry -> entry.getTextPositionInImage()).toList();
    }


    /**
     * @return the word images of all words
     */
    public List<WordImage> getWordImages() {

        return textPositionsAndWordImages.stream().map(entry -> entry.getWordImage()).toList();
    }


    /**
     * Disposes the image and afterwards every word image.
     */
    public void dispose() {

        LeptUtils.disposePix(image);
        for (WordImage wordImage : getWordImages()) {
            wordImage.dispose();
        }
    }

}

View File

@ -1,52 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import java.awt.geom.AffineTransform;
import java.util.Objects;
import org.apache.commons.math3.ml.clustering.Clusterable;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import io.github.karols.hocr4j.Word;
import lombok.Getter;
import net.sourceforge.lept4j.Pix;
/**
 * Pairs the text position of a word with the image of that word. Clustering is delegated to the word image.
 */
@Getter
public final class TextPositionAndWordImage implements Clusterable {

    private final TextPositionInImage textPositionInImage;
    private final WordImage wordImage;


    public TextPositionAndWordImage(TextPositionInImage textPositionInImage, WordImage wordImage) {

        this.wordImage = wordImage;
        this.textPositionInImage = textPositionInImage;
    }


    /**
     * Creates the text position (with regular font style) and the word image for the given word.
     *
     * @param imageCTM           the CTM of the image containing the word
     * @param word               the recognized word
     * @param image              the image containing the word
     * @param settings           the settings passed to the word image
     * @param fontMetricsFactory the factory passed to the text position
     * @return the pair of text position and word image
     */
    public static TextPositionAndWordImage create(AffineTransform imageCTM, Word word, Pix image, OcrServiceSettings settings, FontMetricsFactory fontMetricsFactory) {

        TextPositionInImage position = new TextPositionInImage(word, imageCTM, fontMetricsFactory, FontStyle.REGULAR);
        double textHeight = position.getTextHeight();
        WordImage clippedWord = new WordImage(textHeight, word, image, settings);
        return new TextPositionAndWordImage(position, clippedWord);
    }


    /**
     * @return the point of the word image
     */
    @Override
    public double[] getPoint() {

        return wordImage.getPoint();
    }


    /**
     * @return the text height of the word image
     */
    public double getTextHeight() {

        return wordImage.getTextHeight();
    }

}

View File

@ -1,71 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import org.apache.commons.math3.ml.clustering.Clusterable;
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.StrokeWidthCalculator;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import io.github.karols.hocr4j.Word;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * The image of a single word, clipped from a larger image, together with its text and text height.
 * The text height is the only dimension used for clustering.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class WordImage implements Clusterable {

    Pix image;
    String text;
    double textHeight;
    OcrServiceSettings settings;


    /**
     * Clips the bounds of the word from the original image.
     *
     * @param textHeight    the text height of the word
     * @param word          the recognized word, providing bounds and text
     * @param originalImage the image to clip the word from
     * @param settings      the settings providing the bold threshold
     */
    public WordImage(double textHeight, Word word, Pix originalImage, OcrServiceSettings settings) {

        var bounds = word.getBounds();
        Box clipBox = new Box(bounds.getLeft(), bounds.getTop(), bounds.getWidth(), bounds.getHeight(), 1);
        this.image = Leptonica1.pixClipRectangle(originalImage, clipBox, null);
        clipBox.clear();
        this.settings = settings;
        this.textHeight = textHeight;
        this.text = word.getText();
    }


    /**
     * Opens the word image with a brick one pixel larger than the rounded stroke width and compares the remaining
     * pixel density, corrected by the rounding error, with the original pixel density.
     *
     * @param strokeWidth the stroke width to compare with
     * @return true if the density ratio exceeds the configured bold threshold
     */
    public boolean hasLargerStrokeWidth(double strokeWidth) {

        int brickBase = (int) Math.round(strokeWidth);
        double roundingError = (brickBase - strokeWidth) / strokeWidth;

        // add 1 to open a bit bigger than the estimated regular stroke width
        int brickSize = brickBase + 1;
        Pix opened = Leptonica1.pixOpenBrick(null, image, brickSize, brickSize);

        double densityAfterOpening = ImageProcessingUtils.calculatePixelDensity(opened);
        double densityBeforeOpening = ImageProcessingUtils.calculatePixelDensity(image);
        LeptUtils.disposePix(opened);

        double correctedDensity = densityAfterOpening * (1 + roundingError);
        return correctedDensity / densityBeforeOpening > settings.getBoldThreshold();
    }


    /**
     * @return a one-element array containing the text height
     */
    @Override
    public double[] getPoint() {

        return new double[]{textHeight};
    }


    /**
     * Disposes the clipped word image.
     */
    public void dispose() {

        LeptUtils.disposePix(image);
    }

}

View File

@ -1,66 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Stores and downloads the files of a document in the storage of the current tenant.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class FileStorageService {

    private final StorageService storageService;


    /**
     * Builds the storage id: {@code <dossierId>/<fileId>.<type name><type extension>}.
     */
    public static String getStorageId(String dossierId, String fileId, FileType fileType) {

        String fileName = fileId + "." + fileType.name() + fileType.getExtension();
        return dossierId + "/" + fileName;
    }


    /**
     * @return true if an object of type UNTOUCHED exists for the given file
     */
    public boolean untouchedFileExists(String dossierId, String fileId) {

        String untouchedId = getStorageId(dossierId, fileId, FileType.UNTOUCHED);
        return storageService.objectExists(TenantContext.getTenantId(), untouchedId);
    }


    /**
     * Stores the document file as ORIGIN and the viewer document file as VIEWER_DOCUMENT.
     */
    @SneakyThrows
    public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {

        upload(dossierId, fileId, FileType.ORIGIN, documentFile);
        upload(dossierId, fileId, FileType.VIEWER_DOCUMENT, viewerDocumentFile);
    }


    /**
     * Downloads the ORIGIN file to {@code documentFile}. The viewer document is downloaded if it exists, otherwise
     * the document file is copied to {@code viewerDocumentFile}. If no UNTOUCHED file exists yet, the downloaded
     * document file is stored as UNTOUCHED.
     */
    @SneakyThrows
    public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {

        storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile);

        boolean viewerDocumentStored = storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT));
        if (!viewerDocumentStored) {
            Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } else {
            storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile);
        }

        if (untouchedFileExists(dossierId, fileId)) {
            return;
        }
        upload(dossierId, fileId, FileType.UNTOUCHED, documentFile);
    }


    /**
     * Streams the given file into the storage under the storage id of the given file type.
     */
    @SneakyThrows
    private void upload(String dossierId, String fileId, FileType fileType, File source) {

        try (var in = new FileInputStream(source)) {
            storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, fileType), in);
        }
    }

}

View File

@ -1,169 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
public class GhostScriptService {
static String FORMAT = ".tiff";
static String DEVICE = "tiffgray";
OcrServiceSettings settings;
/**
 * Renders the given pages with Ghostscript, batch by batch, and forwards the rendered image files to the
 * image processing queue via an asynchronous transfer thread.
 * Within a batch all Ghostscript processes are started together and awaited before the next batch starts.
 *
 * @param stitchedPageNumbers  the numbers of the pages to render
 * @param documentAbsolutePath the absolute path of the document to render
 * @param tmpImageDir          the directory passed to the Ghostscript command builder
 * @param document             the opened document (not used by this method)
 * @param imageProcessingQueue the queue the transfer thread is created with as target
 * @param stats                receives the duration of every batch
 */
@SneakyThrows
public void renderPagesAsImagesBatchedAndAddToQueue(List<Integer> stitchedPageNumbers,
                                                    String documentAbsolutePath,
                                                    Path tmpImageDir,
                                                    PDDocument document,
                                                    BlockingQueue<UnprocessedImage> imageProcessingQueue,
                                                    Statistics stats) {

    BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
    BlockingQueueFiller asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
    asyncTransferThread.start();
    int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());

    List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
                                                                            numOfProcesses,
                                                                            256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
    for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
        long timestamp = System.currentTimeMillis();
        List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);

        log.info("Batch {}: Running {} gs processes with ({}) pages each",
                 batchIdx,
                 processInfos.size(),
                 processInfos.stream().map(info -> info.stitchedPageNumbers().size()).map(String::valueOf).collect(Collectors.joining(", ")));

        int finalBatchIdx = batchIdx;
        List<Process> processes = processInfos.stream()
                .parallel()
                .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath))
                .peek(s -> log.debug(String.join(" ", s.cmdArgs())))
                .map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue))
                .toList();

        List<Integer> processExitCodes = new ArrayList<>(processes.size());
        for (Process process : processes) {
            processExitCodes.add(process.waitFor());
        }
        stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
        log.info("Batch {}: Ghostscript processes finished with exit codes {}", batchIdx, processExitCodes);

        // A non-zero exit code must not go unnoticed; rendering still continues with the remaining batches as before.
        if (processExitCodes.stream().anyMatch(exitCode -> exitCode != 0)) {
            log.warn("Batch {}: at least one Ghostscript process finished with a non-zero exit code, rendered pages may be missing", batchIdx);
        }
    }
    // NOTE(review): this line is not reached if an exception is thrown above - confirm the transfer thread still terminates in that case.
    asyncTransferThread.setAllImagesQueued(true);
}
private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {
// GhostScript command line can only handle so many page numbers at once, so we split it into batches
int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);
log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);
List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream().sorted().toList(), processCount, batchCount);
for (var batch : batchedBalancedSublist) {
List<ProcessInfo> processInfos = new ArrayList<>(processCount);
for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
List<Integer> balancedPageNumbersSubList = batch.get(threadIdx);
processInfos.add(new ProcessInfo(threadIdx, balancedPageNumbersSubList));
}
processInfoBatches.add(processInfos);
}
return processInfoBatches;
}
@SneakyThrows
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
Integer batchIdx,
List<Integer> stitchedImagePageIndices,
Path outputDir,
String documentAbsolutePath) {
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
Map<Integer, RenderedPageImageFile> fullPageImages = new HashMap<>();
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
Integer pageNumber = stitchedImagePageIndices.get(i);
fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
}
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
}
private String[] buildCmdArgs(List<Integer> stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) {
StringBuilder sPageList = new StringBuilder();
int i = 1;
for (Integer integer : stitchedImagePageIndices) {
sPageList.append(integer);
if (i < stitchedImagePageIndices.size()) {
sPageList.append(",");
}
i++;
}
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
return cmdArgs;
}
@SneakyThrows
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue) {
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
InputStream stdOut = p.getInputStream();
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue);
InputStream stdError = p.getErrorStream();
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError);
stdOutLogger.start();
stdErrorLogger.start();
return p;
}
private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, RenderedPageImageFile> renderedPageImageFiles) {
}
private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {
}
}

View File

@ -1,24 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.FileInputStream;
import java.util.List;
import io.github.karols.hocr4j.Page;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Reads the hOCR output written by tesseract and parses it into a hocr4j {@link Page}.
 */
@UtilityClass
public class HOcrPageParser {

    /**
     * Parses the file {@code tesseractOutputFileName + ".hocr"}.
     *
     * @param tesseractOutputFileName path of the tesseract output without the {@code .hocr} extension
     * @return the first page found in the hOCR file
     */
    @SneakyThrows
    public Page extractHocrPage(String tesseractOutputFileName) {

        String hOcrString;
        try (var hocrIn = new FileInputStream(tesseractOutputFileName + ".hocr")) {
            // decode explicitly as UTF-8: the platform default charset would garble non-ASCII OCR text
            hOcrString = new String(hocrIn.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
        }
        return Page.fromHocr(List.of(hOcrString)).get(0);
    }

}

View File

@ -1,16 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import org.springframework.stereotype.Service;
/**
 * Outgoing notifications about the OCR of a file.
 * NOTE(review): {@code @Service} on an interface presumably has no effect on bean registration; the
 * implementing class is what needs to be a bean - confirm before relying on it.
 */
@Service
public interface IOcrMessageSender {
/**
 * Reports progress for a file; {@code OcrProgressLogger} passes the number of processed images and
 * the number of images currently scheduled.
 */
void sendUpdate(String fileId, int finishedImages, int totalImages);
/** Presumably announces that OCR has started for the file - no caller is visible in this package excerpt. */
void sendOCRStarted(String fileId);
/** Reports that OCR is done; {@code OcrProgressLogger} passes the number of scheduled images as {@code totalImages}. */
void sendOcrFinished(String fileId, int totalImages);
/** Presumably sends the final OCR response for the file of the given dossier - TODO confirm with the implementation. */
void sendOcrResponse(String dossierId, String fileId);
}

View File

@ -1,96 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.Getter;
import lombok.SneakyThrows;
/**
 * PDFBox stream engine that collects the image XObjects drawn on a page, together with the
 * transformation matrix that is current when each image is drawn.
 * <p>
 * The collected images are reset on every call of {@link #processPage(int, PDPage)}.
 */
@Getter
public class ImageStreamEngine extends PDFStreamEngine {

    // never null, so the inherited processPage(PDPage) cannot fail on an uninitialised list
    private List<ExtractedImage> imagesOnCurrentPage = new LinkedList<>();
    private OcrServiceSettings settings;
    private int pageNum;


    /**
     * @param settings provides the minimum width and height an image needs in order to be collected
     */
    public ImageStreamEngine(OcrServiceSettings settings) {

        this.settings = settings;

        // preparing PDFStreamEngine
        addOperator(new Concatenate(this));
        addOperator(new DrawObject(this));
        addOperator(new SetGraphicsStateParameters(this));
        addOperator(new Save(this));
        addOperator(new Restore(this));
        addOperator(new SetMatrix(this));
    }


    /**
     * Intercepts the {@code Do} operator: images that are large enough are recorded, form XObjects are
     * processed recursively. Every other operator is delegated to the super class.
     *
     * @throws IOException if the XObject or its image data cannot be read
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {

        if (!"Do".equals(operator.getName())) {
            super.processOperator(operator, operands);
            return;
        }

        // a malformed content stream may carry no operand or a non-name operand: skip instead of throwing an unchecked exception
        if (operands.isEmpty() || !(operands.get(0) instanceof COSName objectName)) {
            return;
        }

        // get the PDF object
        PDXObject xobject = getResources().getXObject(objectName);

        // check if the object is an image object
        if (xobject instanceof PDImageXObject imageXObject) {

            if (imageXObject.getWidth() < settings.getMinImageWidth() || imageXObject.getHeight() < settings.getMinImageHeight()) {
                return;
            }

            Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();

            this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
                                                            QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
                                                            imageXObject.getHeight(),
                                                            imageXObject.getWidth(),
                                                            imageXObject.getImage(),
                                                            imageCTM,
                                                            imagesOnCurrentPage.size(), // index of the image on this page
                                                            imageXObject.getColorSpace()));

        } else if (xobject instanceof PDFormXObject form) {
            showForm(form);
        }
    }


    /**
     * Processes the content stream of the page and replaces the previously collected images with the
     * images of this page.
     *
     * @param pageNum page number stored in every image extracted from this page
     * @param page    the page to process
     */
    @SneakyThrows
    public void processPage(int pageNum, PDPage page) {

        this.pageNum = pageNum;
        this.imagesOnCurrentPage = new LinkedList<>();
        super.processPage(page);
    }

}

View File

@ -1,176 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.stream.IntStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import org.springframework.util.FileSystemUtils;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.pdftron.pdf.PDFDoc;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Orchestrates the OCR of a single PDF: optional watermark removal, removal of invisible elements,
 * image extraction, tesseract OCR, font style detection and writing the results back into the PDF.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OCRService {

    FileStorageService fileStorageService;
    OcrServiceSettings settings;
    IOcrMessageSender ocrMessageSender;
    WatermarkRemovalService watermarkRemovalService;
    InvisibleElementRemovalService invisibleElementRemovalService;
    OcrResultWriter ocrResultWriter;
    GhostScriptService ghostScriptService;
    FontStyleDetector boldDetector;
    ObservationRegistry registry;


    /**
     * Starts the OCR-Process: Collecting images (via threads),
     * looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
     * perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
     *
     * @param dossierId          Id of dossier
     * @param fileId             Id of file
     * @param removeWatermark    whether watermarks are removed from the document before OCR
     * @param tmpDir             working directory for all files
     * @param documentFile       the file to perform ocr on, results are written invisibly
     * @param viewerDocumentFile debugging file, results are written visibly in an optional content group
     */
    @Observed(name = "OCRService", contextualName = "run-ocr-on-document")
    @SneakyThrows
    public void runOcrOnDocument(String dossierId, String fileId, boolean removeWatermark, Path tmpDir, File documentFile, File viewerDocumentFile) {

        if (removeWatermark) {
            removeWatermarkIfEnabled(documentFile);
        }
        removeInvisibleElements(documentFile);

        log.info("Starting OCR for file {}", fileId);
        long ocrStart = System.currentTimeMillis();

        Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId);

        long ocrEnd = System.currentTimeMillis();
        log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
        log.info("Runtime breakdown: {}", stats);
    }


    /**
     * Runs the invisible element removal on the file and overwrites the file with the result.
     */
    @SneakyThrows
    private void removeInvisibleElements(File originFile) {

        Path tmpFile = Files.createTempFile("invisibleElements", ".pdf");
        try {
            try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
                invisibleElementRemovalService.removeInvisibleElements(in, out, false, false);
            }
            Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } finally {
            deleteTempFile(tmpFile);
        }
    }


    /**
     * Runs the watermark removal on the file and overwrites the file with the result.
     */
    @SneakyThrows
    private void removeWatermarkIfEnabled(File originFile) {

        Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf");
        try {
            try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
                watermarkRemovalService.removeWatermarks(in, out);
            }
            Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } finally {
            deleteTempFile(tmpFile);
        }
    }


    /**
     * Deletes a temporary file. This must not happen inside an {@code assert}: assertions are disabled
     * by default at runtime, in which case the file would never be deleted.
     */
    private void deleteTempFile(Path tmpFile) {

        if (!tmpFile.toFile().delete()) {
            log.warn("Could not delete temporary file {}", tmpFile);
        }
    }


    /**
     * Extracts the images of the document, runs tesseract on them in parallel consumer threads, detects
     * font styles and writes the OCR text to {@code documentFile} and {@code viewerDocumentFile}.
     *
     * @param tmpDir             working directory; the sub directories {@code images} and {@code tesseract_output} are created in it
     * @param documentFile       the file to perform ocr on
     * @param viewerDocumentFile the file receiving the additional debug layers
     * @param fileId             Id of file, used for progress messages and logging
     * @param dossierId          Id of dossier, used for logging
     * @return the collected runtime statistics
     */
    @SneakyThrows
    public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) {

        long timestamp;

        Path tmpImageDir = tmpDir.resolve("images");
        Path tesseractOutputDir = tmpDir.resolve("tesseract_output");

        tesseractOutputDir.toFile().mkdirs();
        tmpImageDir.toFile().mkdirs();

        try (PDDocument document = Loader.loadPDF(documentFile)) {
            OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);

            int numberOfExtractThreads = Math.min(settings.getImageExtractThreadCount(), document.getNumberOfPages());
            int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
            Statistics stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);

            BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));

            OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
                                                                  documentFile,
                                                                  tmpImageDir,
                                                                  numberOfExtractThreads,
                                                                  ghostScriptService,
                                                                  ocrImageQueue,
                                                                  logger,
                                                                  settings,
                                                                  stats);
            ocrImageFactory.start();

            // NOTE(review): this list is shared by all OCR threads - confirm that OCRThread synchronizes its writes
            List<OcrResult> ocrResults = new LinkedList<>();
            List<OCRThread> ocrThreads = IntStream.range(0, numberOfOcrThreads)
                    .boxed()
                    .map(id -> new OCRThread(id, ocrImageQueue, tesseractOutputDir, ocrResults, logger, stats, settings))
                    .peek(Thread::start)
                    .toList();
            log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());

            try {
                ocrImageFactory.join();
                log.info("Processed all images, interrupting ocr threads");
            } finally {
                // also interrupt when image extraction failed, otherwise the consumer threads are never told to stop
                ocrThreads.forEach(Thread::interrupt);
            }
            for (OCRThread ocrThread : ocrThreads) {
                ocrThread.join();
            }
            log.info("Tesseract OCR has finished for file {} and dossier {}", fileId, dossierId);

            timestamp = System.currentTimeMillis();
            Map<Integer, List<OcrResultToWrite>> imageWithTextPositionsPerPage = boldDetector.detectBold(ocrResults, document);
            stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);

            timestamp = System.currentTimeMillis();
            ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage);
            log.info("Saving document");
            stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);

            logger.sendFinished();
            return stats;
        }
    }

}

View File

@ -1,105 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
import lombok.AccessLevel;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Produces the images that are handed to OCR: image extraction threads read the pages of the document,
 * pages collected in {@code stitchedPageNumbers} are rendered with Ghostscript afterwards, and a single
 * image processing thread moves the results from the internal queue to the OCR queue.
 */
@Slf4j
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrImageFactory {

    PDDocument document;
    File documentFile;
    Path tmpImageDir;
    GhostScriptService ghostScriptService;
    BlockingQueue<UnprocessedImage> imageProcessingQueue;
    ImageProcessingThread imageProcessingThread;
    BlockingQueue<OcrImage> imageOutputQueue;
    List<ImageExtractionThread> imageExtractionThreads;
    List<Integer> stitchedPageNumbers;
    Statistics stats;


    /**
     * Creates the extraction threads and the processing thread; none of them is started before {@link #start()}.
     *
     * @param document           the loaded document
     * @param documentFile       the file the document was loaded from
     * @param tmpImageDir        directory for images rendered by Ghostscript
     * @param numberOfThreads    number of image extraction threads the pages are balanced across
     * @param ghostScriptService renders the pages collected in the stitched page list
     * @param imageOcrQueue      output queue; its remaining capacity also sizes the internal processing queue
     * @param logger             progress logger handed to the extraction threads
     * @param settings           service settings handed to the threads
     * @param stats              runtime statistics
     */
    public OcrImageFactory(PDDocument document,
                           File documentFile,
                           Path tmpImageDir,
                           int numberOfThreads,
                           GhostScriptService ghostScriptService,
                           BlockingQueue<OcrImage> imageOcrQueue,
                           OcrProgressLogger logger,
                           OcrServiceSettings settings,
                           Statistics stats) {

        this.document = document;
        this.documentFile = documentFile;
        this.tmpImageDir = tmpImageDir;
        this.ghostScriptService = ghostScriptService;
        this.imageOutputQueue = imageOcrQueue;
        this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity());
        // filled concurrently by the extraction threads
        this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
        this.stats = stats;

        this.imageExtractionThreads = new ArrayList<>(numberOfThreads);

        List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
        for (int i = 0; i < balancedPageNumbers.size(); i++) {
            imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
        }
        this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document);

        log.info("Started {} image extraction threads, with ({}) pages each",
                 imageExtractionThreads.size(),
                 imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
    }


    /**
     * Starts all extraction threads and the image processing thread.
     */
    public void start() {

        for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
            imageExtractionThread.start();
        }
        imageProcessingThread.start();
    }


    /**
     * Waits for the extraction threads, renders the stitched pages (if any) with Ghostscript and then
     * waits for the image processing thread.
     * <p>
     * The processing thread is signalled and interrupted even when extraction or rendering fails, so it
     * is not left waiting for images that will never arrive; in that case the failure is propagated
     * without waiting for the thread.
     */
    @SneakyThrows
    public void join() {

        try {
            for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
                imageExtractionThread.join();
            }
            if (!stitchedPageNumbers.isEmpty()) {
                ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
            }
        } finally {
            imageProcessingThread.setAllImagesExtracted(true);
            imageProcessingThread.interrupt();
        }
        imageProcessingThread.join();
    }

}

View File

@ -1,91 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Tracks which images of a file are scheduled for OCR and which are done, logs the progress and
 * forwards it to the {@link IOcrMessageSender}.
 * <p>
 * Every page starts with one placeholder entry (image number 0), which is removed again when the page
 * is reported as skipped.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrProgressLogger {

    Set<ImageNumberWithPageNumber> imagesToProcess;
    Set<ImageNumberWithPageNumber> processedImages;
    IOcrMessageSender ocrMessageSender;
    String fileId;


    /**
     * @param totalPageCount   number of pages; one placeholder per page (1-based) is scheduled
     * @param ocrMessageSender receiver of the progress messages
     * @param fileId           Id of the file the progress belongs to
     */
    public OcrProgressLogger(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId) {

        this.fileId = fileId;
        this.ocrMessageSender = ocrMessageSender;
        this.processedImages = Collections.synchronizedSet(new HashSet<>(totalPageCount));
        this.imagesToProcess = Collections.synchronizedSet(new HashSet<>(totalPageCount));
        for (int pageNumber = 1; pageNumber <= totalPageCount; pageNumber++) {
            imagesToProcess.add(new ImageNumberWithPageNumber(0, pageNumber));
        }
    }


    /**
     * Marks the image as processed, logs it and sends a progress update.
     *
     * @param image the finished image
     * @param psm   the PSM value that was used, logged only
     */
    public void logImageFinished(OcrImage image, int psm) {

        ImageNumberWithPageNumber finished = new ImageNumberWithPageNumber(image.getNumberOnPage(), image.getPageNumber());
        processedImages.add(finished);

        if (!(image instanceof ExtractedOcrImage)) {
            log.info("{}/{}: Finished page {} as fully rendered page with rotation {}, used PSM {}",
                     processedImages.size(),
                     imagesToProcess.size(),
                     image.getPageNumber(),
                     image.getRotationDegrees(),
                     psm);
        } else {
            log.info("{}/{}: Finished image {} on page {} with rotation {}, used PSM {}, quad-point: {}",
                     processedImages.size(),
                     imagesToProcess.size(),
                     image.getNumberOnPage(),
                     image.getPageNumber(),
                     image.getRotationDegrees(),
                     psm,
                     image.getImageCoordinatesInInitialUserSpace());
        }

        ocrMessageSender.sendUpdate(fileId, processedImages.size(), imagesToProcess.size());
    }


    /**
     * Removes the placeholder of a page without images to OCR and sends a progress update.
     */
    public void logPageSkipped(Integer pageIndex) {

        imagesToProcess.remove(new ImageNumberWithPageNumber(0, pageIndex));
        log.debug("{}/{}: No images to ocr on page {}", processedImages.size(), imagesToProcess.size(), pageIndex);
        ocrMessageSender.sendUpdate(fileId, processedImages.size(), imagesToProcess.size());
    }


    /**
     * Schedules an additional image of a page.
     */
    public void addImagesToProcess(int pageNumber, int imageNumber) {

        ImageNumberWithPageNumber scheduled = new ImageNumberWithPageNumber(imageNumber, pageNumber);
        imagesToProcess.add(scheduled);
    }


    /**
     * Logs the final counts and reports the number of scheduled images as finished.
     */
    public void sendFinished() {

        log.info("{}/{}: Finished OCR on all images", processedImages.size(), imagesToProcess.size());
        ocrMessageSender.sendOcrFinished(fileId, imagesToProcess.size());
    }


    /** Identifies an image by its number on the page and the page number. */
    private record ImageNumberWithPageNumber(int imageNumber, int pageNumber) {

    }

}

View File

@ -1,251 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.springframework.stereotype.Service;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.TextExtractor;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Writes OCR results into PDFs: an invisible text layer into the document itself and, in addition,
 * colored debug layers (text and bounding boxes) into the viewer document.
 * <p>
 * OCR words whose bounding box intersects text that already exists on the page are left out of the
 * invisible layer and are drawn in the ignore-zone color in the debug text layer.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrResultWriter {

    public static final Color REGULAR_TEXT_COLOR = Color.BLUE;
    public static final Color BOLD_TEXT_COLOR = Color.CYAN;
    public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED;
    public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED;

    ViewerDocumentService viewerDocumentService;


    /**
     * Builds the visualizations for every page and adds them to both files in place.
     *
     * @param document                 receives the OCR text layer only
     * @param viewerDocument           receives the OCR text layer plus both debug layers
     * @param imagesWithResultsPerPage OCR results keyed by page number; visualizations are stored under page number - 1
     */
    @SneakyThrows
    public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {

        Map<Integer, VisualizationsOnPage> ocrLayerPerPage = new HashMap<>();
        Map<Integer, VisualizationsOnPage> debugTextLayerPerPage = new HashMap<>();
        Map<Integer, VisualizationsOnPage> debugBBoxLayerPerPage = new HashMap<>();

        try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) {
            for (Map.Entry<Integer, List<OcrResultToWrite>> pageEntry : imagesWithResultsPerPage.entrySet()) {
                Integer pageNumber = pageEntry.getKey();
                List<OcrResultToWrite> resultsOnPage = pageEntry.getValue();

                List<Rectangle2D> existingTextBBoxes = getTextBBoxes(doc.getPage(pageNumber));

                ocrLayerPerPage.put(pageNumber - 1, createVisualizations(resultsOnPage, existingTextBBoxes));
                debugTextLayerPerPage.put(pageNumber - 1, createDebugTextVisualizations(resultsOnPage, existingTextBBoxes));
                debugBBoxLayerPerPage.put(pageNumber - 1, createDebugBBoxVisualizations(resultsOnPage));
            }
        }

        Visualizations ocrLayer = new Visualizations(ContentStreams.KNECON_OCR, ocrLayerPerPage, false);
        Visualizations debugTextLayer = new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, debugTextLayerPerPage, false);
        Visualizations debugBBoxLayer = new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, debugBBoxLayerPerPage, false);

        viewerDocumentService.addVisualizationsOnPage(document, document, List.of(ocrLayer));
        viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, List.of(ocrLayer, debugTextLayer, debugBBoxLayer));
    }


    /**
     * Collects the bounding box of every word the text extractor finds on the page. If the extraction
     * fails midway, a warning is logged and the boxes collected so far are returned.
     */
    @SuppressWarnings("PMD")
    private List<Rectangle2D> getTextBBoxes(Page page) {

        List<Rectangle2D> wordBoxes = new ArrayList<>();
        try (var extractor = new TextExtractor()) {
            extractor.begin(page);
            try {
                for (TextExtractor.Line currentLine = extractor.getFirstLine(); currentLine.isValid(); currentLine = getNextLine(currentLine)) {
                    for (TextExtractor.Word currentWord = currentLine.getFirstWord(); currentWord.isValid(); currentWord = getNextWord(currentWord)) {
                        wordBoxes.add(Converter.toRectangle2D(currentWord.getBBox()));
                    }
                }
            } catch (Exception e) {
                log.warn("Could not get word dimension, {}", e.getMessage());
            }
            return wordBoxes;
        }
    }


    /** Fetches the successor of the word and closes the word afterwards. */
    private static TextExtractor.Word getNextWord(TextExtractor.Word word) {

        TextExtractor.Word successor = word.getNextWord();
        word.close();
        return successor;
    }


    /** Fetches the successor of the line and closes the line afterwards. */
    private static TextExtractor.Line getNextLine(TextExtractor.Line line) {

        TextExtractor.Line successor = line.getNextLine();
        line.close();
        return successor;
    }


    /**
     * Creates the OCR text layer: black text with rendering mode NEITHER for every word that does not
     * intersect an ignore zone.
     */
    private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> ignoreZones) {

        List<PlacedText> placedTexts = ocrResultsToWrite.stream()
                .flatMap(result -> result.textPositionInImage().stream())
                .filter(word -> !intersectsAny(word, ignoreZones))
                .map(word -> new PlacedText(word.getText(),
                                            null,
                                            Color.BLACK,
                                            (float) word.getFontSize(),
                                            word.getFontMetricsFactory(),
                                            Optional.of(word.getTextMatrix()),
                                            Optional.of(RenderingMode.NEITHER)))
                .toList();

        return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
    }


    /**
     * Creates the debug text layer: filled text for all words, first the words outside the existing
     * text boxes, then the words intersecting them, each group with its own colors.
     */
    private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {

        List<TextPositionInImage> drawnWords = new ArrayList<>();
        List<TextPositionInImage> ignoredWords = new ArrayList<>();

        for (OcrResultToWrite result : ocrResultsToWrite) {
            for (TextPositionInImage word : result.textPositionInImage()) {
                List<TextPositionInImage> target = intersectsAny(word, textBBoxes) ? ignoredWords : drawnWords;
                target.add(word);
            }
        }

        Stream<PlacedText> drawnTexts = drawnWords.stream()
                .map(word -> toDebugText(word, REGULAR_TEXT_COLOR, BOLD_TEXT_COLOR));
        Stream<PlacedText> ignoredTexts = ignoredWords.stream()
                .map(word -> toDebugText(word, REGULAR_TEXT_IN_IGNORE_ZONE, BOLD_TEXT_IN_IGNORE_ZONE));

        return VisualizationsOnPage.builder()
                .placedTexts(Stream.concat(drawnTexts, ignoredTexts).toList())
                .build();
    }


    /** True if the bounds of the word's transformed bounding box intersect at least one of the zones. */
    private static boolean intersectsAny(TextPositionInImage word, List<Rectangle2D> zones) {

        return zones.stream()
                .anyMatch(zone -> word.getTransformedTextBBox().getBounds2D().intersects(zone));
    }


    /** Filled debug text for a word, colored by whether its font style is REGULAR or not. */
    private static PlacedText toDebugText(TextPositionInImage word, Color regularColor, Color boldColor) {

        return new PlacedText(word.getText(),
                              null,
                              word.getFontStyle().equals(FontStyle.REGULAR) ? regularColor : boldColor,
                              (float) word.getFontSize(),
                              word.getFontMetricsFactory(),
                              Optional.of(word.getTextMatrix()),
                              Optional.of(RenderingMode.FILL));
    }


    /**
     * Creates the debug bounding box layer: the outline of every word followed by a grid over every
     * image bounding box.
     */
    private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {

        Stream<List<ColoredLine>> wordOutlines = ocrResultsToWrite.stream()
                .flatMap(result -> result.textPositionInImage().stream())
                .map(word -> quadPointAsLines(word.getTransformedTextBBox()));

        Stream<List<ColoredLine>> imageGrids = ocrResultsToWrite.stream()
                .map(result -> createGrid(result.imageBoundingBox()));

        List<ColoredLine> coloredLines = Stream.concat(wordOutlines, imageGrids)
                .flatMap(Collection::stream)
                .toList();

        return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
    }


    /** The four edges a-b, b-c, c-d and d-a of the quad, each in its own color. */
    private List<ColoredLine> quadPointAsLines(QuadPoint rect) {

        ColoredLine ab = new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1);
        ColoredLine bc = new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1);
        ColoredLine cd = new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1);
        ColoredLine da = new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1);
        return List.of(ab, bc, cd, da);
    }


    /**
     * The outline of the quad plus 8 evenly spaced thin lines stepping along the a-b edge and 8
     * stepping along the a-d edge.
     */
    @SneakyThrows
    private List<ColoredLine> createGrid(QuadPoint rect) {

        final int nRows = 8;
        final int nCols = 8;

        List<ColoredLine> lines = new LinkedList<>(quadPointAsLines(rect));

        Point2D abStep = new Point2D.Double((rect.b().getX() - rect.a().getX()) / (nRows + 1), (rect.b().getY() - rect.a().getY()) / (nRows + 1));
        addParallelLines(lines, add(rect.a(), abStep), add(rect.d(), abStep), abStep, nRows);

        Point2D adStep = new Point2D.Double((rect.d().getX() - rect.a().getX()) / (nCols + 1), (rect.d().getY() - rect.a().getY()) / (nCols + 1));
        addParallelLines(lines, add(rect.a(), adStep), add(rect.b(), adStep), adStep, nCols);

        return lines;
    }


    /** Appends {@code count} lines, moving both end points by {@code step} after each line. */
    private void addParallelLines(List<ColoredLine> lines, Point2D start, Point2D end, Point2D step, int count) {

        Point2D from = start;
        Point2D to = end;
        for (int n = 0; n < count; ++n) {
            lines.add(new ColoredLine(new Line2D.Double(from, to), Color.BLACK, 0.2f));
            from = add(from, step);
            to = add(to, step);
        }
    }


    /** Component-wise sum of two points. */
    private Point2D add(Point2D a, Point2D b) {

        double x = a.getX() + b.getX();
        double y = a.getY() + b.getY();
        return new Point2D.Double(x, y);
    }

}

View File

@ -1,85 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
/**
 * Collects the durations (in milliseconds) of the OCR pipeline stages. Image extraction and tesseract
 * durations are kept per thread id, all other stages as a single running total.
 */
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class Statistics {

    List<Long> imageExtraction;
    List<Long> tesseractDuration;
    AtomicLong pdf2ImgDuration;
    AtomicLong writingTextDuration;
    AtomicLong imageProcessingDuration;
    AtomicLong fontStyleDetectionDuration;


    /**
     * @param numberOfExtractThreads number of per-thread slots for image extraction, all starting at 0
     * @param numberOfOcrThreads     number of per-thread slots for tesseract, all starting at 0
     */
    public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {

        this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L)));
        this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
        this.fontStyleDetectionDuration = new AtomicLong(0);
        this.pdf2ImgDuration = new AtomicLong(0);
        this.writingTextDuration = new AtomicLong(0);
        this.imageProcessingDuration = new AtomicLong(0);
    }


    /** Adds {@code duration} ms to the image extraction slot of the given thread id. */
    public void increaseImageExtraction(int threadId, long duration) {

        addToSlot(imageExtraction, threadId, duration);
    }


    /** Adds {@code duration} ms to the image processing total. */
    public void increaseImageProcessing(long duration) {

        imageProcessingDuration.addAndGet(duration);
    }


    /** Adds {@code duration} ms to the tesseract slot of the given thread id. */
    public void increaseTesseractDuration(int threadId, long duration) {

        addToSlot(tesseractDuration, threadId, duration);
    }


    /** Adds {@code duration} ms to the PDF-to-image total. */
    public void increasePDF2ImgDuration(long duration) {

        pdf2ImgDuration.addAndGet(duration);
    }


    /** Adds {@code duration} ms to the text writing total. */
    public void increaseWritingTextDuration(long duration) {

        writingTextDuration.addAndGet(duration);
    }


    /** Adds {@code duration} ms to the font style detection total. */
    public void increaseFontStyleDetectionDuration(long duration) {

        fontStyleDetectionDuration.addAndGet(duration);
    }


    /**
     * Summarizes all durations in seconds: mean, max and min over the thread slots for image extraction
     * and tesseract, plus the totals of the remaining stages.
     */
    @Override
    public String toString() {

        double[] extraction = meanMaxMinSeconds(imageExtraction);
        double[] tesseract = meanMaxMinSeconds(tesseractDuration);

        return String.format(
                "imageExtraction: mean %.2f s, max %.2f s, min %.2f s, tesseract: mean %.2f s, max %.2f s, min %.2f s, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s, FontstyleDetection=%.2f s",
                extraction[0],
                extraction[1],
                extraction[2],
                tesseract[0],
                tesseract[1],
                tesseract[2],
                imageProcessingDuration.get() / 1000.0,
                pdf2ImgDuration.get() / 1000.0,
                writingTextDuration.get() / 1000.0,
                fontStyleDetectionDuration.get() / 1000.0);
    }


    /** Adds the duration to one slot; read and write happen under the list's lock so the update is atomic. */
    private static void addToSlot(List<Long> durations, int threadId, long duration) {

        synchronized (durations) {
            durations.set(threadId, durations.get(threadId) + duration);
        }
    }


    /**
     * Returns {mean, max, min} of the list in seconds, or all zeros for an empty list. A list created by
     * {@code Collections.synchronizedList} has to be locked manually while it is traversed by a stream.
     */
    private static double[] meanMaxMinSeconds(List<Long> durationsMillis) {

        synchronized (durationsMillis) {
            var summary = durationsMillis.stream().mapToLong(Long::longValue).summaryStatistics();
            if (summary.getCount() == 0) {
                return new double[]{0, 0, 0};
            }
            return new double[]{summary.getAverage() / 1000.0, summary.getMax() / 1000.0, summary.getMin() / 1000.0};
        }
    }

}

View File

@ -1,43 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
import org.apache.pdfbox.pdmodel.font.PDFont;
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Computes the metrics required to render a text with the font returned by {@link #getFont()} such that it matches a given
 * width and height.
 */
public interface FontMetricsFactory extends EmbeddableFont {

    /**
     * @return the font all calculations of this factory are based on
     */
    PDFont getFont();


    /**
     * Determines the height and descent of the given text in the units used by the font's glyph data.
     *
     * @param text the text to measure
     * @return height and descent of the text
     */
    HeightAndDescent calculateHeightAndDescent(String text);


    /**
     * Derives the font size from the text width, then the scaling factor needed to stretch the text to the requested height,
     * and finally the descent expressed at that font size.
     *
     * @param text       the text to be rendered
     * @param textWidth  the width the rendered text should have
     * @param textHeight the height the rendered text should have
     * @return descent, font size and height scaling for the text
     */
    default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {

        HeightAndDescent glyphBounds = calculateHeightAndDescent(text);
        float fontSize = calculateFontSize(text, textWidth);

        // distance from the lowest to the highest point of the text, in glyph units
        var glyphExtent = glyphBounds.height() - glyphBounds.descent();
        float heightAtUnitSize = (float) ((textHeight / glyphExtent) * 1000);
        float heightScaling = heightAtUnitSize / fontSize;

        var descentAtFontSize = (glyphBounds.descent() / 1000) * fontSize;
        return new FontMetrics(descentAtFontSize, fontSize, heightScaling);
    }


    /**
     * Calculates the font size at which the text is exactly {@code textWidth} wide.
     *
     * @param text      the text to be rendered
     * @param textWidth the width the rendered text should have
     * @return the matching font size
     */
    @SneakyThrows
    default float calculateFontSize(String text, double textWidth) {

        float glyphRunWidth;
        try {
            glyphRunWidth = getFont().getStringWidth(text);
        } catch (IllegalArgumentException missingGlyph) {
            // the font has no glyph for at least one character, so estimate the width from the average glyph width
            glyphRunWidth = getFont().getAverageFontWidth() * text.length();
        }

        float widthRatio = (float) (textWidth / glyphRunWidth);
        return widthRatio * 1000;
    }

}

View File

@ -1,5 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
/**
 * The font styles a piece of text can be assigned.
 */
public enum FontStyle {
    REGULAR,
    BOLD,
    ITALIC
}

View File

@ -1,140 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
import java.io.ByteArrayInputStream;
import java.util.Set;
import org.apache.fontbox.ttf.GlyphData;
import org.apache.fontbox.ttf.TTFParser;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * {@link FontMetricsFactory} backed by a TrueType font that is loaded from the classpath and embedded into a PDF document as a
 * {@link PDType0Font}.
 * <p>
 * Instances created via {@link #regular(PDDocument)} or {@link #bold(PDDocument)} are embedded right away. An instance created
 * with only a resource path holds no font until {@link #embed(PDDocument)} has been called.
 */
@Slf4j
@RequiredArgsConstructor
@AllArgsConstructor
public class Type0FontMetricsFactory implements FontMetricsFactory {

    private final String resourcePath;

    private PDType0Font type0Font;
    private TrueTypeFont trueTypeFont;
    private PDDocument documentThisIsEmbeddedIn;

    // for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
    private static final Set<Integer> slashGlyphIds = Set.of(18, 63);


    /**
     * Creates a factory for the regular font, embedded in the given document.
     */
    @SneakyThrows
    public static Type0FontMetricsFactory regular(PDDocument document) {

        String resourcePath = "fonts/cmu-regular.ttf";
        return createFromResourcePath(resourcePath, document);
    }


    /**
     * Creates a factory for the bold font, embedded in the given document.
     */
    @SneakyThrows
    public static Type0FontMetricsFactory bold(PDDocument document) {

        String resourcePath = "fonts/cmu-bold.ttf";
        return createFromResourcePath(resourcePath, document);
    }


    /**
     * Parses the TrueType font found at the given classpath location.
     */
    @SneakyThrows
    @SuppressWarnings("PMD.CloseResource")
    private static TrueTypeFont readFromResourcePath(String resourcePath) {

        // The ttf is closed with the document, see PDType0Font line 134
        try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
            return new TTFParser().parse(buffer);
        }
    }


    /**
     * Loads the font from the classpath and embeds it into the document.
     */
    @SneakyThrows
    @SuppressWarnings("PMD.CloseResource")
    private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {

        TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
        // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
        return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support)
    }


    /**
     * Determines the largest glyph height and the lowest glyph descent over all characters of the text.
     * <p>
     * If the text cannot be encoded with the font, the fixed values 800 (height) and -50 (descent) are returned. The returned
     * height is never smaller than 500.
     *
     * @param text the text to measure
     * @return height and descent of the text
     */
    @Override
    @SneakyThrows
    public HeightAndDescent calculateHeightAndDescent(String text) {

        byte[] bytes;
        try {
            bytes = type0Font.encode(text);
        } catch (IllegalArgumentException e) {
            log.warn("The string {} could not be parsed, using average height and descent", text);
            return new HeightAndDescent(800, -50);
        }

        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        float descent = 0;
        float height = 0;
        while (in.available() > 0) {
            try {
                int code = type0Font.readCode(in);
                int glyphId = type0Font.codeToGID(code);
                GlyphData glyph = trueTypeFont.getGlyph().getGlyph(glyphId);
                if (glyph == null || glyph.getBoundingBox() == null) {
                    continue;
                }
                if (!slashGlyphIds.contains(glyphId)) {
                    descent = Math.min(descent, glyph.getYMinimum());
                }
                height = Math.max(height, glyph.getYMaximum());
            } catch (Exception e) {
                // NOTE: despite the wording of the message no fallback value is substituted here, the glyph is simply skipped
                log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);
            }
        }

        // some characters like comma or minus return very small height values, while tesseract still returns a normal-sized bounding box and therefore exploding the height scaling factors,
        // so we need a minimum value. Here, 500 seems optimal for the characters "-", ",", "_"
        return new HeightAndDescent(Math.max(height, 500), descent);
    }


    /**
     * @return the embedded font, or {@code null} if this factory has not been embedded in any document yet
     */
    @Override
    public PDFont getFont() {

        return type0Font;
    }


    /**
     * Embeds the font into the given document, unless it is already embedded in exactly that document.
     *
     * @param document the document the font should be embedded in
     * @return the font embedded in {@code document}
     */
    @Override
    @SneakyThrows
    public PDFont embed(PDDocument document) {

        // documentThisIsEmbeddedIn is null when the instance was created from the resource path only
        if (documentThisIsEmbeddedIn != null && documentThisIsEmbeddedIn.equals(document)) {
            return getFont();
        }

        // no need to close, the font will be closed with the document it is embedded in
        this.trueTypeFont = readFromResourcePath(resourcePath);
        this.type0Font = PDType0Font.load(document, trueTypeFont, true);
        this.documentThisIsEmbeddedIn = document;
        return getFont();
    }


    /**
     * Closes the underlying TrueType font, if one has been loaded.
     */
    @SneakyThrows
    public void close() {

        if (trueTypeFont != null) {
            trueTypeFont.close();
        }
    }

}

View File

@ -1,158 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Stream;
import org.apache.commons.math3.ml.clustering.Cluster;
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.TextPositionAndWordImage;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.WordImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Decides for every OCR'ed word whether it is written in bold, see {@link #detectBold(List, PDDocument)}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FontStyleDetector {

    OcrServiceSettings settings;
    StrokeWidthCalculator strokeWidthCalculator;


    /**
     * Implementation of the MOBDoB algorithm, refer to the paper here:
     * <a href="http://mile.ee.iisc.ac.in/publications/softCopy/DocumentAnalysis/Sai_NCVPRIPG2013.pdf">Script Independent Detection of Bold Words in Multi Font-size Documents</a>
     * <p>
     * As a high level overview: We cluster all text based on its font size. We determine the cluster with the most words. This is assumed to be regular text.
     * We then estimate the average stroke width of that cluster by thinning all text to a single pixel and calculating the ratio of remaining pixels.
     * (<a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>)
     * For each word we scale this average strokewidth based on its fontsize compared to the most common fontsize.
     * Using the scaled strokewidth we do an opening operation.
     * (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
     * We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
     * <p>
     * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation.
     * But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
     * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
     * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
     * It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me.
     *
     * @param ocrResults the OCR results to classify
     * @param document   the document the regular and bold fonts are embedded in
     * @return the results to write, grouped by page number. If bold detection is disabled in the settings, the results are
     * converted using the regular font without any classification.
     */
    public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {

        FontMetricsFactory fontMetricsFactory = Type0FontMetricsFactory.regular(document);
        if (!settings.isBoldDetection()) {
            return OcrResultToWrite.buildOcrResultsToWrite(ocrResults, fontMetricsFactory);
        }

        Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage = new HashMap<>();
        DBSCANClusterer<TextPositionAndWordImage> clusterer = new DBSCANClusterer<>(0.5, 1);
        FontMetricsFactory boldFontMetricsFactory = Type0FontMetricsFactory.bold(document);

        for (OcrResult result : ocrResults) {
            FontStyleDetectionModel fontStyleDetectionModel = FontStyleDetectionModel.fromOcrResult(result, fontMetricsFactory, settings);
            try {
                List<Cluster<TextPositionAndWordImage>> clusters = clusterer.cluster(fontStyleDetectionModel.getTextPositionsAndWordImages());
                Optional<Cluster<TextPositionAndWordImage>> largestCluster = clusters.stream().max(Comparator.comparingInt(cluster -> cluster.getPoints().size()));

                // without any cluster there is no reference for regular text, so the words keep the style they were created with
                if (largestCluster.isPresent()) {
                    List<TextPositionAndWordImage> wordsWithMostCommonTextHeight = largestCluster.get().getPoints();

                    double standardTextHeight = calculateStandardTextheight(wordsWithMostCommonTextHeight);
                    double regularStrokeWidth = calculateRegularStrokeWidth(wordsWithMostCommonTextHeight);

                    for (TextPositionAndWordImage textPositionsAndWordImage : fontStyleDetectionModel.getTextPositionsAndWordImages()) {
                        decideOnFontStyle(textPositionsAndWordImage, regularStrokeWidth, standardTextHeight, boldFontMetricsFactory);
                    }
                }

                insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
            } finally {
                // always release the model, also if no cluster was found or the calculation failed
                fontStyleDetectionModel.dispose();
            }
        }

        log.info("Finished bold detection");
        return ocrResultToWritePerPage;
    }


    /**
     * Averages the finite text heights of the given words.
     *
     * @throws java.util.NoSuchElementException if none of the words has a finite text height
     */
    private static double calculateStandardTextheight(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {

        return wordsWithMostCommonTextHeight.stream()
                .map(TextPositionAndWordImage::getWordImage)
                .mapToDouble(WordImage::getTextHeight)
                .filter(Double::isFinite)
                .average()
                .orElseThrow();
    }


    /**
     * Averages the finite stroke widths of the given words.
     *
     * @throws java.util.NoSuchElementException if no finite stroke width could be calculated for any of the words
     */
    private double calculateRegularStrokeWidth(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {

        return wordsWithMostCommonTextHeight.stream()
                .mapToDouble(textPositionAndWordImage -> strokeWidthCalculator.calculate(textPositionAndWordImage.getWordImage().getImage()))
                .filter(Double::isFinite)
                .average()
                .orElseThrow();
    }


    /**
     * Converts the model to a result to write and appends it to the results of its page. The lists stored in the map are
     * unmodifiable, a new list is created on every insertion.
     */
    private static void insertResultIntoMap(int pageNumber, Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage, FontStyleDetectionModel fontStyleDetectionModel) {

        OcrResultToWrite ocrResult = OcrResultToWrite.fromFontStyleDetectionModel(fontStyleDetectionModel);

        ocrResultToWritePerPage.compute(pageNumber, (key, existingList) -> {
            if (existingList == null) {
                return List.of(ocrResult);
            } else {
                return Stream.concat(existingList.stream(), Stream.of(ocrResult)).toList();
            }
        });
    }


    /**
     * Marks the word as bold (and switches it to the bold font) if its stroke width exceeds the regular stroke width scaled to
     * the text height of the word, otherwise marks it as regular.
     */
    private void decideOnFontStyle(TextPositionAndWordImage textPositionsAndWordImage,
                                   double standardStrokeWidth,
                                   double standardTextHeight,
                                   FontMetricsFactory boldFontMetricsFactory) {

        double scaledStrokeWidth = scaleStrokeWidthByFontSize(textPositionsAndWordImage, standardStrokeWidth, standardTextHeight);

        if (textPositionsAndWordImage.getWordImage().hasLargerStrokeWidth(scaledStrokeWidth)) {
            textPositionsAndWordImage.getTextPositionInImage().setFontMetricsFactory(boldFontMetricsFactory);
            textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.BOLD);
        } else {
            textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.REGULAR);
        }
    }


    /**
     * Scales the regular stroke width with the square root of the ratio between the text height of the word and the standard
     * text height.
     */
    private static double scaleStrokeWidthByFontSize(TextPositionAndWordImage textPositionsAndWordImage, double standardStrokeWidth, double standardFontSize) {

        double influenceOfFontSize = 1.0; // the paper states that stroke width scales exactly linearly with font size. This did not seem to be true for me. Maybe some of the preprocessing steps are affecting this.
        double fontsizeScalingFactor = Math.sqrt(textPositionsAndWordImage.getWordImage().getTextHeight() / standardFontSize);
        return standardStrokeWidth + (influenceOfFontSize * (fontsizeScalingFactor - 1) * standardStrokeWidth);
    }

}

View File

@ -1,57 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.Sel;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * A starting point for italic detection. It still struggles with glyphs that are slanted by nature, e.g. z, 2, 7 or /.
 * Excluding those glyphs might reduce the false positives, but in its current state using this class is not recommended.
 * <p>
 * The structuring elements are created together with the instance and have to be released via {@link #dispose()}.
 */
@NoArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ItalicDetector {

    static String italicKernel = "ooxxooxxooxxoxxooXxooxxoxxooxxooxxoo";

    Sel italicSel = Leptonica1.selCreateFromString(italicKernel, 9, 4, "italicKernel");
    Sel brickSel = Leptonica1.selCreateBrick(3, 4, 1, 2, 1);


    /**
     * Runs the image and its horizontally mirrored copy through the same erosion/dilation and compares the resulting pixel
     * densities.
     *
     * @param pix the image of the text to check, it is not disposed by this method
     * @return {@code true} if the density of the mirrored variant is less than 85% of the density of the original
     */
    public boolean isItalic(Pix pix) {

        Pix uprightResponse = preprocess(pix);

        Pix mirrored = Leptonica1.pixFlipLR(null, pix);
        Pix mirroredResponse = preprocess(mirrored);
        // flip the response back in place
        Leptonica1.pixFlipLR(mirroredResponse, mirroredResponse);

        double uprightDensity = ImageProcessingUtils.calculatePixelDensity(uprightResponse);
        double mirroredDensity = ImageProcessingUtils.calculatePixelDensity(mirroredResponse);

        LeptUtils.disposePix(uprightResponse);
        LeptUtils.disposePix(mirrored);
        LeptUtils.disposePix(mirroredResponse);

        double densityRatio = mirroredDensity / uprightDensity;
        return densityRatio < 0.85;
    }


    /**
     * Erodes the image with the italic kernel and dilates the result with the brick kernel.
     *
     * @return a new image, the caller is responsible for disposing it
     */
    private Pix preprocess(Pix pix) {

        Pix erosionResult = Leptonica1.pixErode(null, pix, italicSel.getPointer());
        Pix dilationResult = Leptonica1.pixDilate(null, erosionResult, brickSel.getPointer());
        LeptUtils.disposePix(erosionResult);
        return dilationResult;
    }


    /**
     * Releases both structuring elements.
     */
    public void dispose() {

        LeptUtils.dispose(italicSel);
        LeptUtils.dispose(brickSel);
    }

}

View File

@ -1,58 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
import java.nio.IntBuffer;
import org.springframework.stereotype.Service;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.Sela;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Estimates the stroke width of text on a binarized image.
 */
@Service
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class StrokeWidthCalculator {

    // created on the first call of calculate and reused afterwards
    Sela thinningSel;


    /**
     * Thins all connected lines down to a single pixel using a series of sels. The ratio between the pixel count of the input
     * and the pixel count of the thinned image is then a good estimate for the stroke width in pixels.
     * <a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>
     * As a stroke width of exactly one is the baseline, 1 is added to the ratio.
     *
     * @param input binarized pix with text on it
     * @return estimated stroke width in pixels
     */
    public double calculate(Pix input) {

        if (thinningSel == null) {
            thinningSel = Leptonica1.selaMakeThinSets(1, 0);
        }

        Pix skeleton = Leptonica1.pixThinConnectedBySet(input, L_THIN_FG, thinningSel, 0);

        IntBuffer skeletonPixels = IntBuffer.allocate(1);
        Leptonica1.pixCountPixels(skeleton, skeletonPixels, null);

        IntBuffer inputPixels = IntBuffer.allocate(1);
        Leptonica1.pixCountPixels(input, inputPixels, null);

        LeptUtils.disposePix(skeleton);

        double pixelRatio = (double) inputPixels.get(0) / skeletonPixels.get(0);
        return pixelRatio + 1;
    }

}

View File

@ -1,65 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.TessAPI1;
/**
 * Moves the elements from the output queue of the GhostScriptOutputHandler into the image processing queue asynchronously.
 * <p>
 * The thread runs until it is interrupted or {@code allImagesQueued} is set. It then forwards whatever is still left in the
 * input queue and terminates.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class BlockingQueueFiller extends Thread {

    final BlockingQueue<RenderedPageImageFile> imageInputQueue;
    final BlockingQueue<UnprocessedImage> imageOutputQueue;

    // written by the controlling thread and read by this thread, so it has to be volatile to be reliably visible
    @Setter
    volatile boolean allImagesQueued;


    @SneakyThrows
    @Override
    public void run() {

        // Interrupting signals that the image extraction has finished
        try {
            while (!allImagesQueued) {
                final UnprocessedImage image = imageInputQueue.take();
                try {
                    imageOutputQueue.put(image);
                } catch (InterruptedException e) {
                    // the image has already been taken from the input queue, so it must not get lost. Catching the exception
                    // cleared the interrupt status, therefore this second put blocks normally.
                    imageOutputQueue.put(image);
                    // an interrupt during put means the same as one during take, so do not go back to a take that might block forever
                    throw e;
                }
            }
        } catch (InterruptedException e) {
            log.info("All images extracted, emptying processing queue and stopping");
        }

        // empty the queue
        UnprocessedImage remaining;
        while ((remaining = imageInputQueue.poll()) != null) {
            imageOutputQueue.put(remaining);
        }
        log.debug("No images left in queue, stopping.");
    }

}

View File

@ -1,121 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GhostScriptOutputHandler extends Thread {
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
final InputStream is;
final String processName;
final Type type;
final Map<Integer, RenderedPageImageFile> pagesToProcess;
final BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput;
int currentPageNumber;
public static GhostScriptOutputHandler errorHandler(InputStream is) {
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null);
}
public static GhostScriptOutputHandler stdOut(InputStream is,
Map<Integer, RenderedPageImageFile> pagesToProcess,
BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput) {
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput);
}
@SneakyThrows
public void run() {
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
while (true) {
line = br.readLine();
if (line == null) {
break;
}
if (type.equals(Type.ERROR)) {
log.error(processName + "_" + type.name() + ">" + line);
} else {
log.debug(processName + "_" + type.name() + ">" + line);
addProcessedImageToQueue(line);
}
}
}
is.close();
if (type.equals(Type.STD_OUT)) {
queueFinishedPage(currentPageNumber);
}
}
private void addProcessedImageToQueue(String line) {
/*
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
*/
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
if (pageNumberMatcher.find()) {
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
if (currentPageNumber == 0) {
currentPageNumber = pageNumber;
return;
}
queueFinishedPage(currentPageNumber);
currentPageNumber = pageNumber;
}
}
private void queueFinishedPage(int pageNumber) {
var imageFile = this.pagesToProcess.get(pageNumber);
if (imageFile == null) {
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
}
renderedPageImageFileOutput.add(imageFile);
}
public enum Type {
ERROR,
STD_OUT
}
}

View File

@ -1,110 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.File;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
/**
 * Extracts the images of the pages assigned to it and queues them for image processing. Pages which contain a (nearly)
 * full-page image or images that align with each other are not queued, their page numbers are collected in
 * {@code stitchedPageNumbers} instead.
 */
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageExtractionThread extends Thread {

    static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
    static double IMAGE_ALIGNMENT_THRESHOLD = 1;

    int id;
    @Getter
    List<Integer> pageIndices;
    File documentFile;
    OcrProgressLogger logger;
    Statistics stats;
    OcrServiceSettings settings;

    // output is written to these lists
    BlockingQueue<UnprocessedImage> imageProcessingQueue;
    List<Integer> stitchedPageNumbers;


    /**
     * Processes the assigned pages one after another.
     */
    @SneakyThrows
    @Override
    public void run() {

        for (Integer pageIndex : pageIndices) {
            // load new PDDocument for thread safety, also keeps RAM usage low.
            try (PDDocument document = Loader.loadPDF(documentFile)) {
                long extractionStart = System.currentTimeMillis();
                List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
                stats.increaseImageExtraction(id, System.currentTimeMillis() - extractionStart);

                if (extractedImages.isEmpty()) {
                    logger.logPageSkipped(pageIndex);
                }

                boolean fullPageOrStitched = checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1));
                if (fullPageOrStitched) {
                    stitchedPageNumbers.add(pageIndex);
                    logger.addImagesToProcess(pageIndex, 0);
                } else {
                    for (ExtractedImage extractedImage : extractedImages) {
                        imageProcessingQueue.put(extractedImage);
                        logger.addImagesToProcess(extractedImage.pageNumber(), extractedImage.numberOnPage());
                    }
                }
            }
        }
    }


    /**
     * Runs a fresh image stream engine on the page and returns the images it found.
     *
     * @param pageIndex the page number, starting at 1
     */
    private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {

        ImageStreamEngine engine = new ImageStreamEngine(settings);
        engine.processPage(pageIndex, document.getPage(pageIndex - 1));
        return engine.getImagesOnCurrentPage();
    }


    /**
     * @return {@code true} if any image is wider and higher than 99% of the crop box of the page, or if any two images of the
     * page align with each other. {@code false} otherwise, in particular for a page without images.
     */
    @SneakyThrows
    private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {

        if (imagesOnCurrentPage.isEmpty()) {
            return false;
        }

        double fullPageWidth = FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth();
        double fullPageHeight = FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight();
        boolean containsFullPageImage = imagesOnCurrentPage.stream()
                .anyMatch(image -> image.width() > fullPageWidth && image.height() > fullPageHeight);
        if (containsFullPageImage) {
            return true;
        }

        //checking for intersections or direct alignment of images
        int imageCount = imagesOnCurrentPage.size();
        for (int first = 0; first < imageCount; first++) {
            for (int second = first + 1; second < imageCount; second++) {
                boolean aligned = imagesOnCurrentPage.get(first)
                        .getImageCoordinatesInInitialUserSpace()
                        .aligns(imagesOnCurrentPage.get(second).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD);
                if (aligned) {
                    // TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
                    return true;
                }
            }
        }
        return false;
    }

}

View File

@ -1,251 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.TessAPI1;
/*
* This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
*/
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageProcessingThread extends Thread {
final BlockingQueue<UnprocessedImage> imageInputQueue;
final BlockingQueue<OcrImage> imageOutputQueue;
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.0f, 1);
final Statistics stats;
final OcrServiceSettings settings;
final PDDocument document;
@Setter
boolean allImagesExtracted;
@SneakyThrows
@Override
public void run() {
try {
while (!allImagesExtracted) {
final UnprocessedImage image = imageInputQueue.take();
var ocrImage = this.process(image);
try {
imageOutputQueue.put(ocrImage);
} catch (InterruptedException e) {
imageOutputQueue.put(ocrImage);
}
}
} catch (InterruptedException e) {
log.info("All images extracted, emptying processing queue and stopping");
}
try {
while (true) {
final UnprocessedImage image = imageInputQueue.remove();
OcrImage ocrImage = this.process(image);
imageOutputQueue.put(ocrImage);
}
} catch (NoSuchElementException e) {
log.debug("No images left in processing queue, stopping.");
}
TessAPI1.TessBaseAPIEnd(this.detectionScriptHandle);
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
LeptUtils.dispose(gaussianKernel);
}
private OcrImage process(UnprocessedImage unprocessedImage) {
long timestamp = System.currentTimeMillis();
OcrImage ocrImage;
if (unprocessedImage instanceof ExtractedImage extractedImage) {
ocrImage = processExtractedImage(extractedImage);
} else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) {
ocrImage = processRenderedPageImageFile(renderedPageImageFile);
} else {
throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass()));
}
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
return ocrImage;
}
@SuppressWarnings("PMD.CompareObjectsWithEquals")
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
OcrImage ocrImage = new RenderedPageOcrImage(pix.h,
pix.w,
PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)),
rotatedPix,
orientDegree);
if (pix != rotatedPix) {
LeptUtils.disposePix(pix);
}
return ocrImage;
}
@SuppressWarnings("PMD.CompareObjectsWithEquals")
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
Pix pix = processPix(extractedImage.asPix(), imageDPI, settings.getDpi());
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
OcrImage ocrImage = new ExtractedOcrImage(extractedImage.pageNumber(),
extractedImage.numberOnPage(),
extractedImage.height(),
extractedImage.width(),
extractedImage.ctm(),
rotatedPix,
pix.h,
pix.w,
orientDegree);
if (pix != rotatedPix) {
LeptUtils.disposePix(pix);
}
return ocrImage;
}
public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);
IntBuffer orientationDegreeResultBuffer;
FloatBuffer orientationDegreeConfidenceBuffer;
PointerByReference scriptureNameBuffer;
FloatBuffer scriptureConfidenceBuffer;
orientationDegreeResultBuffer = IntBuffer.allocate(1);
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
scriptureNameBuffer = new PointerByReference(); // Is this memory being freed?
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
int orientationDegree = 0;
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
orientationDegreeResultBuffer,
orientationDegreeConfidenceBuffer,
scriptureNameBuffer,
scriptureConfidenceBuffer);
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > settings.getMinRotationConfidence()) {
orientationDegree = orientationDegreeResultBuffer.get();
}
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
return orientationDegree;
}
/**
 * Prepares an image for OCR: converts it to 8 bpp grayscale, scales it up towards the target DPI,
 * smooths it with {@code gaussianKernel} and finally binarizes it.
 * <p>
 * The passed {@code pix} is consumed: on every non-throwing path it is disposed together with all
 * intermediate images, so only the returned pix remains for the caller to dispose.
 *
 * @param pix       the source image with a depth of 1, 8 or 32 bpp
 * @param imageDpi  the resolution of the source image
 * @param targetDpi the resolution the OCR engine should work with
 * @return the binarized image
 * @throws UnsupportedOperationException if the source depth is not 1, 8 or 32 bpp
 */
@SneakyThrows
private Pix processPix(Pix pix, float imageDpi, int targetDpi) {
    // Read everything we need from the source pix up front. The native pix is destroyed further down
    // (directly, or through one of its aliases grayScale/scaledUp), and its fields must not be
    // consulted afterwards.
    final int sourceDepth = pix.d;
    final int sourceWidth = pix.w;
    final int sourceHeight = pix.h;

    Pix grayScale;
    Pix scaledUp;
    Pix gaussian;
    Pix binarized;

    // convert to grayscale
    if (sourceDepth == 8) {
        grayScale = pix;
    } else if (sourceDepth == 32) {
        grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
        LeptUtils.disposePix(pix);
    } else if (sourceDepth == 1) {
        grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
        LeptUtils.disposePix(pix);
    } else {
        throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", sourceDepth));
    }

    // scale up
    float targetFactor = targetDpi / imageDpi;
    if (targetFactor > 2.1) {
        scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
        LeptUtils.disposePix(grayScale);
    } else if (targetFactor > 1.1) {
        scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
        LeptUtils.disposePix(grayScale);
    } else {
        scaledUp = grayScale;
    }

    // remove noise and prep for Otsu
    gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
    LeptUtils.disposePix(scaledUp);

    // Threshold to binary; the size check deliberately uses the dimensions of the unscaled source image
    if (sourceWidth < 100 || sourceHeight < 100) {
        binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
    } else {
        binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null);
        if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
            binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
        }
    }
    LeptUtils.disposePix(gaussian);
    return binarized;
}
/**
 * Creates the Tesseract handle used for orientation/script detection. It is initialised with the
 * "osd" data found under the directory named by the {@code TESSDATA_PREFIX} environment variable,
 * and Tesseract's debug output is redirected to {@code /dev/null}.
 *
 * @return the initialised handle
 */
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
    final String tessdataDirectory = System.getenv("TESSDATA_PREFIX");
    final ITessAPI.TessBaseAPI osdHandle = TessAPI1.TessBaseAPICreate();
    TessAPI1.TessBaseAPIInit3(osdHandle, tessdataDirectory, "osd");
    TessAPI1.TessBaseAPISetVariable(osdHandle, "debug_file", "/dev/null");
    return osdHandle;
}
}

View File

@ -1,136 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable;
import java.io.File;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.file.Path;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.TessAPI1;
/**
 * Worker thread that takes {@link OcrImage}s from a shared queue, runs Tesseract on each of them and
 * appends an {@link OcrResult} per image to the shared result list.
 * <p>
 * Lifecycle: the thread blocks on the queue until it gets interrupted, which signals that the image
 * extraction has finished. It then processes whatever is still queued and terminates.
 * <p>
 * NOTE(review): {@code results} is written from this thread without synchronization here, so the list
 * handed in presumably has to be thread-safe when several OCR threads share it - confirm at the call site.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OCRThread extends Thread {

    int id;
    BlockingQueue<OcrImage> imageInputQueue;
    Path tesseractOutputDir;
    List<OcrResult> results;
    OcrProgressLogger logger;
    Statistics stats;
    OcrServiceSettings settings;
    Tesseract2 instance;


    /**
     * Creates a worker with its own Tesseract instance configured from the given settings.
     *
     * @param id                 identifier of this worker, used when recording statistics
     * @param imageInputQueue    queue the images to process are taken from
     * @param tesseractOutputDir directory the Tesseract output files are written to
     * @param results            list the results are appended to
     * @param logger             progress logger notified after each image
     * @param stats              statistics collector for the Tesseract duration
     * @param settings           service settings (languages, page segmentation override)
     */
    public OCRThread(int id,
                     BlockingQueue<OcrImage> imageInputQueue,
                     Path tesseractOutputDir,
                     List<OcrResult> results,
                     OcrProgressLogger logger,
                     Statistics stats,
                     OcrServiceSettings settings) {

        this.id = id;
        this.imageInputQueue = imageInputQueue;
        this.tesseractOutputDir = tesseractOutputDir;
        this.results = results;
        this.logger = logger;
        this.stats = stats;
        this.settings = settings;
        this.instance = createInstance(settings);
    }


    /**
     * Processes images until interrupted, then drains the remaining queue entries.
     */
    @SneakyThrows
    @Override
    public void run() {

        // Interrupting signals that the image extraction has finished
        while (!isInterrupted()) {
            try {
                final OcrImage image = imageInputQueue.take();
                this.process(image);
            } catch (InterruptedException e) {
                // set isInterrupted to true (This exception may only happen during active waiting for queue, and then isInterrupted will not be set!)
                interrupt();
            }
        }

        // empty the queue; poll() returns null once nothing is left, so no exception is needed to end
        // the loop and a NoSuchElementException raised while processing an image is no longer swallowed
        OcrImage remaining;
        while ((remaining = imageInputQueue.poll()) != null) {
            this.process(remaining);
        }
        log.debug("Executed tesseract on all Images, finishing.");
    }


    /**
     * Runs Tesseract on a single image, releases its pix, records the result and updates progress
     * logging and statistics.
     */
    private void process(OcrImage image) {

        long timestamp = System.currentTimeMillis();

        String tmpOutputFileName = String.format("output_%04d_%04d", image.getPageNumber(), image.getNumberOnPage());
        String tesseractOutputFileName = tesseractOutputDir.resolve(tmpOutputFileName).toFile().toString();

        // a negative override means "use the mode the image considers optimal"
        int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();

        executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);

        image.destroyPix();

        results.add(OcrResult.create(image, tesseractOutputFileName));
        logger.logImageFinished(image, psm);
        stats.increaseTesseractDuration(id, System.currentTimeMillis() - timestamp);
    }


    /**
     * Writes the pix next to the OCR output as {@code <tesseractOutputFileName>.tiff} and lets
     * Tesseract produce an hOCR document for it.
     *
     * @param psm                     page segmentation mode to use
     * @param dpi                     resolution passed to Tesseract as {@code user_defined_dpi}
     * @param pix                     the image to recognise
     * @param tesseractOutputFileName output base name (without extension)
     */
    @SneakyThrows
    public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {

        Leptonica1.pixWrite(tesseractOutputFileName + ".tiff", pix, 5); // write the used image for later bold detection

        instance.setVariable("user_defined_dpi", String.valueOf(dpi));
        instance.setPageSegMode(psm);
        instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
    }


    /**
     * Builds the Tesseract instance used by this thread.
     */
    private static Tesseract2 createInstance(OcrServiceSettings settings) {

        Tesseract2 instance = new Tesseract2();
        instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
        instance.setOcrEngineMode(1); // set to LSTM based Engine
        instance.setLanguage(settings.getLanguages());
        return instance;
    }

}

View File

@ -1,28 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.settings;
import org.apache.pdfbox.cos.COSName;
import org.springframework.boot.context.properties.ConfigurationProperties;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Configuration of the OCR service, bound from properties with the prefix {@code ocr-service}.
 * The values assigned below are the defaults used when a property is not set.
 */
@Data
@ConfigurationProperties("ocr-service")
@FieldDefaults(level = AccessLevel.PRIVATE)
public class OcrServiceSettings {
int ocrThreadCount = 4; // Number of OCR threads
int imageExtractThreadCount = 2; // Number of image extraction threads
int gsProcessCount = 1; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if >= 0; a negative value keeps the mode chosen per image
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
// PDF name "KNECON_OCR"; presumably used to tag the marked content written by the OCR step - verify against the writer
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
boolean boldDetection = true; // if true, bold detection will be attempted
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
}

View File

@ -1,85 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.image.BufferedImage;
import java.nio.IntBuffer;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.sun.jna.ptr.PointerByReference;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Helpers for preparing images (AWT {@link BufferedImage}s and Leptonica {@link Pix}es) for OCR.
 */
@UtilityClass
public class ImageProcessingUtils {

    /**
     * Returns the image in a device color space.
     *
     * @param extractedImage the extracted image together with its PDF color space
     * @return the original image if its color space is DeviceRGB or DeviceGray, otherwise a new
     *         8-bit grayscale image the original was drawn onto
     */
    public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {

        BufferedImage image;
        if (extractedImage.colorSpace() instanceof PDDeviceRGB || extractedImage.colorSpace() instanceof PDDeviceGray) {
            image = extractedImage.image();
        } else {
            BufferedImage pdfImage = extractedImage.image();
            image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
            Graphics g = image.getGraphics();
            g.drawImage(pdfImage, 0, 0, null);
            g.dispose();
        }
        return image;
    }


    /**
     * Undoes a detected rotation in steps of 90 degrees.
     *
     * @param orientDegree the detected orientation in degrees
     * @param pix          the image to rotate
     * @return a new, rotated pix when {@code 360 - orientDegree} is 90, 180 or 270; otherwise the very
     *         same {@code pix} instance (callers compare by identity to decide what to dispose)
     */
    public Pix deRotatePix(int orientDegree, Pix pix) {

        return switch (360 - orientDegree) {
            case 90 -> Leptonica1.pixRotateOrth(pix, 1);
            case 180 -> Leptonica1.pixRotateOrth(pix, 2);
            case 270 -> Leptonica1.pixRotateOrth(pix, 3);
            default -> pix;
        };
    }


    /**
     * Paints white behind all translucent parts of the image; images without translucency are left untouched.
     *
     * @param image the image to modify in place
     */
    public static void setAlphaChannelToWhite(BufferedImage image) {

        if (image.getTransparency() == Transparency.TRANSLUCENT) {
            // NOTE: For BITMASK images, the color model is likely IndexColorModel,
            // and this model will contain the "real" color of the transparent parts
            // which is likely a better fit than unconditionally setting it to white.

            // Fill background with white
            Graphics2D graphics = image.createGraphics();
            try {
                graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
                graphics.setPaint(Color.WHITE);
                graphics.fillRect(0, 0, image.getWidth(), image.getHeight());
            } finally {
                graphics.dispose();
            }
        }
    }


    /**
     * Computes the ratio of counted pixels (as reported by {@code pixCountPixels}) to all pixels of the image.
     *
     * @param pix the image to inspect
     * @return the pixel ratio, or -1 if Leptonica could not count the pixels
     */
    public static double calculatePixelDensity(Pix pix) {

        IntBuffer pixelCount = IntBuffer.allocate(1);
        int result = Leptonica1.pixCountPixels(pix, pixelCount, null);
        if (result == 0) {
            // Multiply in double precision: height * width exceeds the int range for very large images.
            double totalPixels = (double) pix.h * pix.w;
            return pixelCount.get() / totalPixels;
        } else {
            return -1;
        }
    }

}

View File

@ -1,73 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
/**
 * Factory methods for the Laplacian convolution kernels used in image processing.
 */
@UtilityClass
public class KernelUtils {

    /**
     * Creates the 3x3 Laplacian kernel that includes the diagonal neighbours:
     * <pre>
     * -1, -1, -1
     * -1,  8, -1
     * -1, -1, -1
     * </pre>
     */
    public L_Kernel createFullLaplacianKernel() {

        return kernelFromWeights(new float[][]{
                {-1, -1, -1},
                {-1, 8, -1},
                {-1, -1, -1}});
    }


    /**
     * Creates the 5x5 Laplacian kernel:
     * <pre>
     *  0,  0, -1,  0,  0
     *  0, -1, -1, -1,  0
     * -1, -1, 12, -1, -1
     *  0, -1, -1, -1,  0
     *  0,  0, -1,  0,  0
     * </pre>
     */
    public L_Kernel createLaplacianKernel5x5() {

        return kernelFromWeights(new float[][]{
                {0, 0, -1, 0, 0},
                {0, -1, -1, -1, 0},
                {-1, -1, 12, -1, -1},
                {0, -1, -1, -1, 0},
                {0, 0, -1, 0, 0}});
    }


    /**
     * Creates the 3x3 Laplacian kernel without diagonal neighbours:
     * <pre>
     *  0, -1,  0
     * -1,  4, -1
     *  0, -1,  0
     * </pre>
     */
    public L_Kernel createLaplacianKernel() {

        return kernelFromWeights(new float[][]{
                {0, -1, 0},
                {-1, 4, -1},
                {0, -1, 0}});
    }


    /**
     * Allocates a kernel with the dimensions of the given matrix and copies every non-zero weight into it.
     * Zero entries are not written, so they keep whatever value {@code kernelCreate} initialises them with.
     */
    private L_Kernel kernelFromWeights(float[][] weights) {

        int rowCount = weights.length;
        int columnCount = weights[0].length;
        L_Kernel kernel = Leptonica1.kernelCreate(rowCount, columnCount);
        for (int row = 0; row < rowCount; row++) {
            for (int column = 0; column < columnCount; column++) {
                float weight = weights[row][column];
                if (weight != 0) {
                    Leptonica1.kernelSetElement(kernel, row, column, weight);
                }
            }
        }
        return kernel;
    }

}

View File

@ -1,64 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.experimental.UtilityClass;
/**
 * Helpers for distributing work items evenly across a fixed number of threads.
 */
@UtilityClass
public class ListSplittingUtils {

    /**
     * Splits the numbers {@code 1..totalNumberOfEntries} into {@code threadCount} contiguous sublists
     * whose sizes differ by at most one.
     *
     * @param totalNumberOfEntries the highest number to include
     * @param threadCount          the number of sublists to produce
     * @return one sublist per thread, in ascending order
     */
    public List<List<Integer>> buildBalancedContinuousSublist(Integer totalNumberOfEntries, int threadCount) {

        return buildBalancedSublist(IntStream.rangeClosed(1, totalNumberOfEntries).boxed().toList(), threadCount);
    }


    /**
     * Splits the entries into {@code threadCount} contiguous sublists whose sizes differ by at most one.
     * The sublists are views created with {@link List#subList(int, int)} on {@code entries}.
     *
     * @param entries     the entries to distribute
     * @param threadCount the number of sublists to produce
     * @return one (possibly empty) sublist per thread, preserving the order of {@code entries}
     */
    public <T> List<List<T>> buildBalancedSublist(List<T> entries, int threadCount) {

        List<Integer> balancedEntryCounts = buildBalancedEntryCounts(entries.size(), threadCount);
        List<List<T>> balancedSublist = new ArrayList<>(threadCount);
        int startIdx = 0;
        for (int numberOfEntriesPerThread : balancedEntryCounts) {
            balancedSublist.add(entries.subList(startIdx, startIdx + numberOfEntriesPerThread));
            startIdx += numberOfEntriesPerThread;
        }
        return balancedSublist;
    }


    /**
     * Distributes the entries across threads and splits each thread's share into batches.
     *
     * @param entries     the entries to distribute
     * @param threadCount the number of threads
     * @param batchSize   the number of batches each thread's share is split into (despite its name this
     *                    is a batch count, not the number of entries per batch)
     * @return the entries indexed as batch -> thread -> entries
     */
    public <T> List<List<List<T>>> buildBatchedBalancedSublist(List<T> entries, int threadCount, int batchSize) {

        // batches -> threads -> entries
        List<List<List<T>>> batchedBalancedSubList = new ArrayList<>();
        List<List<List<T>>> threadsWithBatches = buildBalancedSublist(entries, threadCount).stream().map(list -> buildBalancedSublist(list, batchSize)).toList();
        // swap first two dimensions
        for (int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
            List<List<T>> threadEntriesPerBatch = new ArrayList<>(threadCount);
            for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
                threadEntriesPerBatch.add(threadsWithBatches.get(threadIdx).get(batchIdx));
            }
            batchedBalancedSubList.add(threadEntriesPerBatch);
        }
        return batchedBalancedSubList;
    }


    /**
     * Computes how many entries each thread receives when the entries are handed out round-robin:
     * every thread gets {@code total / threadCount} entries and the first {@code total % threadCount}
     * threads get one more.
     *
     * @param totalNumberOfEntries the number of entries to distribute; values below 1 yield all-zero counts
     * @param threadCount          the number of threads
     * @return a list of {@code threadCount} counts that sum up to the number of entries
     * @throws ArithmeticException      if {@code threadCount} is 0 while there are entries to distribute
     * @throws IllegalArgumentException if {@code threadCount} is negative
     */
    public List<Integer> buildBalancedEntryCounts(int totalNumberOfEntries, int threadCount) {

        List<Integer> numberOfPagesPerThread = new ArrayList<>(threadCount);
        int total = Math.max(totalNumberOfEntries, 0);
        // Only divide when there is something to distribute, so that "no entries, no threads" still
        // yields an empty list instead of a division by zero.
        int baseCount = total == 0 ? 0 : total / threadCount;
        int threadsWithExtraEntry = total == 0 ? 0 : total % threadCount;
        for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
            numberOfPagesPerThread.add(threadIdx < threadsWithExtraEntry ? baseCount + 1 : baseCount);
        }
        return numberOfPagesPerThread;
    }

}

View File

@ -1,21 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.geom.AffineTransform;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
/**
 * Calculates the effective resolution of an image as it is placed on a PDF page.
 */
@UtilityClass
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PdfDpiCalculator {

    /**
     * Derives the DPI from the image's pixel width and the width it occupies after applying its CTM.
     *
     * @param imageBounds the untransformed bounds of the image
     * @param imageCTM    the transformation placing the image
     * @param width       the width of the image in pixels
     * @return the resolution, rounded up to the next integer
     */
    public int calculateDpi(QuadPoint imageBounds, AffineTransform imageCTM, double width) {

        QuadPoint placedBounds = imageBounds.getTransformed(imageCTM);
        // distance between corners a and d of the placed quad; divided by 72 to convert to inches
        double placedWidth = placedBounds.a().distance(placedBounds.d());
        double placedWidthInInches = placedWidth / 72;
        double dotsPerInch = width / placedWidthInInches;
        return (int) Math.ceil(dotsPerInch);
    }

}

View File

@ -1,73 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import lombok.SneakyThrows;
/**
 * Debug drawing helpers that write simple shapes onto a PDF page through an {@link ElementWriter}.
 */
public class PdfDraw {

    /**
     * Draws a grid of 15x15 unit cells covering the whole page. The cell at the origin (row 0, column 0)
     * is filled blue, all other cells are only stroked.
     *
     * @param writer the writer the cells are written to
     * @param page   the page whose width and height determine the grid size
     */
    @SneakyThrows
    public static void drawGrid(ElementWriter writer, Page page) {

        // The colors are created once and closed together with the builder, mirroring drawRectCollection;
        // previously a new ColorPt was allocated for every cell and never closed.
        try (var eb = new ElementBuilder(); var originColor = new ColorPt(0, 0, 1); var gridColor = new ColorPt(0.1, 0.1, 0.1)) {
            double dX = 15;
            double dY = 15;
            int nRows = (int) (page.getPageHeight() / dY) + 1;
            int nCols = (int) (page.getPageWidth() / dX) + 1;
            for (int row = 0; row < nRows; ++row) {
                for (int col = 0; col < nCols; ++col) {
                    Element cell = eb.createRect(col * dX, row * dY, dX, dY);
                    cell.setPathStroke(true);
                    cell.getGState().setLineWidth(1);
                    cell.getGState().setStrokeOpacity(0.1);
                    // NOTE(review): a ColorSpace is created per call and not closed here - confirm whether it needs closing
                    cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    if (row == 0 && col == 0) {
                        cell.getGState().setStrokeColor(originColor);
                        cell.setPathFill(true);
                        cell.getGState().setFillOpacity(0.8);
                        cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                        cell.getGState().setFillColor(originColor);
                    } else {
                        cell.setPathFill(false);
                        cell.getGState().setStrokeColor(gridColor);
                    }
                    writer.writePlacedElement(cell);
                }
            }
        }
    }


    /**
     * Draws every rectangle of the collection in red, stroked with a line width of 5 and filled
     * with 50% opacity.
     *
     * @param writer         the writer the rectangles are written to
     * @param rectCollection the rectangles to draw
     */
    @SneakyThrows
    public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {

        try (var colorPt = new ColorPt(1, 0, 0); var eb = new ElementBuilder()) {
            for (int i = 0; i < rectCollection.getNumRects(); ++i) {
                try (var r = rectCollection.getRectAt(i)) {
                    Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());

                    rect.setPathStroke(true);
                    rect.getGState().setLineWidth(5);
                    rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    rect.getGState().setStrokeColor(colorPt);

                    rect.setPathFill(true);
                    rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                    rect.getGState().setFillColor(colorPt);
                    rect.getGState().setFillOpacity(0.5);

                    writer.writePlacedElement(rect);
                }
            }
        }
    }

}

View File

@ -1,141 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.Rectangle;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import com.sun.jna.Pointer;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.OCRResult;
import net.sourceforge.tess4j.TessAPI1;
import net.sourceforge.tess4j.Tesseract1;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word;
/**
 * Overridden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted.
 */
@Slf4j
public class Tesseract2 extends Tesseract1 {
/**
 * Processes a single pix through the given renderer, wrapped in a begin/end document pair.
 * The return value of TessBaseAPIProcessPage is currently ignored (the check below is commented out).
 *
 * @return the mean text confidence reported by the handle after processing
 */
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
TessResultRendererBeginDocument(renderer, title);
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
TessResultRendererEndDocument(renderer);
// if (result == ITessAPI.FALSE) {
// throw new TesseractException("Error during processing page.");
// }
return TessBaseAPIMeanTextConf(getHandle());
}
/**
 * Single-image convenience overload of the array based variant below.
 *
 * @return the result for the image, or null if the image could not be processed
 */
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
if (!results.isEmpty()) {
return results.get(0);
} else {
return null;
}
}
/**
 * Runs OCR on every pix and writes the requested output documents to the matching output base.
 * The engine is initialised at the start of each call and disposed at the end of it.
 * Images that fail are logged and skipped, so the returned list may be shorter than the input arrays.
 *
 * @throws RuntimeException if the three arrays differ in length
 */
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
throw new RuntimeException("The three arrays must match in length.");
}
init();
setVariables();
List<OCRResult> results = new ArrayList<OCRResult>();
try {
for (int i = 0; i < pixs.length; i++) {
try {
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
// NOTE(review): the renderer is not deleted when createDocuments throws
TessDeleteResultRenderer(renderer);
// words are only collected when the mean confidence is positive
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
results.add(new OCRResult(meanTextConfidence, words));
} catch (Exception e) {
// skip the problematic image file
log.warn(e.getMessage(), e);
}
}
} finally {
dispose();
}
return results;
}
/**
 * Collects the recognised text elements at the given iterator level together with their
 * confidence and bounding box. Exceptions are logged and the words gathered so far are returned.
 */
private List<Word> getRecognizedWords(int pageIteratorLevel) {
List<Word> words = new ArrayList<>();
try {
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
TessPageIteratorBegin(pi);
do {
Pointer ptr = TessResultIteratorGetUTF8Text(ri, pageIteratorLevel);
if (ptr == null) {
// no text at this position; continue jumps to the loop condition, which advances the iterator
continue;
}
String text = ptr.getString(0);
TessAPI1.TessDeleteText(ptr);
float confidence = TessResultIteratorConfidence(ri, pageIteratorLevel);
// single-element out-parameters for the bounding box
IntBuffer leftB = IntBuffer.allocate(1);
IntBuffer topB = IntBuffer.allocate(1);
IntBuffer rightB = IntBuffer.allocate(1);
IntBuffer bottomB = IntBuffer.allocate(1);
TessPageIteratorBoundingBox(pi, pageIteratorLevel, leftB, topB, rightB, bottomB);
int left = leftB.get();
int top = topB.get();
int right = rightB.get();
int bottom = bottomB.get();
Word word = new Word(text, confidence, new Rectangle(left, top, right - left, bottom - top));
words.add(word);
} while (TessPageIteratorNext(pi, pageIteratorLevel) == TRUE);
// Only the result iterator is deleted; the separate delete of the page iterator is disabled.
// NOTE(review): ri is not deleted when an exception is thrown inside the loop.
// TessPageIteratorDelete(pi);
TessResultIteratorDelete(ri);
} catch (Exception e) {
log.warn(e.getMessage(), e);
}
return words;
}
/**
 * Creates the renderer chain for the requested formats. Only HOCR is handled; any other format is
 * ignored, and null is returned when no HOCR format was requested.
 */
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
TessResultRenderer renderer = null;
for (RenderedFormat format : formats) {
switch (format) {
case HOCR:
if (renderer == null) {
renderer = TessHOcrRendererCreate(outputbase);
} else {
TessResultRendererInsert(renderer, TessHOcrRendererCreate(outputbase));
}
break;
}
}
return renderer;
}
}

View File

@ -1,29 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
import lombok.SneakyThrows;
import lombok.experimental.SuperBuilder;
/**
 * Smoke test for {@link Type0FontMetricsFactory}: calculating metrics must not throw.
 */
@SuppressWarnings("PMD")
class Type0FontMetricsFactoryTest {

    @Test
    @SneakyThrows
    public void testStringWidth() {

        java.net.URL resource = java.util.Objects.requireNonNull(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf"),
                "InvisibleText.pdf not found on the test classpath");
        // URL#getPath() is URL-encoded (e.g. %20 for a space), which breaks when the project lives in a
        // directory with spaces or non-ASCII characters; going through the URI decodes the path correctly.
        File pdfFile = new File(resource.toURI());

        try (PDDocument document = Loader.loadPDF(pdfFile)) {
            Type0FontMetricsFactory metricsFactory = Type0FontMetricsFactory.regular(document);
            FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
        }
    }

}

View File

@ -1,36 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import static net.sourceforge.lept4j.ILeptonica.IFF_PNG;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * Manual check of {@link ImageProcessingUtils#deRotatePix}: writes the de-rotated variants of a local
 * image to /tmp for visual inspection. Disabled because it depends on machine-specific paths and on the
 * VCPKG_DYNAMIC_LIB environment variable.
 */
@Disabled
class ImageProcessingUtilsTest {

    @BeforeEach
    public void loadLeptonica() {

        String nativeLibraryPath = System.getenv("VCPKG_DYNAMIC_LIB");
        System.setProperty("jna.library.path", nativeLibraryPath);
    }


    @Test
    public void testRotation() {

        Pix source = Leptonica1.pixRead("/home/kschuettler/Downloads/painHarold.webp");

        Pix unrotated = ImageProcessingUtils.deRotatePix(0, source);
        Leptonica1.pixWrite("/tmp/0.png", unrotated, IFF_PNG);

        Pix fromQuarterTurn = ImageProcessingUtils.deRotatePix(90, source);
        Leptonica1.pixWrite("/tmp/90.png", fromQuarterTurn, IFF_PNG);

        Pix fromHalfTurn = ImageProcessingUtils.deRotatePix(180, source);
        Leptonica1.pixWrite("/tmp/180.png", fromHalfTurn, IFF_PNG);

        Pix fromThreeQuarterTurn = ImageProcessingUtils.deRotatePix(270, source);
        Leptonica1.pixWrite("/tmp/270.png", fromThreeQuarterTurn, IFF_PNG);
    }

}

View File

@ -1,21 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.Collection;
import org.junit.jupiter.api.Test;
/**
 * Verifies that splitting page numbers across threads yields one sublist per thread and loses no page.
 */
class ListSplittingUtilsTest {

    @Test
    public void testBalancedListSplitting() {

        final int numberOfPages = 48;
        final int threadCount = 18;

        var pagesPerThread = ListSplittingUtils.buildBalancedContinuousSublist(numberOfPages, threadCount);

        // one sublist per thread ...
        assertEquals(threadCount, pagesPerThread.size());
        // ... and together they contain every page exactly as often as it was handed in
        long distributedPages = pagesPerThread.stream().mapToLong(Collection::size).sum();
        assertEquals(numberOfPages, distributedPages);
    }

}

View File

@ -1,83 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import org.springframework.util.FileSystemUtils;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import lombok.SneakyThrows;
// YOU NEED GHOSTSCRIPT INSTALLED TO RUN THIS TEST!!!!

/**
 * Manual comparison of two ways to render PDF pages to images: PDFBox and parallel Ghostscript processes.
 */
@Disabled
public class Pdf2ImgTest {

    private static final int DPI = 150;


    /**
     * Renders every page of the sample document with PDFBox and writes it as a tif file.
     */
    @Test
    @SneakyThrows
    @Disabled
    public void testPDFBox() {

        String outputDir = OsUtils.getTemporaryDirectory("imageOutput", "");
        new File(outputDir).mkdirs();
        ClassPathResource resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");

        try (PDDocument document = Loader.loadPDF(resource.getFile())) {
            PDFRenderer renderer = new PDFRenderer(document);
            int pageCount = document.getNumberOfPages();
            for (int pageNumber = 0; pageNumber < pageCount; pageNumber++) {
                BufferedImage image = renderer.renderImageWithDPI(pageNumber, DPI);
                String targetPath = new File(outputDir + String.format("page%04d", pageNumber)).getAbsolutePath();
                boolean written = ImageIOUtil.writeImage(image, "tif", targetPath, DPI);
                System.out.printf("%d: %s%n", pageNumber, written);
            }
        }

        FileSystemUtils.deleteRecursively(new File(outputDir));
    }


    /**
     * Starts several Ghostscript processes on the sample document, each writing to its own directory,
     * and waits for all of them to finish.
     */
    @Test
    @SneakyThrows
    public void testGhostScriptParallel() {

        int numOfProcesses = 5;
        ClassPathResource resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
        String outputDir = "/tmp/ghostscript_out/";

        List<Process> processes = IntStream.range(0, numOfProcesses)
                .boxed()
                .parallel()
                .map(processIndex -> buildCmdArgs(processIndex, outputDir, resource))
                .map(Pdf2ImgTest::executeProcess)
                .toList();

        List<Integer> processExitCodes = new LinkedList<>();
        for (Process ghostscript : processes) {
            int exitCode = ghostscript.waitFor();
            processExitCodes.add(exitCode);
        }
        System.out.println("Ghostscripts finished with exit codes " + processExitCodes);

        FileSystemUtils.deleteRecursively(new File(outputDir));
    }


    /**
     * Launches the given command line as a new process.
     */
    @SneakyThrows
    private static Process executeProcess(String[] cmdArgs) {

        return Runtime.getRuntime().exec(cmdArgs);
    }


    /**
     * Creates the per-process output directory and assembles the Ghostscript command line that renders
     * the resource as gray tiff pages into it.
     */
    @SneakyThrows
    private static String[] buildCmdArgs(Integer i, String outputDir, ClassPathResource resource) {

        String outDir = outputDir + "/" + i + "/";
        new File(outDir).mkdirs();

        String resolutionArg = "-r" + DPI;
        String outputFileArg = "-sOutputFile=" + outDir + "page%04d";
        String inputFile = resource.getFile().toString();
        return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiffgray", resolutionArg, outputFileArg, inputFile, "-c", "quit"};
    }

}

View File

@ -0,0 +1,145 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.iqser.red.service</groupId>
<artifactId>ocr-service-v1</artifactId>
<version>1.0-SNAPSHOT</version>
</parent>
<artifactId>ocr-service-server-v1</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>ocr-service-api-v1</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>spring-commons</artifactId>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>metric-commons</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
</dependency>
<dependency>
<groupId>com.pdftron</groupId>
<artifactId>PDFNet</artifactId>
<version>9.4.0</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
<version>2.3.1.RELEASE</version>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-kms</artifactId>
<version>1.12.158</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<!-- Test -->
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>test-commons</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.amqp</groupId>
<artifactId>spring-rabbit-test</artifactId>
<version>2.3.1</version>
<scope>test</scope>
</dependency>
<dependency>
    <!-- test-only starter: keep it out of the runtime classpath and the repackaged jar -->
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-test</artifactId>
    <scope>test</scope>
    <exclusions>
        <exclusion>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-tomcat</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<annotationProcessors>
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
<annotationProcessor>com.dslplatform.json.processor.CompiledJsonAnnotationProcessor</annotationProcessor>
</annotationProcessors>
</configuration>
</plugin>
<plugin>
<!-- generate git.properties for exposure in /info -->
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
<configuration>
<generateGitPropertiesFile>true</generateGitPropertiesFile>
<gitDescribe>
<tags>true</tags>
</gitDescribe>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- repackages the generated jar into a runnable fat-jar and makes it
executable -->
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>repackage</goal>
</goals>
<configuration>
<executable>true</executable>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>pdftron</id>
<name>PDFNet Maven</name>
<url>https://pdftron.com/maven/release</url>
</repository>
</repositories>
</project>

View File

@ -0,0 +1,45 @@
package com.iqser.red.service.ocr.v1.server;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.cloud.openfeign.EnableFeignClients;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Import;
import org.springframework.scheduling.annotation.EnableAsync;
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import io.micrometer.core.aop.TimedAspect;
import io.micrometer.core.instrument.MeterRegistry;
/**
 * Spring Boot entry point of the OCR service.
 * <p>
 * Binds {@link OcrServiceSettings} as configuration properties, imports the web MVC defaults and the
 * messaging configuration, registers Feign clients from the package of
 * {@link FileStatusProcessingUpdateClient} and excludes the security auto-configurations.
 */
@EnableAsync
@EnableConfigurationProperties(OcrServiceSettings.class)
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
@Import({DefaultWebMvcConfiguration.class, MessagingConfiguration.class})
@EnableFeignClients(basePackageClasses = FileStatusProcessingUpdateClient.class)
public class Application {
/**
 * Entry point to the service application.
 *
 * @param args Any command line parameter given upon startup.
 */
public static void main(String[] args) {
SpringApplication.run(Application.class, args);
}
/**
 * Registers Micrometer's {@link TimedAspect} backed by the given registry
 * (presumably so that {@code @Timed} methods are measured - confirm usage in the services).
 *
 * @param registry the meter registry the aspect records to
 * @return the aspect bean
 */
@Bean
public TimedAspect timedAspect(MeterRegistry registry) {
return new TimedAspect(registry);
}
}

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.ocr.v1.server.client;
import org.springframework.cloud.openfeign.FeignClient;
import com.iqser.red.service.persistence.service.v1.api.resources.FileStatusProcessingUpdateResource;
/**
 * Feign client for the {@link FileStatusProcessingUpdateResource} API.
 *
 * <p>Declares no methods of its own; everything is inherited from the resource interface.
 * The target base URL is resolved from the {@code persistence-service.url} property.
 */
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${persistence-service.url}")
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {
}

View File

@ -0,0 +1,42 @@
package com.iqser.red.service.ocr.v1.server.configuration;
import org.springframework.amqp.core.Queue;
import org.springframework.amqp.core.QueueBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import lombok.RequiredArgsConstructor;
/**
 * Declares the RabbitMQ queues of the OCR service: the durable, priority-enabled work queue
 * and the durable dead-letter queue it routes rejected messages to.
 */
@Configuration
@RequiredArgsConstructor
public class MessagingConfiguration {

    public static final String OCR_QUEUE = "ocrQueue";
    public static final String OCR_DLQ = "ocrDLQ";

    public static final String X_DEAD_LETTER_EXCHANGE = "x-dead-letter-exchange";
    public static final String X_DEAD_LETTER_ROUTING_KEY = "x-dead-letter-routing-key";
    public static final String X_MAX_PRIORITY = "x-max-priority";

    public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";

    /** Value of the {@code x-max-priority} argument of the work queue. */
    private static final int OCR_QUEUE_MAX_PRIORITY = 2;


    /**
     * Work queue for OCR requests.
     *
     * <p>Dead letters go to the exchange named by the empty string with {@link #OCR_DLQ} as
     * routing key. The maximum priority is declared exactly once; the previous version set
     * the same {@code x-max-priority} argument twice (via {@code withArgument} and via
     * {@code maxPriority}).
     *
     * @return the durable OCR queue definition.
     */
    @Bean
    public Queue ocrQueue() {

        return QueueBuilder.durable(OCR_QUEUE)
                .withArgument(X_DEAD_LETTER_EXCHANGE, "")
                .withArgument(X_DEAD_LETTER_ROUTING_KEY, OCR_DLQ)
                .withArgument(X_MAX_PRIORITY, OCR_QUEUE_MAX_PRIORITY)
                .build();
    }


    /**
     * Dead-letter queue receiving messages that could not be processed from {@link #OCR_QUEUE}.
     *
     * @return the durable dead-letter queue definition.
     */
    @Bean
    public Queue ocrDeadLetterQueue() {

        return QueueBuilder.durable(OCR_DLQ).build();
    }

}

View File

@ -0,0 +1,35 @@
package com.iqser.red.service.ocr.v1.server.initializer;
import javax.annotation.PostConstruct;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.pdftron.pdf.PDFNet;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
/**
 * Initialises the PDFNet library once when the bean is constructed.
 */
@Component
@RequiredArgsConstructor
public class PDFNetInitializer {

    /** License key handed to PDFNet; defaults to an empty string when the property is absent. */
    @Value("${pdftron.license:}")
    private String pdftronLicense;

    /** Directory registered as PDFNet resource search path; defaults to {@code /tmp}. */
    @Value("${pdftron.ocrmodule.path:/tmp}")
    private String ocrModulePath;


    /**
     * Sets the PDFNet temp path, registers the resource search path and initialises PDFNet
     * with the configured license.
     *
     * <p>Deliberately a {@code @PostConstruct} hook: do not change this back to an application
     * runner. As an application runner, messages are taken from the queue before PDFNet is
     * initialised, which leads to an {@code UnsatisfiedLinkError}.
     */
    @PostConstruct
    @SneakyThrows
    public void init() {

        PDFNet.setTempPath("/tmp/pdftron");
        PDFNet.addResourceSearchPath(ocrModulePath);
        PDFNet.initialize(pdftronLicense);
    }

}

View File

@ -0,0 +1,54 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.ArrayDeque;
import java.util.Deque;
/**
 * Tracks the effective clipping area while walking a page's content stream.
 *
 * <p>The bottom entry is the area passed to the constructor. Every graphics-state save pushes
 * a copy of the current area, every restore pops it again, and clipping paths are intersected
 * into the top entry.
 */
@Data
public class ClippingPathStack {

    /** Slack used when testing rectangles against the clip, so zero-width/height boxes still have an area. */
    private static final double TOLERANCE = 1e-3;

    private Deque<Area> stack = new ArrayDeque<>();


    /**
     * Creates a stack whose initial clipping area is the given rectangle.
     *
     * <p>The rectangle is normalised first: a box given by its corners in reverse order has a
     * negative width or height, which would otherwise produce an empty area and make every
     * element look clipped.
     *
     * @param rectangle the initial clipping rectangle.
     */
    @SneakyThrows
    public ClippingPathStack(Rect rectangle) {

        double x = Math.min(rectangle.getX1(), rectangle.getX2());
        double y = Math.min(rectangle.getY1(), rectangle.getY2());
        double width = Math.abs(rectangle.getWidth());
        double height = Math.abs(rectangle.getHeight());
        stack.push(new Area(new Rectangle2D.Double(x, y, width, height)));
    }


    /**
     * Narrows the current clipping area to its intersection with the given path.
     *
     * @param path the clipping path, already carrying the desired winding rule.
     */
    @SneakyThrows
    public void intersectClippingPath(GeneralPath path) {

        getCurrentClippingPath().intersect(new Area(path));
    }


    /**
     * Tests whether the rectangle, grown by the tolerance on every side, intersects the
     * current clipping area.
     *
     * <p>Height or width are zero for straight lines even though they are rendered, so the
     * rectangle must be given a strictly positive extent. The growth is symmetric regardless
     * of the sign of the coordinates; the previous sign-dependent offset moved the rectangle
     * instead of growing it whenever {@code x} or {@code y} was not positive.
     *
     * @return {@code true} if the grown rectangle intersects the current clip.
     */
    public boolean almostIntersects(double x, double y, double width, double height) {

        return getCurrentClippingPath().intersects(x - TOLERANCE, y - TOLERANCE, width + 2 * TOLERANCE, height + 2 * TOLERANCE);
    }


    /**
     * @return the clipping area on top of the stack.
     */
    public Area getCurrentClippingPath() {

        return stack.peek();
    }


    /**
     * Pushes a copy of the current clipping area, so later intersections do not affect the
     * area that becomes current again after {@link #leaveGState()}.
     */
    public void enterNewGState() {

        stack.push(new Area(stack.peek()));
    }


    /**
     * Drops the current clipping area and returns to the one below it.
     *
     * <p>The bottom entry is never removed: content with more restores than saves would
     * otherwise leave the stack empty and every later clip test would fail on a {@code null}
     * area.
     */
    public void leaveGState() {

        if (stack.size() > 1) {
            stack.pop();
        }
    }

}

View File

@ -0,0 +1,149 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.*;
import lombok.experimental.SuperBuilder;
import java.awt.geom.Rectangle2D;
/**
 * Snapshot of the properties of a PDFTron {@link Element} that are used to recognise the same
 * element again in a later pass over the content stream.
 *
 * <p>Every snapshot carries the element type and bounding box; the nested subclasses add the
 * type-specific properties for text, paths and images.
 */
@Data
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public abstract class ElementFeatures {

    private int elementType;
    private Rectangle2D boundingBox;


    /**
     * Builds the snapshot matching the type of the given element.
     *
     * @param element a path, text, image or inline-image element.
     * @return the extracted features.
     * @throws UnsupportedOperationException for any other element type.
     * @throws PDFNetException               if reading the element fails.
     */
    public static ElementFeatures extractFeatures(Element element) throws PDFNetException {

        switch (element.getType()) {
            case Element.e_path:
                return pathFeatures(element);
            case Element.e_text:
                return textFeatures(element);
            case Element.e_image:
            case Element.e_inline_image:
                return imageFeatures(element);
            default:
                throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
        }
    }


    /** Snapshot of a path element. */
    private static ElementFeatures pathFeatures(Element element) throws PDFNetException {

        return Path.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .isClippingPath(element.isClippingPath())
                .isClipWindingFill(element.isClipWindingFill())
                .isStroked(element.isStroked())
                .isFilled(element.isFilled())
                .isWindingFill(element.isWindingFill())
                .build();
    }


    /** Snapshot of a text element. */
    private static ElementFeatures textFeatures(Element element) throws PDFNetException {

        return Text.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .text(element.getTextString())
                .font(element.getGState().getFont().getType())
                .fontsize(element.getGState().getFontSize())
                .build();
    }


    /** Snapshot of an image or inline-image element. */
    private static ElementFeatures imageFeatures(Element element) throws PDFNetException {

        return Image.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .dataSize(element.getImageDataSize())
                .height(element.getImageHeight())
                .width(element.getImageWidth())
                .renderingIntent(element.getImageRenderingIntent())
                .componentNum(element.getComponentNum())
                .bitsPerComponent(element.getBitsPerComponent())
                .build();
    }


    /** Copies origin and size of a PDFTron rectangle into an AWT rectangle. */
    private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {

        return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    }


    /**
     * Checks whether the element has the recorded type and, within tolerance, the recorded
     * bounding box. An element without a bounding box never matches.
     *
     * @param element the element to compare against this snapshot.
     * @return {@code true} on a match.
     * @throws PDFNetException if reading the element fails.
     */
    public boolean almostMatches(Element element) throws PDFNetException {

        return element.getType() == elementType
                && element.getBBox() != null
                && rectsAlmostMatch(element.getBBox());
    }


    /**
     * @return {@code true} if the two values differ by less than {@code 1e-3}.
     */
    protected boolean almostEqual(double a, double b) {

        return Math.abs(a - b) < 1e-3;
    }


    /** Compares origin, width and height of the given box with the recorded bounding box. */
    @SneakyThrows
    private boolean rectsAlmostMatch(Rect bBox) {

        return almostEqual(bBox.getX1(), boundingBox.getX())
                && almostEqual(bBox.getY1(), boundingBox.getY())
                && almostEqual(bBox.getWidth(), boundingBox.getWidth())
                && almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }


    /** Features of a text element: text string, font type and font size. */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Text extends ElementFeatures {

        private String text;
        private int font;
        private double fontsize;


        /** Base match plus identical text, identical font type and almost equal font size. */
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element)
                    && text.equals(element.getTextString())
                    && font == element.getGState().getFont().getType()
                    && almostEqual(fontsize, element.getGState().getFontSize());
        }

    }

    /** Features of a path element: its clipping, stroke and fill flags. */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Path extends ElementFeatures {

        private boolean isClippingPath;
        private boolean isClipWindingFill;
        private boolean isStroked;
        private boolean isFilled;
        private boolean isWindingFill;


        /** Base match plus identical clipping, stroke and fill flags. */
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element)
                    && isClippingPath == element.isClippingPath()
                    && isClipWindingFill == element.isClipWindingFill()
                    && isStroked == element.isStroked()
                    && isFilled == element.isFilled()
                    && isWindingFill == element.isWindingFill();
        }

    }

    /** Features of an image element: data size, pixel dimensions and colour layout. */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    public static class Image extends ElementFeatures {

        private int dataSize;
        private int height;
        private int width;
        private int renderingIntent;
        private int componentNum;
        private int bitsPerComponent;


        /** Base match plus identical data size, dimensions, rendering intent and component layout. */
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element)
                    && dataSize == element.getImageDataSize()
                    && height == element.getImageHeight()
                    && width == element.getImageWidth()
                    && renderingIntent == element.getImageRenderingIntent()
                    && componentNum == element.getComponentNum()
                    && bitsPerComponent == element.getBitsPerComponent();
        }

    }

}

View File

@ -0,0 +1,15 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Pairs the rectangle occupied by an image with a flag telling whether the image has
 * transparency.
 */
@Data
@AllArgsConstructor
public class ImagePosition {
// Area covered by the image.
private Rectangle rectangle;
// Whether the image carries transparency.
private boolean hasTransparency;
}

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import java.util.HashMap;
import java.util.Map;
import lombok.Data;
/**
 * Classification result attached to an image: a label plus a map of probabilities.
 */
@Data
public class Classification {
// Probability values by name; presumably keyed by candidate label - TODO confirm against the producer.
private Map<String, Float> probabilities = new HashMap<>();
// The assigned label.
private String label;
}

View File

@ -0,0 +1,11 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Geometry-related filter results of an image: one for its size and one for its format.
 */
@Data
public class FilterGeometry {
private ImageSize imageSize;
private ImageFormat imageFormat;
}

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Filter results of an image: geometry filters, probability filter and an overall flag.
 */
@Data
public class Filters {
private FilterGeometry geometry;
private Probability probability;
// Overall result; presumably true when every individual filter passed - TODO confirm.
private boolean allPassed;
}

View File

@ -0,0 +1,11 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Width and height of an image.
 */
@Data
public class Geometry {
private float width;
private float height;
}

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Format filter result of an image: a quotient plus flags for "too tall" and "too wide".
 */
@Data
public class ImageFormat {
// NOTE(review): presumably an aspect ratio; the exact definition is not visible here.
private float quotient;
private boolean tooTall;
private boolean tooWide;
}

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Metadata of a single image: classification, position, geometry, filter results and an
 * alpha flag.
 */
@Data
public class ImageMetadata {
private Classification classification;
private Position position;
private Geometry geometry;
private Filters filters;
// Alpha flag of the image.
private boolean alpha;
}

View File

@ -0,0 +1,31 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
/**
 * Image metadata of one file, identified by dossier id and file id.
 *
 * <p>The list is annotated for both Jackson and DSL-JSON so that it is accepted under the
 * names {@code imageMetadata} and {@code data}.
 */
@Data
@CompiledJson
public class ImageServiceResponse {
private String dossierId;
private String fileId;
// Jackson: property name "imageMetadata" with alias "data"; DSL-JSON: alternative name "imageMetadata".
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
private List<ImageMetadata> data = new ArrayList<>();
// Explicit setter carrying the same annotations as the field; it takes the place of the Lombok-generated one.
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
public void setData(List<ImageMetadata> data) {this.data = data;}
}

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Size filter result of an image: a quotient plus flags for "too large" and "too small".
 */
@Data
public class ImageSize {
// NOTE(review): presumably image size relative to the page; the exact definition is not visible here.
private float quotient;
private boolean tooLarge;
private boolean tooSmall;
}

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Position of an image: two x and two y coordinates plus the number of the page it is on.
 */
@Data
public class Position {
private float x1;
private float x2;
private float y1;
private float y2;
// NOTE(review): numbering base (0 or 1) is not visible here - confirm against the producer.
private int pageNumber;
}

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Probability filter result of an image, holding a single "unconfident" flag.
 */
@Data
public class Probability {
private boolean unconfident;
}

View File

@ -0,0 +1,69 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Thin facade over {@link StorageService} for the files the OCR service reads and writes.
 *
 * <p>Objects are addressed by a storage id derived from dossier id, file id and
 * {@link FileType}, see {@link #getStorageId(String, String, FileType)}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class FileStorageService {

    private final StorageService storageService;


    /**
     * Builds the storage id {@code <dossierId>/<fileId>.<TYPE_NAME><extension>}.
     *
     * @param dossierId id of the dossier the file belongs to.
     * @param fileId    id of the file.
     * @param fileType  kind of object; contributes its enum name and its extension.
     * @return the storage id.
     */
    public static String getStorageId(String dossierId, String fileId, FileType fileType) {

        return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
    }


    /**
     * Loads the complete {@link FileType#ORIGIN} object into memory.
     *
     * <p>The storage stream is closed once it has been read; previously it was left open.
     *
     * @return the file content.
     */
    @SneakyThrows
    public byte[] getOriginalFile(String dossierId, String fileId) {

        try (InputStream stream = getOriginalFileAsStream(dossierId, fileId)) {
            return IOUtils.toByteArray(stream);
        }
    }


    /**
     * Opens the {@link FileType#ORIGIN} object as a stream.
     *
     * @return the content stream; the caller is responsible for closing it.
     */
    @SneakyThrows
    public InputStream getOriginalFileAsStream(String dossierId, String fileId) {

        return storageService.getObject(getStorageId(dossierId, fileId, FileType.ORIGIN)).getInputStream();
    }


    /**
     * Stores the given stream as the {@link FileType#ORIGIN} object.
     */
    public void storeOriginalFile(String dossierId, String fileId, InputStream stream) {

        storageService.storeObject(getStorageId(dossierId, fileId, FileType.ORIGIN), stream);
    }


    /**
     * @return {@code true} if a {@link FileType#UNTOUCHED} object exists for the file.
     */
    public boolean untouchedFileExists(String dossierId, String fileId) {

        return storageService.objectExists(getStorageId(dossierId, fileId, FileType.UNTOUCHED));
    }


    /**
     * Stores the given bytes as the {@link FileType#UNTOUCHED} object.
     */
    public void storeUntouchedFile(String dossierId, String fileId, byte[] data) {

        storageService.storeObject(getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data));
    }


    /**
     * Reads the {@link FileType#IMAGE_INFO} object and deserialises it from JSON.
     *
     * @return the stored image service response.
     */
    @SneakyThrows
    public ImageServiceResponse getImageServiceResponse(String dossierId, String fileId) {

        return storageService.readJSONObject(getStorageId(dossierId, fileId, FileType.IMAGE_INFO), ImageServiceResponse.class);
    }

}

View File

@ -0,0 +1,463 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.springframework.stereotype.Service;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
@Slf4j
@Service
public class InvisibleElementService {
/*
handled cases:
Text or Path outside of clipping path
Text which is transparent or is set to not render
Text or Path that have been painted over by visible and filled Paths
unhandled cases:
Text covered by widely stroked path
Text same color as background
Any Text set to clipping with its many interactions with other elements
*/
/**
 * Rewrites every page of the given PDF in two passes: the first drops clipped elements and
 * invisibly rendered text and records elements painted over by later filled paths, the
 * second removes those recorded elements.
 *
 * <p>With {@code delta} set, the passes produce a debug rendition instead (only the elements
 * that would have been removed, recoloured, plus outlines of the covered ones), which is
 * additionally written to disk via {@code debugSave}.
 *
 * <p>The document is serialised exactly once and is closed before returning, also when
 * processing fails.
 *
 * @param pdfFile the PDF to process.
 * @param delta   {@code false} for the cleaned document, {@code true} for the debug rendition.
 * @return the linearized result document.
 */
@SneakyThrows
public byte[] removeInvisibleTextOrPathElements(byte[] pdfFile, boolean delta) {

    PDFDoc pdfDoc = new PDFDoc(pdfFile);
    try {
        // NOTE(review): reader and writer wrap native PDFNet objects as well; consider releasing
        // them explicitly if the PDFNet version in use offers destroy()/close() on them.
        ElementWriter writer = new ElementWriter();
        ElementReader reader = new ElementReader();
        Set<Integer> visited = new TreeSet<>();

        for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
            Page page = iterator.next();
            List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
            // NOTE(review): visited is reset between the two passes but not before the next page, so
            // forms seen in this page's second pass are skipped in the next page's first pass - confirm intent.
            visited.clear();
            removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
        }

        byte[] result = pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
        if (delta) {
            debugSave(result);
        }
        return result;
    } finally {
        try {
            pdfDoc.close();
        } catch (Exception e) {
            // Never let a failing close hide the outcome of the processing itself.
            log.debug("Failed to close document", e);
        }
    }
}
/**
 * First pass over a page: rewrites its content through {@code processElements}, starting
 * with the page's media box as clipping area.
 *
 * @param page    the page to rewrite in place.
 * @param reader  reader used to walk the page content.
 * @param writer  writer used to replace the page content.
 * @param visited object numbers already handled; the page's own number is added.
 * @param delta   whether the debug rendition is produced.
 * @return the elements found to be painted over by a later filled, opaque path.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
        throws PDFNetException {

    List<ElementFeatures> paintedOver = new ArrayList<>();
    List<ElementFeatures> stillVisible = new ArrayList<>();
    ClippingPathStack clipStack = new ClippingPathStack(page.getMediaBox());

    int pageObjectNumber = (int) page.getSDFObj().getObjNum();
    visited.add(pageObjectNumber);

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    processElements(reader, writer, visited, clipStack, delta, paintedOver, stillVisible);
    writer.end();
    reader.end();

    return paintedOver;
}
/**
 * Walks all remaining elements of the reader and dispatches each one by type.
 *
 * <p>Images, text, paths and forms go to their dedicated handlers. Group begin/end elements
 * push/pop the clipping stack and are copied; every other element is copied unchanged.
 *
 * @param reader            source of the elements.
 * @param writer            target the surviving elements are written to.
 * @param visited           object numbers of forms that were already processed.
 * @param clippingPathStack clipping state, updated while walking.
 * @param delta             whether the debug rendition is produced.
 * @param coveredElements   collects elements painted over by later paths.
 * @param visibleElements   elements currently considered visible.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
                             List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    Element current;
    while ((current = reader.next()) != null) {
        switch (current.getType()) {
            case Element.e_image:
            case Element.e_inline_image: {
                processImages(current, writer, clippingPathStack, delta, visibleElements);
                break;
            }
            case Element.e_text: {
                processText(current, writer, clippingPathStack, delta, visibleElements);
                break;
            }
            case Element.e_path: {
                processPath(current, writer, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            }
            case Element.e_form: {
                processForm(reader, writer, current, visited, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            }
            case Element.e_group_begin: {
                clippingPathStack.enterNewGState();
                writer.writeElement(current);
                break;
            }
            case Element.e_group_end: {
                clippingPathStack.leaveGState();
                writer.writeElement(current);
                break;
            }
            default: {
                writer.writeElement(current);
            }
        }
    }
}
/**
 * Handles an image or inline-image element.
 *
 * <p>An image without a bounding box cannot be tested against the clip; it is copied
 * unchanged, as text without a bounding box is, instead of being silently dropped. Images
 * inside the clip are remembered as visible in both modes, so delta mode can also report
 * images painted over by later paths. Normal mode writes images inside the clip; delta mode
 * writes only those outside it.
 *
 * @param imageElement      the image element.
 * @param writer            target writer.
 * @param clippingPathStack current clipping state.
 * @param delta             whether the debug rendition is produced.
 * @param visibleElements   receives the features of images inside the clip.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    Rect rect = imageElement.getBBox();
    if (rect == null) {
        writer.writeElement(imageElement);
        return;
    }

    boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    if (inClippingPath) {
        visibleElements.add(ElementFeatures.extractFeatures(imageElement));
    }

    boolean keep = delta ? !inClippingPath : inClippingPath;
    if (keep) {
        writer.writeElement(imageElement);
    }
}
/**
 * Handles a text element.
 *
 * <p>Text without a bounding box is copied unchanged. Text that is inside the clip and
 * rendered visibly is remembered in {@code visibleElements}. In normal mode only such text
 * is written; in delta mode only the removed text is written, recoloured: blue and forced
 * to an opaque fill when it was rendered invisibly, red when it was merely clipped.
 *
 * <p>In delta mode each element is written at most once. Text that is both clipped and
 * invisible used to be written twice, the first copy still being invisible.
 *
 * @param textElement       the text element.
 * @param writer            target writer.
 * @param clippingPathStack current clipping state.
 * @param delta             whether the debug rendition is produced.
 * @param visibleElements   receives the features of visible text.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
                         Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    Rect rect = textElement.getBBox();
    if (rect == null) {
        writer.writeElement(textElement);
        return;
    }

    GState gState = textElement.getGState();
    boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    boolean isTextVisible = isTextRenderedVisibly(gState);
    boolean keep = inClippingPath && isTextVisible;

    if (keep) {
        visibleElements.add(ElementFeatures.extractFeatures(textElement));
    }

    if (!delta) {
        if (keep) {
            writer.writeElement(textElement);
        } else if (textElement.hasTextMatrix()) {
            /*
             A PDFTron "text" element corresponds to a Tj command. If a Tm command sits directly above it
             in the PDF, PDFTron joins both into one element; hasTextMatrix() detects exactly that case.
             Tm positions the whole BT/ET segment, which may contain several Tj commands, so dropping the
             element entirely would move the text that follows. Therefore only the Tm part is written.
             */
            writer.writeGStateChanges(textElement);
        }
        return;
    }

    if (!isTextVisible) {
        // Invisible text: make it show up in blue with an opaque fill.
        gState.setFillColorSpace(ColorSpace.createDeviceRGB());
        gState.setFillColor(new ColorPt(0, 0, 1));
        gState.setTextRenderMode(GState.e_fill_text);
        gState.setFillOpacity(1);
        writer.writeElement(textElement);
    } else if (!inClippingPath) {
        // Visible but clipped text: show it in red.
        gState.setFillColorSpace(ColorSpace.createDeviceRGB());
        gState.setFillColor(new ColorPt(1, 0, 0));
        writer.writeElement(textElement);
    }
}
/**
 * Handles a form XObject: copies the invoking element and, unless the form was processed
 * before, rewrites the form's own content stream recursively.
 *
 * <p>The form content is processed on its own level of the clipping stack, so clipping paths
 * set inside the form no longer affect the elements that follow the form invocation. (The
 * PDF specification treats painting a form XObject as wrapped in a graphics-state
 * save/restore.)
 *
 * @param reader            reader positioned on the form element.
 * @param writer            writer of the enclosing content stream.
 * @param element           the form element.
 * @param visited           object numbers of forms already processed; updated here.
 * @param clippingPathStack current clipping state.
 * @param delta             whether the debug rendition is produced.
 * @param coveredElements   collects elements painted over by later paths.
 * @param allElements       elements currently considered visible.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
                         List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
        throws PDFNetException {

    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    if (!visited.add(formObjNum)) {
        // Already rewritten through another invocation.
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    clippingPathStack.enterNewGState();
    try {
        processElements(reader, formWriter, visited, clippingPathStack, delta, coveredElements, allElements);
    } finally {
        clippingPathStack.leaveGState();
    }

    formWriter.end();
    reader.end();
}
/**
 * Handles a path element.
 *
 * <p>The path is converted to an AWT path and transformed with the element's CTM. Clipping
 * paths are intersected into the clipping stack and written (as a non-clipping path in delta
 * mode). Other paths inside the clip are remembered as visible and written in normal mode;
 * if such a path is filled and opaque, every visible element whose bounding box it contains
 * is moved to {@code overlappedElements}. Paths outside the clip are written only in delta
 * mode, recoloured red.
 *
 * @param pathElement        the path element.
 * @param writer             target writer.
 * @param clippingPathStack  current clipping state.
 * @param delta              whether the debug rendition is produced.
 * @param overlappedElements collects elements painted over by this path.
 * @param visibleElements    elements currently considered visible; updated here.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
                         List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());

    // transform path to initial user space
    linePath.transform(getAffineTransform(pathElement.getCTM()));

    Rectangle2D bounds = linePath.getBounds2D();
    boolean inClippingPath = clippingPathStack.almostIntersects(bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());

    if (pathElement.isClippingPath()) {
        int windingRule = pathElement.isClipWindingFill() ? GeneralPath.WIND_NON_ZERO : GeneralPath.WIND_EVEN_ODD;
        linePath.setWindingRule(windingRule);
        clippingPathStack.intersectClippingPath(linePath);
        pathElement.setPathClip(!delta);
        writer.writeElement(pathElement);
        return;
    }

    if (inClippingPath) {
        if (isFilledAndNonTransparent(pathElement)) {
            List<ElementFeatures> nowCovered = new ArrayList<>();
            for (ElementFeatures candidate : visibleElements) {
                if (almostContains(linePath, candidate.getBoundingBox())) {
                    nowCovered.add(candidate);
                }
            }
            overlappedElements.addAll(nowCovered);
            visibleElements.removeAll(nowCovered);
        }
        visibleElements.add(ElementFeatures.extractFeatures(pathElement));
        if (!delta) {
            writer.writeElement(pathElement);
        }
    } else if (delta) {
        pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
        pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
        pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
        pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
        writer.writeElement(pathElement);
    }
}
/**
 * Converts a PDFTron matrix into the equivalent AWT transform by passing its components
 * a, b, c, d, h, v in that order to the {@link AffineTransform} constructor.
 *
 * @param ctm the matrix to convert.
 * @return the AWT transform.
 * @throws PDFNetException if reading the matrix fails.
 */
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {

    double a = ctm.getA();
    double b = ctm.getB();
    double c = ctm.getC();
    double d = ctm.getD();
    double h = ctm.getH();
    double v = ctm.getV();
    return new AffineTransform(a, b, c, d, h, v);
}
/**
 * Second pass over a page: rewrites its content, leaving out the elements recorded as
 * painted over.
 *
 * <p>In delta mode nothing is removed; instead a green outline is drawn for each recorded
 * element and the list is emptied before the content is copied. Elements still left in the
 * list afterwards were not found again and are reported with a warning.
 *
 * @param page               the page to rewrite in place.
 * @param reader             reader used to walk the page content.
 * @param writer             writer used to replace the page content.
 * @param visited            object numbers of forms already processed in this pass.
 * @param overlappedElements the elements to remove; matched entries are deleted from it.
 * @param delta              whether the debug rendition is produced.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
        throws PDFNetException {

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());

    if (delta) {
        for (ElementFeatures covered : overlappedElements) {
            drawBBox(writer, covered.getBoundingBox(), "#00FF00");
        }
        overlappedElements.clear();
    }
    processOverlappedElements(reader, writer, visited, overlappedElements, delta);

    writer.end();
    reader.end();

    if (!overlappedElements.isEmpty()) {
        log.warn(overlappedElements.size() + " overlapped elements have not been found and removed");
    }
}
/**
 * Copies all remaining elements of the reader except those matching an entry of
 * {@code coveredElements}.
 *
 * <p>Forms are processed recursively. A path, image or text element is dropped when it
 * matches an entry of the list; that entry is then removed, so each entry suppresses at most
 * one element. The text check uses {@code Element.e_text} instead of the literal {@code 3}.
 *
 * @param reader          source of the elements.
 * @param writer          target writer.
 * @param visited         object numbers of forms already processed in this pass.
 * @param coveredElements features of the elements to drop; matched entries are removed.
 * @param delta           whether the debug rendition is produced.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
        throws PDFNetException {

    for (Element element = reader.next(); element != null; element = reader.next()) {
        switch (element.getType()) {
            case Element.e_form:
                processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
                break;
            case Element.e_path:
            case Element.e_image:
            case Element.e_inline_image:
            case Element.e_text:
                boolean anyMatch = false;
                for (int i = 0; i < coveredElements.size(); i++) {
                    if (coveredElements.get(i).almostMatches(element)) {
                        // Remove by position so exactly the matched entry is consumed.
                        coveredElements.remove(i);
                        anyMatch = true;
                        break;
                    }
                }
                if (!anyMatch) {
                    writer.writeElement(element);
                } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
                    /*
                     A PDFTron "text" element corresponds to a Tj command. If a Tm command sits directly above
                     it in the PDF, PDFTron joins both into one element; hasTextMatrix() detects exactly that
                     case. Tm positions the whole BT/ET segment, which may contain several Tj commands, so
                     dropping the element entirely would move the text that follows. Therefore only the Tm
                     part is written.
                     */
                    writer.writeGStateChanges(element);
                }
                break;
            default:
                writer.writeElement(element);
        }
    }
}
/**
 * Second-pass counterpart of the form handling: copies the form element and, unless the form
 * was already processed in this pass, rewrites its content stream through
 * {@code processOverlappedElements}.
 *
 * @param reader           reader positioned on the form element.
 * @param writer           writer of the enclosing content stream.
 * @param element          the form element.
 * @param visited          object numbers of forms already processed; updated here.
 * @param elementsToRemove features of the elements to drop.
 * @param delta            whether the debug rendition is produced.
 * @throws PDFNetException if PDFNet fails while reading or writing.
 */
private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, List<ElementFeatures> elementsToRemove, boolean delta)
        throws PDFNetException {

    writer.writeElement(element);

    Obj formObj = element.getXObject();
    if (visited.contains((int) formObj.getObjNum())) {
        return;
    }
    visited.add((int) formObj.getObjNum());

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processOverlappedElements(reader, formWriter, visited, elementsToRemove, delta);

    formWriter.end();
    reader.end();
}
/**
 * Decides from the graphics state whether text would actually be painted.
 *
 * <p>Text counts as hidden when the render mode is "invisible", or when the opacity of every
 * paint operation used by the mode (fill, stroke, or both) is zero. All other render modes
 * count as visible.
 *
 * @param gState graphics state of the text element.
 * @return {@code false} if the text is hidden as described above.
 * @throws PDFNetException if reading the graphics state fails.
 */
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {

    final int renderMode = gState.getTextRenderMode();
    final boolean hidden;
    if (renderMode == GState.e_invisible_text) {
        hidden = true;
    } else if (renderMode == GState.e_fill_text) {
        hidden = gState.getFillOpacity() == 0;
    } else if (renderMode == GState.e_stroke_text) {
        hidden = gState.getStrokeOpacity() == 0;
    } else if (renderMode == GState.e_fill_stroke_text) {
        hidden = gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0;
    } else {
        hidden = false;
    }
    return !hidden;
}
/**
 * Translates PDFTron path data into an AWT {@link GeneralPath}.
 *
 * <p>The operator array is walked in order while a cursor consumes the matching number of
 * coordinates: two for move-to and line-to, six for a cubic curve, four (x, y, width,
 * height) for a rectangle and none for close-path. A rectangle is emitted as a move-to
 * followed by three line-to segments.
 *
 * @param pathData operators and points of the path element.
 * @return the equivalent AWT path, in the coordinates of the path data.
 * @throws PDFNetException if an operator other than the five listed above is encountered.
 */
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {

    GeneralPath result = new GeneralPath();
    double[] coords = pathData.getPoints();
    byte[] operators = pathData.getOperators();

    int cursor = 0;
    for (byte operator : operators) {
        switch (operator) {
            case PathData.e_moveto: {
                result.moveTo(coords[cursor], coords[cursor + 1]);
                cursor += 2;
                break;
            }
            case PathData.e_lineto: {
                result.lineTo(coords[cursor], coords[cursor + 1]);
                cursor += 2;
                break;
            }
            case PathData.e_cubicto: {
                result.curveTo(coords[cursor], coords[cursor + 1],
                        coords[cursor + 2], coords[cursor + 3],
                        coords[cursor + 4], coords[cursor + 5]);
                cursor += 6;
                break;
            }
            case PathData.e_rect: {
                double left = coords[cursor];
                double bottom = coords[cursor + 1];
                double right = left + coords[cursor + 2];
                double top = bottom + coords[cursor + 3];
                cursor += 4;
                result.moveTo(left, bottom);
                result.lineTo(right, bottom);
                result.lineTo(right, top);
                result.lineTo(left, top);
                break;
            }
            case PathData.e_closepath: {
                result.closePath();
                break;
            }
            default:
                throw new PDFNetException("Invalid Element Type", 0, "", "", "");
        }
    }
    return result;
}
/**
 * Tests whether {@code outer} contains {@code inner} once {@code inner} has been shrunk by a
 * small tolerance on every side.
 *
 * <p>The shrink is symmetric regardless of the sign of the coordinates. The previous
 * sign-dependent offset moved the lower-left corner outwards for negative coordinates, so a
 * rectangle exactly covered by {@code outer} was reported as not contained.
 *
 * <p>A rectangle narrower or lower than twice the tolerance shrinks to an empty one, which
 * {@link Shape#contains(Rectangle2D)} implementations typically report as not contained.
 *
 * @param outer the covering shape.
 * @param inner the rectangle that should lie inside {@code outer}.
 * @return {@code true} if the shrunken rectangle lies entirely inside {@code outer}.
 */
private boolean almostContains(Shape outer, Rectangle2D inner) {

    double tolerance = 1e-3;
    Rectangle2D shrunken = new Rectangle2D.Double(inner.getX() + tolerance,
            inner.getY() + tolerance,
            inner.getWidth() - 2 * tolerance,
            inner.getHeight() - 2 * tolerance);
    return outer.contains(shrunken);
}
/**
 * @param element the path element to inspect.
 * @return {@code true} if the element is filled and its fill opacity is exactly 1.
 * @throws PDFNetException if reading the element fails.
 */
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {

    if (!element.isFilled()) {
        return false;
    }
    return element.getGState().getFillOpacity() == 1;
}
/**
 * Writes the given PDF bytes to {@code /tmp/delta.pdf} for manual inspection.
 *
 * @param pdfFile the document to write.
 * @throws RuntimeException if the file cannot be opened or written; the original
 *                          {@link IOException} is kept as cause.
 */
private void debugSave(byte[] pdfFile) {

    String fileLocation = "/tmp/delta.pdf";
    try (var f_out = FileUtils.openOutputStream(new File(fileLocation))) {
        f_out.write(pdfFile);
    } catch (IOException e) {
        throw new RuntimeException("File location: " + fileLocation + " could not be opened, no file will be saved", e);
    }
}
/**
 * Draws the outline of a rectangle in the given colour onto the writer's target.
 *
 * @param writer   writer the rectangle is placed with.
 * @param r        the rectangle to outline.
 * @param hexcolor colour as {@code #RRGGBB}; each pair of hex digits is scaled to 0..1.
 */
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {

    double red = Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d;
    double green = Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d;
    double blue = Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d;
    ColorPt strokeColor = new ColorPt(red, green, blue);

    ElementBuilder builder = new ElementBuilder();
    Element outline = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
    outline.setPathStroke(true);
    outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
    outline.getGState().setStrokeColor(strokeColor);

    writer.writePlacedElement(outline);
}
}

View File

@ -0,0 +1,177 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.*;
@Slf4j
@Service
@RequiredArgsConstructor
public class OCRService {
public static final String ENGLISH = "eng";
private final FileStorageService fileStorageService;
private final OcrServiceSettings settings;
private final RabbitTemplate rabbitTemplate;
private final ObjectMapper objectMapper;
private final InvisibleElementService invisibleElementService;
/**
 * Loads the original file and its image metadata from storage, runs OCR and returns the
 * resulting PDF.
 *
 * <p>The storage stream is closed as soon as the file has been read into memory; previously
 * it was never closed.
 *
 * @param dossierId id of the dossier the file belongs to.
 * @param fileId    id of the file.
 * @return an in-memory stream over the OCRed PDF.
 */
@Timed("redactmanager_PDFTron-ocrDocument")
@SneakyThrows
public InputStream ocrDocument(String dossierId, String fileId) {

    ImageServiceResponse imageServiceResponse;
    byte[] fileBytes;
    try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
        imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
        fileBytes = IOUtils.toByteArray(fileStream);
    }

    byte[] ocrBytes = ocr(fileBytes, fileId, imageServiceResponse);
    return new ByteArrayInputStream(ocrBytes);
}
/**
 * Runs OCR on the given PDF and returns the linearized, optimised result.
 *
 * <p>Steps: strip invisible elements, group the image positions by page, announce the number
 * of pages to OCR, OCR those pages into per-page documents, swap each OCRed page into the
 * main document, optimise, save, and announce completion.
 *
 * <p>Every PDFNet document opened here is closed exactly once. Per-page OCR documents that
 * were not merged because of an exception are closed in the {@code finally} block instead of
 * leaking.
 *
 * @param file                 the original PDF.
 * @param fileId               id of the file, used in the status messages.
 * @param imageServiceResponse image metadata that determines which pages and areas are OCRed.
 * @return the OCRed PDF.
 * @throws RuntimeException wrapping any failure during processing.
 */
@SuppressFBWarnings("REC_CATCH_EXCEPTION")
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {

    PDFDoc pdfDoc = null;
    Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());

    var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(file, false);

    try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
        pdfDoc = new PDFDoc(fileWithoutInvisibleText);

        Map<Integer, List<ImagePosition>> pages = new HashMap<>();
        // TODO take logic to ignore small and combine images from image-service.
        // TODO Then replace logic so ocr-service is independent from image-service.
        imageServiceResponse.getData()
                .forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
                        .add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
                                imageMetadata.getGeometry().getWidth(),
                                imageMetadata.getGeometry().getHeight(),
                                imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));

        rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
                objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.size()).build()));

        ocrPages(pdfDoc, fileId, pages, pdfDocMap);

        // Replace each OCRed page in the main document. Insert-then-remove keeps the page count
        // stable, so the iteration order does not matter.
        for (var iterator = pdfDocMap.entrySet().iterator(); iterator.hasNext(); ) {
            var entry = iterator.next();
            var ocrDoc = entry.getValue();
            var page = entry.getKey();

            Page ocrPage = ocrDoc.getPageIterator(1).next();
            pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage);
            pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1));
            ocrDoc.close();
            // Merged and closed: take it out so the cleanup below does not close it again.
            iterator.remove();
        }

        Optimizer.optimize(pdfDoc);
        pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);

        rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
                objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
                        .fileId(fileId)
                        .numberOfPagesToOCR(pages.size())
                        .numberOfOCRedPages(pages.size())
                        .ocrFinished(true)
                        .build()));

        return out.toByteArray();
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        // Manual synchronisation is required when iterating a synchronized map.
        synchronized (pdfDocMap) {
            pdfDocMap.values().forEach(this::closeDocumentQuietly);
            pdfDocMap.clear();
        }
        closeDocumentQuietly(pdfDoc);
    }
}


/**
 * Closes the document if present; a failure to close is only logged at debug level.
 */
private void closeDocumentQuietly(PDFDoc document) {

    if (document == null) {
        return;
    }
    try {
        document.close();
    } catch (Exception e) {
        log.debug("Failed to close document", e);
    }
}
@SneakyThrows
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
int numberOfOCRedPages = 0;
for (var pageEntry : pages.entrySet()) {
try {
RectCollection rectCollection = new RectCollection();
var page = pageEntry.getKey();
Page pdfPage = pdfDoc.getPageIterator(page).next();
pdfPage.setMediaBox(pdfPage.getCropBox());
for (ImagePosition imagePosition : pageEntry.getValue()) {
Rectangle rectangle = imagePosition.getRectangle();
// Warning coordinate system is different in this call macOs/Linux
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
}
PDFDoc ocrDoc = new PDFDoc();
ocrDoc.pagePushBack(pdfPage);
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
OCROptions options = new OCROptions();
options.addTextZonesForPage(rectCollection, 1);
options.addLang(ENGLISH);
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrDoc, options);
rectCollection.clear();
} catch (Exception e) {
log.warn("Failed to process PDF page {} - {}", pageEntry.getKey(), e);
}
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pages.keySet().size())
.numberOfOCRedPages(++numberOfOCRedPages)
.build()));
log.warn("Done page {}", pageEntry);
}
}
}

View File

@ -0,0 +1,79 @@
package com.iqser.red.service.ocr.v1.server.service;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * RabbitMQ listener that runs OCR for incoming document requests and reports the outcome to the
 * file status service.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class OcrMessageReceiver {

    private final ObjectMapper objectMapper;
    private final FileStorageService fileStorageService;
    private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
    private final OCRService ocrService;


    /**
     * Handles an OCR request: marks the file as OCR-processing, stores a copy of the original file
     * as the untouched file if none exists yet, OCRs the document, stores the result as the new
     * original file and reports success.
     *
     * @param in JSON-serialized {@link DocumentRequest}
     * @throws JsonProcessingException if the message cannot be deserialized
     */
    @RabbitHandler
    @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
    public void receiveOcr(String in) throws JsonProcessingException {

        DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
        String dossierId = ocrRequestMessage.getDossierId();
        String fileId = ocrRequestMessage.getFileId();

        long start = System.currentTimeMillis();
        log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId);

        setStatusOcrProcessing(dossierId, fileId);

        // Keep a copy of the file as it was before OCR; the original is overwritten below.
        if (!fileStorageService.untouchedFileExists(dossierId, fileId)) {
            byte[] originalFile = fileStorageService.getOriginalFile(dossierId, fileId);
            fileStorageService.storeUntouchedFile(dossierId, fileId, originalFile);
        }

        var ocrResult = ocrService.ocrDocument(dossierId, fileId);
        fileStorageService.storeOriginalFile(dossierId, fileId, ocrResult);

        long end = System.currentTimeMillis();
        log.info("Successfully processed ocr for file with dossierId {} and fileId {}, took {}", dossierId, fileId, end - start);

        fileStatusProcessingUpdateClient.ocrSuccessful(dossierId, fileId);
    }


    /**
     * Handles messages arriving on {@code MessagingConfiguration.OCR_DLQ} by reporting the file as
     * OCR-failed.
     *
     * @param in JSON-serialized {@link DocumentRequest}
     * @throws JsonProcessingException if the message cannot be deserialized
     */
    @RabbitHandler
    @RabbitListener(queues = MessagingConfiguration.OCR_DLQ, concurrency = "1")
    public void receiveOcrDQL(String in) throws JsonProcessingException {

        DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
        log.info("OCR DLQ received: {}", ocrRequestMessage);
        fileStatusProcessingUpdateClient.ocrFailed(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
    }


    /**
     * Reports the file as OCR-processing. An HTTP 409 CONFLICT response rejects the message without
     * requeueing it; any other Feign error is logged and processing continues.
     */
    private void setStatusOcrProcessing(String dossierId, String fileId) {

        try {
            fileStatusProcessingUpdateClient.ocrProcessing(dossierId, fileId);
        } catch (FeignException e) {
            if (e.status() == HttpStatus.CONFLICT.value()) {
                // Keep the cause so the original Feign failure stays visible in the stack trace.
                throw new AmqpRejectAndDontRequeueException(e.getMessage(), e);
            }
            // Best effort: a failed status update must not abort OCR, but it should not be silent.
            log.warn("Failed to set status to ocr processing for file with dossierId {} and fileId {}, continuing", dossierId, fileId, e);
        }
    }

}

Some files were not shown because too many files have changed in this diff Show More