Compare commits
126 Commits
renovate/c
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5fca39728b | ||
|
|
cd6390fde1 | ||
|
|
bc459ee966 | ||
|
|
47e7f8b297 | ||
|
|
22392e083d | ||
|
|
52a1fb4a05 | ||
|
|
378436cb2f | ||
|
|
f1204acc60 | ||
|
|
998755c3e3 | ||
|
|
c598f62633 | ||
|
|
2e25ee2155 | ||
|
|
7f04fb3c6f | ||
|
|
ff32f016eb | ||
|
|
821ef265fe | ||
|
|
7fcb6652ef | ||
|
|
61b1010e24 | ||
|
|
7b5a175440 | ||
|
|
18ba1daaef | ||
|
|
c61f71871e | ||
|
|
cc2937d0d2 | ||
|
|
71255d9fc9 | ||
|
|
1f9dac17e3 | ||
|
|
5712292698 | ||
|
|
1395318e18 | ||
|
|
842b794153 | ||
|
|
4b3ccc28e2 | ||
|
|
b469ea4174 | ||
|
|
253bb70519 | ||
|
|
d55f245c5e | ||
|
|
7ed1632c6f | ||
|
|
6be5dcf305 | ||
|
|
7f0fb149a9 | ||
|
|
6cab77c5c1 | ||
|
|
1e3dc3df24 | ||
|
|
f54f526f44 | ||
|
|
d8011bdba5 | ||
|
|
ea11013132 | ||
|
|
6d69b783f1 | ||
|
|
5b3261d229 | ||
|
|
b5a78a4396 | ||
|
|
3b320bfb00 | ||
|
|
021b18ada3 | ||
|
|
23bc84bd98 | ||
|
|
2e37b8eec9 | ||
|
|
c4c20d15ae | ||
|
|
d2f2def1c2 | ||
|
|
2bbc3775c5 | ||
|
|
2aaa53f441 | ||
|
|
724bb58969 | ||
|
|
1e08405082 | ||
|
|
150f2153c0 | ||
|
|
ca9e22b190 | ||
|
|
0c6ce2d77b | ||
|
|
b48db538fd | ||
|
|
14cc9941ff | ||
|
|
fdb3f3476b | ||
|
|
75bd2142ec | ||
|
|
9010ee8691 | ||
|
|
eaa6973a1f | ||
|
|
74d5f8d8e0 | ||
|
|
fb1fe35bc1 | ||
|
|
912f00aa84 | ||
|
|
bab16ad9b2 | ||
|
|
be4656189b | ||
|
|
8944b57344 | ||
|
|
67540950b8 | ||
|
|
6f29270e66 | ||
|
|
b961c9e324 | ||
|
|
4b6411161e | ||
|
|
14982eae7c | ||
|
|
99fc16130b | ||
|
|
80d38fb785 | ||
|
|
c06974ce69 | ||
|
|
591c7d7fab | ||
|
|
0300a087d4 | ||
|
|
98752ff1d1 | ||
|
|
ae09a59a7c | ||
|
|
65d818200f | ||
|
|
6fe95c6940 | ||
|
|
202132e14c | ||
|
|
0264e28cc2 | ||
|
|
a50f54676e | ||
|
|
1926707ae1 | ||
|
|
d3190844a3 | ||
|
|
c7ccbae6ff | ||
|
|
880bebcafc | ||
|
|
955ff6281d | ||
|
|
efd3a1d952 | ||
|
|
bb5b4a2fd8 | ||
|
|
6f99664906 | ||
|
|
574f7ac25e | ||
|
|
12217f2459 | ||
|
|
19747cbca5 | ||
|
|
2632d2023d | ||
|
|
4c225c2219 | ||
|
|
3d09f46844 | ||
|
|
77355b5367 | ||
|
|
57e194fcd0 | ||
|
|
c556687499 | ||
|
|
759bae6499 | ||
|
|
aa45fa84bb | ||
|
|
a82676c36b | ||
|
|
948c4bed79 | ||
|
|
6d3ec8a9db | ||
|
|
6533501ffc | ||
|
|
607b9be6f5 | ||
|
|
acba4cb103 | ||
|
|
bf3fe7f13d | ||
|
|
ede443a47a | ||
|
|
f5f1f70ffd | ||
|
|
1c62c5ddf4 | ||
|
|
506b888424 | ||
|
|
37ff2b982a | ||
|
|
06c49cc412 | ||
|
|
262204bcca | ||
|
|
04a0925a6c | ||
|
|
57ef7da5b3 | ||
|
|
7e20541d73 | ||
|
|
33412589c0 | ||
|
|
0ff07979ee | ||
|
|
0ad4682571 | ||
|
|
74f9f123f4 | ||
|
|
7525a54341 | ||
|
|
7209d47862 | ||
|
|
82db83936d | ||
|
|
856d52951c |
47
.gitignore
vendored
47
.gitignore
vendored
@ -9,6 +9,49 @@
|
||||
**/tmp/
|
||||
**/.apt_generated/
|
||||
|
||||
HELP.md
|
||||
target/
|
||||
!.mvn/wrapper/maven-wrapper.jar
|
||||
!**/src/main/**/target/
|
||||
!**/src/test/**/target/
|
||||
|
||||
### maven build ###
|
||||
*.class
|
||||
/out/
|
||||
/build/
|
||||
/target/
|
||||
**/out/
|
||||
**/build/
|
||||
**/target/
|
||||
|
||||
### STS ###
|
||||
.apt_generated
|
||||
.classpath
|
||||
.factorypath
|
||||
.project
|
||||
.settings
|
||||
.springBeans
|
||||
.sts4-cache
|
||||
.gradle
|
||||
|
||||
### IntelliJ IDEA ###
|
||||
.idea
|
||||
*.iws
|
||||
*.iml
|
||||
*.ipr
|
||||
|
||||
### NetBeans ###
|
||||
/nbproject/private/
|
||||
/nbbuild/
|
||||
/dist/
|
||||
/nbdist/
|
||||
/.nb-gradle/
|
||||
build/
|
||||
!**/src/main/**/build/
|
||||
!**/src/test/**/build/
|
||||
|
||||
### VS Code ###
|
||||
.vscode/
|
||||
|
||||
.factorypath
|
||||
.springBeans
|
||||
@ -26,3 +69,7 @@
|
||||
**/.DS_Store
|
||||
**/classpath-data.json
|
||||
**/dependencies-and-licenses-overview.txt
|
||||
gradle.properties
|
||||
gradlew
|
||||
gradlew.bat
|
||||
gradle/
|
||||
|
||||
@ -1,6 +1,25 @@
|
||||
variables:
|
||||
SONAR_PROJECT_KEY: 'LM_license-service'
|
||||
# SONAR_PROJECT_KEY: 'ocr-service:ocr-service-server'
|
||||
GIT_SUBMODULE_STRATEGY: recursive
|
||||
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
||||
include:
|
||||
- project: 'gitlab/gitlab'
|
||||
ref: 'main'
|
||||
file: 'ci-templates/maven_java.yml'
|
||||
file: 'ci-templates/gradle_java.yml'
|
||||
|
||||
deploy:
|
||||
stage: deploy
|
||||
tags:
|
||||
- dind
|
||||
script:
|
||||
- echo "Building with gradle version ${BUILDVERSION}"
|
||||
- gradle -Pversion=${BUILDVERSION} publish
|
||||
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
|
||||
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
|
||||
artifacts:
|
||||
reports:
|
||||
dotenv: version.env
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||
- if: $CI_COMMIT_TAG
|
||||
8
.gitmodules
vendored
Normal file
8
.gitmodules
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta"]
|
||||
path = ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta
|
||||
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
|
||||
update = merge
|
||||
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/basf"]
|
||||
path = ocr-service-v1/ocr-service-server/src/test/resources/files/basf
|
||||
url = https://gitlab.knecon.com/fforesight/documents/basf.git
|
||||
update = merge
|
||||
87
README.md
Normal file
87
README.md
Normal file
@ -0,0 +1,87 @@
|
||||
# OCR Service
|
||||
## Overview
|
||||
The OCR service is a tool designed for extracting text content from PDF files. It utilizes Tesseract, Leptonica, PDFTron, PDFBox, and Ghostscript to perform various tasks, including removing invisible elements and watermarks, extracting images, stitching striped images, binarizing images, running OCR on the processed images, and writing the recognized text back to the original PDF. This service is particularly useful for obtaining machine-readable text from PDF documents.
|
||||
|
||||
## Dependencies
|
||||
[Tesseract](https://github.com/tesseract-ocr/tesseract)
|
||||
[Leptonica](http://leptonica.org/)
|
||||
[PDFTron](https://apryse.com/)
|
||||
[PDFBox](https://pdfbox.apache.org/)
|
||||
[Ghostscript](https://www.ghostscript.com/)
|
||||
## Functionality
|
||||
1. Invisible Element and Watermark Removal
|
||||
The service uses PDFTron to attempt the removal of invisible elements and watermarks from the PDF.
|
||||
2. Image Extraction
|
||||
Extracts all images from the PDF using PDFBox
|
||||
3. Striped Image Detection and Stitching
|
||||
Detects if images are striped and stitches them together using Ghostscript.
|
||||
4. Image Processing
|
||||
- Convert to grayscale
|
||||
- Upscale to target DPI
|
||||
- Filter using Gauss kernel
|
||||
- Binarize the resulting images using Leptonica and the Otsu thresholding algorithm
|
||||
- Despeckle using various morphological operations
|
||||
5. OCR Processing
|
||||
Runs Tesseract on the images to extract text.
|
||||
6. Font style detection
|
||||
Detection of bold text using stroke width estimation
|
||||
7. Text Integration
|
||||
Draws the resulting text onto the original PDF using PDFBox.
|
||||
|
||||
Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage.
|
||||
Therefore, choosing your thread counts carefully leads to optimal performance.
|
||||
For example, with 18 available cores, I achieved the highest performance with 2 image extraction threads, 2 Ghostscript processes, and 16 OCR threads.
|
||||
|
||||
Setting all threads to basically unlimited (1000+) leads to comparable performance without laborious thread tuning, but at the cost of (potentially a lot) more RAM.
|
||||
|
||||
## Installation
|
||||
To run the OCR service, ensure that the following dependencies are installed:
|
||||
|
||||
1. Ghostscript: Install using apt.
|
||||
```bash
|
||||
sudo apt install ghostscript
|
||||
```
|
||||
2. Tesseract and Leptonica: Install using [vcpkg](https://github.com/microsoft/vcpkg) with the commands below, and set the environment variable `VCPKG_DYNAMIC_LIB` to your vcpkg lib folder (e.g. `~/vcpkg/installed/x64-linux-dynamic/lib`).
|
||||
```bash
|
||||
vcpkg install tesseract --triplet x64-linux-dynamic
|
||||
```
|
||||
```bash
|
||||
vcpkg install leptonica --triplet x64-linux-dynamic
|
||||
```
|
||||
3. Other dependencies are handled by Gradle build
|
||||
```bash
|
||||
gradle build
|
||||
```
|
||||
|
||||
## Configuration
|
||||
Configuration settings are available in the OcrServiceSettings class.
|
||||
These settings can be overridden using environment variables, e.g.
|
||||
`OCR_SERVICE_OCR_THREAD_COUNT=16`
|
||||
|
||||
Possible configurations and their defaults include:
|
||||
|
||||
```java
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int imageExtractThreadCount = 4; // Number of image extraction threads
|
||||
int gsProcessCount = 4; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
int minImageWidth = 20; // Minimum width for images to be processed
|
||||
boolean debug = false; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
boolean removeWatermark; // If false, watermarks will not be removed
|
||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||
```
|
||||
## Integration
|
||||
|
||||
The OCR-service communicates via RabbitMQ and uses the queues `ocr_request_queue`, `ocr_response_queue`,
|
||||
`ocr_dead_letter_queue`, and `ocr_status_update_response_queue`.
|
||||
|
||||
### ocr_request_queue
|
||||
This queue is used to start the OCR process; a `DocumentRequest` must be passed as the message. The service will then download the PDF from the provided cloud storage.
|
||||
### ocr_response_queue
|
||||
This queue is also used to signal the end of processing.
|
||||
### ocr_dead_letter_queue
|
||||
This queue is used to signal an error has occurred during processing.
|
||||
### ocr_status_update_response_queue
|
||||
This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total count may change when fewer images are found than initially assumed.
|
||||
15
buildSrc/build.gradle.kts
Normal file
15
buildSrc/build.gradle.kts
Normal file
@ -0,0 +1,15 @@
|
||||
/*
|
||||
* This file was generated by the Gradle 'init' task.
|
||||
*
|
||||
* This project uses @Incubating APIs which are subject to change.
|
||||
*/
|
||||
|
||||
plugins {
|
||||
// Support convention plugins written in Kotlin. Convention plugins are build scripts in 'src/main' that automatically become available as plugins in the main build.
|
||||
`kotlin-dsl`
|
||||
}
|
||||
|
||||
repositories {
|
||||
// Use the plugin portal to apply community plugins in convention plugins.
|
||||
gradlePluginPortal()
|
||||
}
|
||||
@ -0,0 +1,70 @@
|
||||
plugins {
|
||||
`java-library`
|
||||
pmd
|
||||
checkstyle
|
||||
jacoco
|
||||
}
|
||||
|
||||
group = "com.knecon.fforesight.service"
|
||||
|
||||
|
||||
|
||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||
java.targetCompatibility = JavaVersion.VERSION_17
|
||||
|
||||
pmd {
|
||||
isConsoleOutput = true
|
||||
}
|
||||
|
||||
tasks.pmdMain {
|
||||
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
|
||||
}
|
||||
|
||||
tasks.pmdTest {
|
||||
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
|
||||
}
|
||||
|
||||
tasks.named<Test>("test") {
|
||||
useJUnitPlatform()
|
||||
reports {
|
||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||
}
|
||||
minHeapSize = "512m"
|
||||
maxHeapSize = "8192m"
|
||||
}
|
||||
|
||||
tasks.test {
|
||||
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
|
||||
}
|
||||
|
||||
tasks.jacocoTestReport {
|
||||
dependsOn(tasks.test) // tests are required to run before generating the report
|
||||
reports {
|
||||
xml.required.set(true)
|
||||
csv.required.set(false)
|
||||
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
|
||||
}
|
||||
}
|
||||
|
||||
java {
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
repositories {
|
||||
mavenLocal()
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/gindev/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull()
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull()
|
||||
}
|
||||
}
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/PDFTron/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull()
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull()
|
||||
}
|
||||
}
|
||||
mavenCentral()
|
||||
}
|
||||
39
config/checkstyle/checkstyle.xml
Normal file
39
config/checkstyle/checkstyle.xml
Normal file
@ -0,0 +1,39 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
|
||||
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
|
||||
<module name="Checker">
|
||||
<property
|
||||
name="severity"
|
||||
value="error"/>
|
||||
<module name="TreeWalker">
|
||||
<module name="SuppressWarningsHolder"/>
|
||||
<module name="MissingDeprecated"/>
|
||||
<module name="MissingOverride"/>
|
||||
<module name="AnnotationLocation"/>
|
||||
<module name="JavadocStyle"/>
|
||||
<module name="NonEmptyAtclauseDescription"/>
|
||||
<module name="IllegalImport"/>
|
||||
<module name="RedundantImport"/>
|
||||
<module name="RedundantModifier"/>
|
||||
<module name="EmptyBlock"/>
|
||||
<module name="DefaultComesLast"/>
|
||||
<module name="EmptyStatement"/>
|
||||
<module name="EqualsHashCode"/>
|
||||
<module name="ExplicitInitialization"/>
|
||||
<module name="IllegalInstantiation"/>
|
||||
<module name="ModifiedControlVariable"/>
|
||||
<module name="MultipleVariableDeclarations"/>
|
||||
<module name="PackageDeclaration"/>
|
||||
<module name="ParameterAssignment"/>
|
||||
<module name="SimplifyBooleanExpression"/>
|
||||
<module name="SimplifyBooleanReturn"/>
|
||||
<module name="StringLiteralEquality"/>
|
||||
<module name="OneStatementPerLine"/>
|
||||
<module name="FinalClass"/>
|
||||
<module name="ArrayTypeStyle"/>
|
||||
<module name="UpperEll"/>
|
||||
<module name="OuterTypeFilename"/>
|
||||
</module>
|
||||
<module name="FileTabCharacter"/>
|
||||
<module name="SuppressWarningsFilter"/>
|
||||
</module>
|
||||
20
config/pmd/pmd.xml
Normal file
20
config/pmd/pmd.xml
Normal file
@ -0,0 +1,20 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
22
config/pmd/test_pmd.xml
Normal file
22
config/pmd/test_pmd.xml
Normal file
@ -0,0 +1,22 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon test ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="TestClassWithoutTestCases"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
1
gradle.properties.kts
Normal file
1
gradle.properties.kts
Normal file
@ -0,0 +1 @@
|
||||
version = 4.0-SNAPSHOT
|
||||
Binary file not shown.
@ -1,118 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>platform-docker-dependency</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<relativePath/>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>ocr-service-image-v1</artifactId>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<version>3.0-SNAPSHOT</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<properties>
|
||||
<service.server>ocr-service-server-v1</service.server>
|
||||
<platform.jar>${service.server}.jar</platform.jar>
|
||||
<docker.skip.push>false</docker.skip.push>
|
||||
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>exec-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>docker-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>download-platform-jar</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>copy</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<artifactItems>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>${service.server}</artifactId>
|
||||
<version>${version}</version>
|
||||
<type>jar</type>
|
||||
<overWrite>true</overWrite>
|
||||
<destFileName>${platform.jar}</destFileName>
|
||||
</dependency>
|
||||
</artifactItems>
|
||||
<outputDirectory>${docker.build.directory}</outputDirectory>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>docker-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<images>
|
||||
<image>
|
||||
<name>${docker.image.name}</name>
|
||||
<build>
|
||||
<dockerFileDir>${docker.build.directory}</dockerFileDir>
|
||||
<args>
|
||||
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
|
||||
</args>
|
||||
<tags>
|
||||
<tag>${docker.image.version}</tag>
|
||||
<tag>latest</tag>
|
||||
</tags>
|
||||
</build>
|
||||
</image>
|
||||
</images>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-resources</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>copy-resources</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${basedir}/target/build/libs/</outputDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>libs</directory>
|
||||
<filtering>false</filtering>
|
||||
</resource>
|
||||
</resources>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
</project>
|
||||
@ -1,18 +0,0 @@
|
||||
FROM red/base-image:2.0.2
|
||||
|
||||
COPY "libs/pdftron/OCRModuleLinux.tar.gz" .
|
||||
RUN tar xvzf OCRModuleLinux.tar.gz
|
||||
RUN mkdir /OCRModule
|
||||
RUN mv Lib/* /OCRModule/
|
||||
|
||||
RUN apt-get -y update
|
||||
# Ghostscript somehow improves ocr quality using pdfton, do not remove!
|
||||
RUN apt-get -y install ghostscript
|
||||
|
||||
ARG PLATFORM_JAR
|
||||
|
||||
ENV PLATFORM_JAR ${PLATFORM_JAR}
|
||||
|
||||
ENV USES_ELASTICSEARCH false
|
||||
|
||||
COPY ["${PLATFORM_JAR}", "/"]
|
||||
@ -1,33 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>ocr-service-v1</artifactId>
|
||||
<version>3.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>ocr-service-api-v1</artifactId>
|
||||
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<!-- This dependency contains annotations that are used in specifying REST endpoints. -->
|
||||
<!-- It is optional since not all users of this API might use Feign. -->
|
||||
<groupId>io.github.openfeign</groupId>
|
||||
<artifactId>feign-core</artifactId>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
<!-- spring -->
|
||||
<dependency>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-web</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
</dependencies>
|
||||
</project>
|
||||
@ -1,16 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class DocumentRequest {
|
||||
|
||||
protected String dossierId;
|
||||
protected String fileId;
|
||||
|
||||
}
|
||||
@ -1,17 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.api.model;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class IdentityTest {
|
||||
|
||||
@Test
|
||||
public void mockTest() {
|
||||
|
||||
int i = 1;
|
||||
assertThat(i).isEqualTo(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
22
ocr-service-v1/ocr-service-api/build.gradle.kts
Normal file
22
ocr-service-v1/ocr-service-api/build.gradle.kts
Normal file
@ -0,0 +1,22 @@
|
||||
plugins {
|
||||
`maven-publish`
|
||||
id("com.iqser.red.service.java-conventions")
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
create<MavenPublication>(name) {
|
||||
from(components["java"])
|
||||
}
|
||||
}
|
||||
repositories {
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class DocumentRequest {
|
||||
|
||||
protected String dossierId;
|
||||
protected String fileId;
|
||||
protected boolean removeWatermark;
|
||||
|
||||
public DocumentRequest(String dossierId, String fileId) {
|
||||
|
||||
this.dossierId = dossierId;
|
||||
this.fileId = fileId;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.api.model;
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -6,14 +6,15 @@ import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public class OCRStatusUpdateResponse {
|
||||
|
||||
private String fileId;
|
||||
private int numberOfPagesToOCR;
|
||||
private int numberOfOCRedPages;
|
||||
private boolean ocrFinished;
|
||||
private boolean ocrStarted;
|
||||
|
||||
}
|
||||
31
ocr-service-v1/ocr-service-processor/build.gradle.kts
Normal file
31
ocr-service-v1/ocr-service-processor/build.gradle.kts
Normal file
@ -0,0 +1,31 @@
|
||||
plugins {
|
||||
id("com.iqser.red.service.java-conventions")
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
|
||||
configurations {
|
||||
all {
|
||||
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
|
||||
api("net.sourceforge.tess4j:tess4j:5.8.0")
|
||||
api("com.iqser.red.commons:metric-commons:2.1.0")
|
||||
api("com.iqser.red.commons:storage-commons:2.45.0")
|
||||
api("com.knecon.fforesight:tenant-commons:0.21.0")
|
||||
api("com.knecon.fforesight:lifecycle-commons:0.6.0")
|
||||
api("com.pdftron:PDFNet:10.5.0")
|
||||
api("org.apache.pdfbox:pdfbox:3.0.0")
|
||||
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
||||
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
||||
api("org.apache.commons:commons-math3:3.6.1")
|
||||
api("io.github.karols:hocr4j:0.2.0")
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.125.0")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
}
|
||||
@ -0,0 +1,26 @@
|
||||
package com.knecon.fforesight.service.ocr.processor;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
|
||||
@Configuration
|
||||
@ComponentScan
|
||||
@EnableConfigurationProperties(OcrServiceSettings.class)
|
||||
public class OcrServiceProcessorConfiguration {
|
||||
|
||||
@Bean
|
||||
@Autowired
|
||||
public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
||||
|
||||
return new ViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,49 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.initializer;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.sun.jna.NativeLibrary;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class NativeLibrariesInitializer {
|
||||
|
||||
@Value("${pdftron.license:}")
|
||||
private String pdftronLicense;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@PostConstruct
|
||||
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
|
||||
public void init() {
|
||||
|
||||
log.info("Initializing Native Libraries");
|
||||
log.info("Setting pdftron license: {}", pdftronLicense);
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
|
||||
log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
|
||||
log.info("Asserting Native Libraries loaded");
|
||||
|
||||
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
|
||||
assert leptonicaLib != null;
|
||||
log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
|
||||
}
|
||||
|
||||
try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) {
|
||||
assert tesseractLib != null;
|
||||
log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
public record ExtractedImage(
|
||||
int pageNumber, QuadPoint position, int height, int width, BufferedImage image, Matrix ctm, int numberOnPage, PDColorSpace colorSpace) implements UnprocessedImage {
|
||||
|
||||
@SneakyThrows
|
||||
public Pix asPix() {
|
||||
|
||||
BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this);
|
||||
ImageProcessingUtils.setAlphaChannelToWhite(image);
|
||||
return LeptUtils.convertImageToPix(image);
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getImageCoordinatesInInitialUserSpace() {
|
||||
|
||||
return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,61 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.Graphics;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
@Slf4j
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ExtractedOcrImage implements OcrImage {
|
||||
|
||||
int pageNumber;
|
||||
int numberOnPage;
|
||||
int originalHeight;
|
||||
int originalWidth;
|
||||
Matrix ctm;
|
||||
Pix pix;
|
||||
int height;
|
||||
int width;
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@Override
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
AffineTransform affineTransform = ctm.createAffineTransform();
|
||||
|
||||
affineTransform.scale((double) 1 / getWidth(), (double) 1 / getHeight());
|
||||
|
||||
AffineTransform deRotationMatrix = switch (360 - rotationDegrees) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, getHeight(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, getWidth(), getHeight());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, getWidth() - getHeight(), getHeight()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
affineTransform.concatenate(deRotationMatrix);
|
||||
AffineTransform mirrorTransform = new AffineTransform(1, 0, 0, -1, 0, getHeight());
|
||||
affineTransform.concatenate(mirrorTransform);
|
||||
return affineTransform;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public final class FontMetrics {
|
||||
|
||||
float descent; // descent is the part of the text which is below the baseline, e.g. the lower curve of a 'g'. https://en.wikipedia.org/wiki/Body_height_(typography)
|
||||
float fontSize;
|
||||
float heightScaling;
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
/**
 * Immutable pair of a height and a descent value.
 *
 * @param height  the height
 * @param descent the descent
 */
public record HeightAndDescent(float height, float descent) {
}
|
||||
@ -0,0 +1,127 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
|
||||
public interface OcrImage {
|
||||
|
||||
/**
|
||||
* Retrieves the page number where the OCR image is located. It uses 1-based-index.
|
||||
*
|
||||
* @return The page number where the OCR image is located.
|
||||
*/
|
||||
int getPageNumber();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the number of this image on the page. For full page images this always returns 0.
|
||||
*
|
||||
* @return The number of this image on the page.
|
||||
*/
|
||||
int getNumberOnPage();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the height of the original image (not necessarily in pdf coordinates).
|
||||
*
|
||||
* @return the height of the image
|
||||
*/
|
||||
int getHeight();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the width of the original image (not necessarily in pdf coordinates).
|
||||
*
|
||||
* @return the width of the image
|
||||
*/
|
||||
int getWidth();
|
||||
|
||||
|
||||
/**
|
||||
* Gets the outer boundary of the image in image coordinates. (0,0) is upper left corner. And height and width is the image size
|
||||
*
|
||||
* @return the QuadPoint representing the size of the image
|
||||
*/
|
||||
default QuadPoint getImageBounds() {
|
||||
|
||||
// cannot be solved with a nice rotation matrix. After rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
|
||||
// So in order to mimic this behavior we need to start with (0,0) coordinates always.
|
||||
if (getRotationDegrees() == 90 || getRotationDegrees() == 270) {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getWidth()), new Point2D.Double(getHeight(), getWidth()), new Point2D.Double(getHeight(), 0));
|
||||
} else {
|
||||
return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, getHeight()), new Point2D.Double(getWidth(), getHeight()), new Point2D.Double(getWidth(), 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the image coordinates in the PDF by transforming the image bounds using the current transformation matrix (CTM).
|
||||
*
|
||||
* @return The image coordinates as a QuadPoint object.
|
||||
*/
|
||||
default QuadPoint getImageCoordinatesInInitialUserSpace() {
|
||||
|
||||
return getImageBounds().getTransformed(getImageCTM());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the rotation degree of the OCR image.
|
||||
*
|
||||
* @return The rotation degree of the OCR image.
|
||||
*/
|
||||
int getRotationDegrees();
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the optimal page segmentation mode for the OCR image.
|
||||
*
|
||||
* @return The optimal page segmentation mode.
|
||||
*/
|
||||
default int getOptimalPageSegmentationMode() {
|
||||
|
||||
if (getWidth() < 200 || getHeight() < 200) {
|
||||
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
|
||||
}
|
||||
return ITessAPI.TessPageSegMode.PSM_AUTO;
|
||||
} // TODO: evaluate if PSM can be dynamically chosen to increase performance
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the buffered image associated with the OCR image.
|
||||
*
|
||||
* @return The BufferedImage object representing the image.
|
||||
*/
|
||||
Pix getPix();
|
||||
|
||||
|
||||
default int getDpi() {
|
||||
|
||||
return PdfDpiCalculator.calculateDpi(getImageBounds(), getImageCTM(), getWidth());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Retrieves the current transformation matrix (CTM). The CTM may be used to transform the image coordinates to Initial User Space coordinates.
|
||||
*
|
||||
* @return The AffineTransform representing the current transformation matrix.
|
||||
*/
|
||||
AffineTransform getImageCTM();
|
||||
|
||||
|
||||
default void destroyPix() {
|
||||
|
||||
LeptUtils.disposePix(getPix());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,23 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
|
||||
public record OcrResult(OcrImage image, String tesseractOutputFilePath) {
|
||||
|
||||
public static OcrResult create(OcrImage image, String tesseractResult) {
|
||||
|
||||
return new OcrResult(image, tesseractResult);
|
||||
}
|
||||
|
||||
|
||||
public List<Word> getAllWords() {
|
||||
|
||||
return HOcrPageParser.extractHocrPage(tesseractOutputFilePath).getAllWords();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,42 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
|
||||
public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, QuadPoint imageBoundingBox) {
|
||||
|
||||
public static OcrResultToWrite fromFontStyleDetectionModel(FontStyleDetectionModel fontStyleDetectionModel) {
|
||||
|
||||
return new OcrResultToWrite(fontStyleDetectionModel.getTextPositionInImages(), fontStyleDetectionModel.getImageBounds());
|
||||
}
|
||||
|
||||
|
||||
public static Map<Integer, List<OcrResultToWrite>> buildOcrResultsToWrite(List<OcrResult> ocrResults, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
return ocrResults.stream()
|
||||
.collect(Collectors.groupingBy(ocrResult -> ocrResult.image().getPageNumber()))
|
||||
.entrySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(Map.Entry::getKey,
|
||||
entry -> entry.getValue()
|
||||
.stream()
|
||||
.map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
|
||||
.toList()));
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
return ocrResult.getAllWords()
|
||||
.stream()
|
||||
.filter(word -> !word.isBlank())
|
||||
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,42 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
|
||||
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
|
||||
pageNum,
|
||||
page.getRotation());
|
||||
}
|
||||
|
||||
|
||||
public double height() {
|
||||
|
||||
return mediabox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double width() {
|
||||
|
||||
return mediabox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double minX() {
|
||||
|
||||
return mediabox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double minY() {
|
||||
|
||||
return mediabox.getY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,117 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import io.github.karols.hocr4j.Bounds;
|
||||
|
||||
public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
||||
|
||||
/*
|
||||
B _____ C
|
||||
| |
|
||||
A|_____|D
|
||||
*/
|
||||
|
||||
|
||||
public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {
|
||||
|
||||
return new QuadPoint(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()),
|
||||
new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()));
|
||||
}
|
||||
|
||||
|
||||
public static QuadPoint fromBounds(Bounds bounds) {
|
||||
|
||||
return new QuadPoint(new Point2D.Double(bounds.getLeft(), bounds.getBottom()),
|
||||
new Point2D.Double(bounds.getLeft(), bounds.getTop()),
|
||||
new Point2D.Double(bounds.getRight(), bounds.getTop()),
|
||||
new Point2D.Double(bounds.getRight(), bounds.getBottom()));
|
||||
}
|
||||
|
||||
public Rectangle2D getBounds2D() {
|
||||
|
||||
double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX());
|
||||
double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY());
|
||||
double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX());
|
||||
double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY());
|
||||
|
||||
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getTransformed(AffineTransform at) {
|
||||
|
||||
return new QuadPoint(at.transform(a, null), at.transform(b, null), at.transform(c, null), at.transform(d, null));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the given QuadPoint aligns with this QuadPoint within a given threshold.
|
||||
* It does os by trying every possible combination of aligning sides. It starts with the most likely combination of ab and cd.
|
||||
*
|
||||
* @param other The QuadPoint to compare with.
|
||||
* @param threshold The maximum distance allowed for alignment.
|
||||
* @return True if the QuadPoints align within the threshold, false otherwise.
|
||||
*/
|
||||
public boolean aligns(QuadPoint other, double threshold) {
|
||||
|
||||
Line2D ab = new Line2D.Double(a, b);
|
||||
Line2D bc = new Line2D.Double(b, c);
|
||||
Line2D cd = new Line2D.Double(c, d);
|
||||
Line2D da = new Line2D.Double(d, a);
|
||||
|
||||
Line2D ab2 = new Line2D.Double(other.a, other.b);
|
||||
Line2D bc2 = new Line2D.Double(other.b, other.c);
|
||||
Line2D cd2 = new Line2D.Double(other.c, other.d);
|
||||
Line2D da2 = new Line2D.Double(other.d, other.a);
|
||||
|
||||
List<Line2D> lines = List.of(ab, cd, bc, da);
|
||||
List<Line2D> lines2 = List.of(cd2, ab2, bc2, da2);
|
||||
return lines.stream().anyMatch(line -> lines2.stream().anyMatch(line2 -> aligns(line, line2, threshold)));
|
||||
}
|
||||
|
||||
|
||||
private static boolean aligns(Line2D a, Line2D b, double threshold) {
|
||||
|
||||
return aligns(a.getP1(), a.getP2(), b.getP1(), b.getP2(), threshold);
|
||||
}
|
||||
|
||||
|
||||
private static boolean aligns(Point2D a, Point2D b, Point2D a2, Point2D b2, double threshold) {
|
||||
|
||||
if (a.distance(a2) < threshold && b.distance(b2) < threshold) {
|
||||
return true;
|
||||
}
|
||||
return a.distance(b2) < threshold && b.distance(a2) < threshold;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format("A:(%.2f, %.2f) | B:(%.2f, %.2f) | C:(%.2f, %.2f) | D:(%.2f, %.2f)",
|
||||
a().getX(),
|
||||
a().getY(),
|
||||
b().getX(),
|
||||
b().getY(),
|
||||
c().getX(),
|
||||
c().getY(),
|
||||
d().getX(),
|
||||
d().getY());
|
||||
}
|
||||
|
||||
|
||||
public double size() {
|
||||
|
||||
return a().distance(b()) * a().distance(d());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage {
|
||||
|
||||
@Override
|
||||
public Pix asPix() {
|
||||
|
||||
return Leptonica1.pixRead(absoluteFilePath);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,82 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class RenderedPageOcrImage implements OcrImage {
|
||||
|
||||
int height;
|
||||
int width;
|
||||
PageInformation pageInformation;
|
||||
Pix pix;
|
||||
int rotationDegrees;
|
||||
|
||||
|
||||
@Override
|
||||
public AffineTransform getImageCTM() {
|
||||
|
||||
double scalingFactor = calculateScalingFactor();
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
AffineTransform rotationMatrix = switch (calculateTotalRotation()) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||
// scaling -> mirror -> rotation
|
||||
AffineTransform resultMatrix = new AffineTransform();
|
||||
|
||||
resultMatrix.concatenate(rotationMatrix);
|
||||
resultMatrix.concatenate(mirrorMatrix);
|
||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||
return resultMatrix;
|
||||
}
|
||||
|
||||
|
||||
private int calculateTotalRotation() {
|
||||
|
||||
return (pageInformation.rotationDegrees() + (360 - rotationDegrees)) % 360;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPageNumber() {
|
||||
|
||||
return pageInformation.number();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNumberOnPage() {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
private double calculateScalingFactor() {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / width;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,135 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TextPositionInImage {
|
||||
|
||||
final QuadPoint position;
|
||||
final String text;
|
||||
final AffineTransform imageCTM;
|
||||
|
||||
@Setter
|
||||
FontMetricsFactory fontMetricsFactory;
|
||||
@Setter
|
||||
FontStyle fontStyle;
|
||||
|
||||
|
||||
public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory, FontStyle fontStyle) {
|
||||
|
||||
this.position = QuadPoint.fromBounds(word.getBounds());
|
||||
this.text = word.getText();
|
||||
this.imageCTM = imageCTM;
|
||||
this.fontMetricsFactory = fontMetricsFactory;
|
||||
this.fontStyle = fontStyle;
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getTransformedTextBBox() {
|
||||
|
||||
return position.getTransformed(imageCTM);
|
||||
}
|
||||
|
||||
|
||||
public PDFont getFont() {
|
||||
|
||||
return fontMetricsFactory.getFont();
|
||||
}
|
||||
|
||||
|
||||
public Matrix getTextMatrix() {
|
||||
|
||||
FontMetrics metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
|
||||
|
||||
// Matrix multiplication is from right to left:
|
||||
// convert to image coords -> subtract descent -> scale height -> reverse imageCTM scaling -> translate to coordinates in image -> convert to pdf coords
|
||||
// width must not be set, since it is scaled with the fontsize attribute
|
||||
|
||||
AffineTransform ctm = new AffineTransform();
|
||||
ctm.concatenate(imageCTM);
|
||||
ctm.translate(position.a().getX(), position.a().getY());
|
||||
ctm.scale(getWidth() / getTransformedWidth(),
|
||||
getHeight() / getTransformedHeight()); // scale with transformation coefficient, such that fontsize may be set with transformed width.
|
||||
ctm.scale(1, metrics.getHeightScaling());
|
||||
ctm.translate(0, metrics.getDescent());
|
||||
ctm.concatenate(new AffineTransform(1, 0, 0, -1, 0, 0)); // start in image coordinates, with (0,0) being top left and negative height.
|
||||
|
||||
return new Matrix(ctm);
|
||||
}
|
||||
|
||||
|
||||
public double getFontSize() {
|
||||
|
||||
return fontMetricsFactory.calculateFontSize(text, getTransformedWidth());
|
||||
}
|
||||
|
||||
|
||||
public double getTransformedWidth() {
|
||||
|
||||
return transformedA().distance(transformedD());
|
||||
}
|
||||
|
||||
|
||||
public double getTransformedHeight() {
|
||||
|
||||
return transformedA().distance(transformedB());
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return position.a().distance(position.d());
|
||||
}
|
||||
|
||||
|
||||
public double getTextHeight() {
|
||||
|
||||
var metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
|
||||
return fontMetricsFactory.calculateFontSize(text, getTransformedWidth()) * metrics.getHeightScaling();
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return position.a().distance(position.b());
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedA() {
|
||||
|
||||
return imageCTM.transform(position.a(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedB() {
|
||||
|
||||
return imageCTM.transform(position.b(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedC() {
|
||||
|
||||
return imageCTM.transform(position.c(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedD() {
|
||||
|
||||
return imageCTM.transform(position.d(), null);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,9 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
public interface UnprocessedImage {
|
||||
|
||||
Pix asPix();
|
||||
|
||||
}
|
||||
@ -0,0 +1,58 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public final class FontStyleDetectionModel {
|
||||
|
||||
QuadPoint imageBounds;
|
||||
Pix image;
|
||||
List<TextPositionAndWordImage> textPositionsAndWordImages;
|
||||
|
||||
|
||||
public static FontStyleDetectionModel fromOcrResult(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory, OcrServiceSettings settings) {
|
||||
|
||||
var image = Leptonica1.pixRead(ocrResult.tesseractOutputFilePath() + ".tiff");
|
||||
var wordPixes = ocrResult.getAllWords().stream().filter(word -> !word.isBlank()).map(word -> TextPositionAndWordImage.create(ocrResult.image().getImageCTM(), word, image, settings, fontMetricsFactory)).toList();
|
||||
|
||||
return new FontStyleDetectionModel(ocrResult.image().getImageCoordinatesInInitialUserSpace(), image, wordPixes);
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionInImage> getTextPositionInImages() {
|
||||
|
||||
return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getTextPositionInImage).toList();
|
||||
}
|
||||
|
||||
|
||||
public List<WordImage> getWordImages() {
|
||||
|
||||
return textPositionsAndWordImages.stream().map(TextPositionAndWordImage::getWordImage).toList();
|
||||
}
|
||||
|
||||
|
||||
public void dispose() {
|
||||
|
||||
LeptUtils.disposePix(image);
|
||||
getWordImages().forEach(WordImage::dispose);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,52 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
import lombok.Getter;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Getter
|
||||
public final class TextPositionAndWordImage implements Clusterable {
|
||||
|
||||
private final TextPositionInImage textPositionInImage;
|
||||
private final WordImage wordImage;
|
||||
|
||||
|
||||
public TextPositionAndWordImage(TextPositionInImage textPositionInImage, WordImage wordImage) {
|
||||
|
||||
this.textPositionInImage = textPositionInImage;
|
||||
this.wordImage = wordImage;
|
||||
}
|
||||
|
||||
|
||||
public static TextPositionAndWordImage create(AffineTransform imageCTM, Word word, Pix image, OcrServiceSettings settings, FontMetricsFactory fontMetricsFactory) {
|
||||
|
||||
TextPositionInImage textPositionInImage = new TextPositionInImage(word, imageCTM, fontMetricsFactory, FontStyle.REGULAR);
|
||||
WordImage wordImage = new WordImage(textPositionInImage.getTextHeight(), word, image, settings);
|
||||
return new TextPositionAndWordImage(textPositionInImage, wordImage);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double[] getPoint() {
|
||||
|
||||
return wordImage.getPoint();
|
||||
}
|
||||
|
||||
|
||||
public double getTextHeight() {
|
||||
|
||||
return wordImage.getTextHeight();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,71 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.StrokeWidthCalculator;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import io.github.karols.hocr4j.Word;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Box;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class WordImage implements Clusterable {
|
||||
|
||||
Pix image;
|
||||
String text;
|
||||
double textHeight;
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
public WordImage(double textHeight, Word word, Pix originalImage, OcrServiceSettings settings) {
|
||||
|
||||
Box box = new Box(word.getBounds().getLeft(), word.getBounds().getTop(), word.getBounds().getWidth(), word.getBounds().getHeight(), 1);
|
||||
this.image = Leptonica1.pixClipRectangle(originalImage, box, null);
|
||||
box.clear();
|
||||
this.text = word.getText();
|
||||
this.textHeight = textHeight;
|
||||
this.settings = settings;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasLargerStrokeWidth(double strokeWidth) {
|
||||
|
||||
int roundedStrokeWidth = (int) Math.round(strokeWidth);
|
||||
double roundingError = (roundedStrokeWidth - strokeWidth) / strokeWidth;
|
||||
|
||||
// add 1 to open a bit bigger than the estimated regular stroke width
|
||||
Pix openedPix = Leptonica1.pixOpenBrick(null, image, roundedStrokeWidth + 1, roundedStrokeWidth + 1);
|
||||
|
||||
double openedPixelDensity = ImageProcessingUtils.calculatePixelDensity(openedPix);
|
||||
|
||||
double pixelDensity = ImageProcessingUtils.calculatePixelDensity(image);
|
||||
|
||||
LeptUtils.disposePix(openedPix);
|
||||
|
||||
return (openedPixelDensity * (1 + roundingError)) / pixelDensity > (settings.getBoldThreshold());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double[] getPoint() {
|
||||
|
||||
return new double[]{textHeight};
|
||||
}
|
||||
|
||||
|
||||
public void dispose() {
|
||||
|
||||
LeptUtils.disposePix(image);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,66 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class FileStorageService {
|
||||
|
||||
private final StorageService storageService;
|
||||
|
||||
|
||||
public static String getStorageId(String dossierId, String fileId, FileType fileType) {
|
||||
|
||||
return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
|
||||
}
|
||||
|
||||
public boolean untouchedFileExists(String dossierId, String fileId) {
|
||||
|
||||
return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {
|
||||
|
||||
try (var in = new FileInputStream(documentFile)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), in);
|
||||
}
|
||||
try (var in = new FileInputStream(viewerDocumentFile)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), in);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {
|
||||
|
||||
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile);
|
||||
if (storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT))) {
|
||||
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile);
|
||||
} else {
|
||||
Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
if (!untouchedFileExists(dossierId, fileId)) {
|
||||
try (var in = new FileInputStream(documentFile)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), in);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,169 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
|
||||
public class GhostScriptService {
|
||||
|
||||
static String FORMAT = ".tiff";
|
||||
static String DEVICE = "tiffgray";
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void renderPagesAsImagesBatchedAndAddToQueue(List<Integer> stitchedPageNumbers,
|
||||
String documentAbsolutePath,
|
||||
Path tmpImageDir,
|
||||
PDDocument document,
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue,
|
||||
Statistics stats) {
|
||||
|
||||
BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
|
||||
BlockingQueueFiller asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
|
||||
asyncTransferThread.start();
|
||||
int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers,
|
||||
numOfProcesses,
|
||||
256 * numOfProcesses); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
|
||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||
long timestamp = System.currentTimeMillis();
|
||||
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
|
||||
|
||||
log.info("Batch {}: Running {} gs processes with ({}) pages each",
|
||||
batchIdx,
|
||||
processInfos.size(),
|
||||
processInfos.stream().map(info -> info.stitchedPageNumbers().size()).map(String::valueOf).collect(Collectors.joining(", ")));
|
||||
|
||||
int finalBatchIdx = batchIdx;
|
||||
List<Process> processes = processInfos.stream()
|
||||
.parallel()
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath))
|
||||
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
|
||||
.map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue))
|
||||
.toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
for (Process process : processes) {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
|
||||
log.info("Batch {}: Ghostscript processes finished with exit codes " + processExitCodes, batchIdx);
|
||||
}
|
||||
asyncTransferThread.setAllImagesQueued(true);
|
||||
}
|
||||
|
||||
|
||||
private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {
|
||||
|
||||
// GhostScript command line can only handle so many page numbers at once, so we split it into batches
|
||||
int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);
|
||||
|
||||
log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
|
||||
List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream().sorted().toList(), processCount, batchCount);
|
||||
|
||||
for (var batch : batchedBalancedSublist) {
|
||||
List<ProcessInfo> processInfos = new ArrayList<>(processCount);
|
||||
for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
|
||||
List<Integer> balancedPageNumbersSubList = batch.get(threadIdx);
|
||||
processInfos.add(new ProcessInfo(threadIdx, balancedPageNumbersSubList));
|
||||
}
|
||||
processInfoBatches.add(processInfos);
|
||||
}
|
||||
return processInfoBatches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath) {
|
||||
|
||||
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
|
||||
|
||||
Map<Integer, RenderedPageImageFile> fullPageImages = new HashMap<>();
|
||||
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
|
||||
Integer pageNumber = stitchedImagePageIndices.get(i);
|
||||
fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
}
|
||||
|
||||
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
|
||||
|
||||
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
|
||||
}
|
||||
|
||||
|
||||
private String[] buildCmdArgs(List<Integer> stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) {
|
||||
|
||||
StringBuilder sPageList = new StringBuilder();
|
||||
int i = 1;
|
||||
for (Integer integer : stitchedImagePageIndices) {
|
||||
sPageList.append(integer);
|
||||
if (i < stitchedImagePageIndices.size()) {
|
||||
sPageList.append(",");
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
String[] cmdArgs = new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
return cmdArgs;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue) {
|
||||
|
||||
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
|
||||
InputStream stdOut = p.getInputStream();
|
||||
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue);
|
||||
InputStream stdError = p.getErrorStream();
|
||||
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError);
|
||||
|
||||
stdOutLogger.start();
|
||||
stdErrorLogger.start();
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, RenderedPageImageFile> renderedPageImageFiles) {
|
||||
|
||||
}
|
||||
|
||||
private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.util.List;
|
||||
|
||||
import io.github.karols.hocr4j.Page;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class HOcrPageParser {
|
||||
|
||||
@SneakyThrows
|
||||
public Page extractHocrPage(String tesseractOutputFileName) {
|
||||
|
||||
String hOcrString;
|
||||
try (var hocrIn = new FileInputStream(tesseractOutputFileName + ".hocr")) {
|
||||
hOcrString = new String(hocrIn.readAllBytes());
|
||||
}
|
||||
return Page.fromHocr(List.of(hOcrString)).get(0);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@Service
|
||||
public interface IOcrMessageSender {
|
||||
|
||||
void sendUpdate(String fileId, int finishedImages, int totalImages);
|
||||
|
||||
void sendOCRStarted(String fileId);
|
||||
|
||||
void sendOcrFinished(String fileId, int totalImages);
|
||||
|
||||
void sendOcrResponse(String dossierId, String fileId);
|
||||
|
||||
}
|
||||
@ -0,0 +1,96 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDFStreamEngine;
|
||||
import org.apache.pdfbox.contentstream.operator.DrawObject;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Restore;
|
||||
import org.apache.pdfbox.contentstream.operator.state.Save;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
|
||||
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Getter
|
||||
public class ImageStreamEngine extends PDFStreamEngine {
|
||||
|
||||
private List<ExtractedImage> imagesOnCurrentPage;
|
||||
private OcrServiceSettings settings;
|
||||
private int pageNum;
|
||||
|
||||
|
||||
public ImageStreamEngine(OcrServiceSettings settings) {
|
||||
|
||||
this.settings = settings;
|
||||
// preparing PDFStreamEngine
|
||||
addOperator(new Concatenate(this));
|
||||
addOperator(new DrawObject(this));
|
||||
addOperator(new SetGraphicsStateParameters(this));
|
||||
addOperator(new Save(this));
|
||||
addOperator(new Restore(this));
|
||||
addOperator(new SetMatrix(this));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
|
||||
|
||||
String operation = operator.getName();
|
||||
if ("Do".equals(operation)) {
|
||||
COSName objectName = (COSName) operands.get(0);
|
||||
// get the PDF object
|
||||
PDXObject xobject = getResources().getXObject(objectName);
|
||||
|
||||
// check if the object is an image object
|
||||
if (xobject instanceof PDImageXObject imageXObject) {
|
||||
|
||||
if (imageXObject.getWidth() < settings.getMinImageWidth() || imageXObject.getHeight() < settings.getMinImageHeight()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();
|
||||
this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
|
||||
QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
|
||||
imageXObject.getHeight(),
|
||||
imageXObject.getWidth(),
|
||||
imageXObject.getImage(),
|
||||
imageCTM,
|
||||
imagesOnCurrentPage.size(),
|
||||
imageXObject.getColorSpace()));
|
||||
|
||||
} else if (xobject instanceof PDFormXObject) {
|
||||
PDFormXObject form = (PDFormXObject) xobject;
|
||||
showForm(form);
|
||||
}
|
||||
} else {
|
||||
super.processOperator(operator, operands);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void processPage(int pageNum, PDPage page) {
|
||||
|
||||
this.pageNum = pageNum;
|
||||
this.imagesOnCurrentPage = new LinkedList<>();
|
||||
super.processPage(page);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,176 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OCRService {
|
||||
|
||||
FileStorageService fileStorageService;
|
||||
OcrServiceSettings settings;
|
||||
IOcrMessageSender ocrMessageSender;
|
||||
WatermarkRemovalService watermarkRemovalService;
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
OcrResultWriter ocrResultWriter;
|
||||
GhostScriptService ghostScriptService;
|
||||
FontStyleDetector boldDetector;
|
||||
ObservationRegistry registry;
|
||||
|
||||
|
||||
/**
|
||||
* Starts the OCR-Process: Collecting images (via threads),
|
||||
* looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
|
||||
* perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
|
||||
*
|
||||
* @param dossierId Id of dossier
|
||||
* @param fileId Id of file
|
||||
* @param tmpDir working directory for all files
|
||||
* @param documentFile the file to perform ocr on, results are written invisibly
|
||||
* @param viewerDocumentFile debugging file, results are written visibly in an optional content group
|
||||
*/
|
||||
@Observed(name = "OCRService", contextualName = "run-ocr-on-document")
|
||||
@SneakyThrows
|
||||
public void runOcrOnDocument(String dossierId, String fileId, boolean removeWatermark, Path tmpDir, File documentFile, File viewerDocumentFile) {
|
||||
|
||||
if (removeWatermark) {
|
||||
removeWatermarkIfEnabled(documentFile);
|
||||
}
|
||||
removeInvisibleElements(documentFile);
|
||||
|
||||
log.info("Starting OCR for file {}", fileId);
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId);
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||
log.info("Runtime breakdown: {}", stats);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeInvisibleElements(File originFile) {
|
||||
|
||||
Path tmpFile = Files.createTempFile("invisibleElements", ".pdf");
|
||||
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false, false);
|
||||
}
|
||||
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeWatermarkIfEnabled(File originFile) {
|
||||
|
||||
Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf");
|
||||
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
|
||||
watermarkRemovalService.removeWatermarks(in, out);
|
||||
}
|
||||
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) {
|
||||
|
||||
long timestamp;
|
||||
|
||||
Path tmpImageDir = tmpDir.resolve("images");
|
||||
Path tesseractOutputDir = tmpDir.resolve("tesseract_output");
|
||||
|
||||
tesseractOutputDir.toFile().mkdirs();
|
||||
tmpImageDir.toFile().mkdirs();
|
||||
|
||||
Statistics stats;
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) {
|
||||
OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);
|
||||
|
||||
int numberOfExtractThreads = Math.min(settings.getImageExtractThreadCount(), document.getNumberOfPages());
|
||||
int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
|
||||
stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);
|
||||
|
||||
BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));
|
||||
|
||||
OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
|
||||
documentFile,
|
||||
tmpImageDir,
|
||||
numberOfExtractThreads,
|
||||
ghostScriptService,
|
||||
ocrImageQueue,
|
||||
logger,
|
||||
settings,
|
||||
stats);
|
||||
ocrImageFactory.start();
|
||||
|
||||
List<OcrResult> ocrResults = new LinkedList<>();
|
||||
List<OCRThread> ocrThreads = IntStream.range(0, numberOfOcrThreads)
|
||||
.boxed()
|
||||
.map(id -> new OCRThread(id, ocrImageQueue, tesseractOutputDir, ocrResults, logger, stats, settings))
|
||||
.peek(Thread::start)
|
||||
.toList();
|
||||
log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());
|
||||
ocrImageFactory.join();
|
||||
log.info("Processed all images, interrupting ocr threads");
|
||||
|
||||
ocrThreads.forEach(Thread::interrupt);
|
||||
for (OCRThread ocrThread : ocrThreads) {
|
||||
ocrThread.join();
|
||||
}
|
||||
|
||||
log.info("Tesseract OCR has finished for file {} and dossier {}", fileId, dossierId);
|
||||
|
||||
timestamp = System.currentTimeMillis();
|
||||
Map<Integer, List<OcrResultToWrite>> imageWithTextPositionsPerPage = boldDetector.detectBold(ocrResults, document);
|
||||
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
timestamp = System.currentTimeMillis();
|
||||
ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage);
|
||||
|
||||
log.info("Saving document");
|
||||
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
logger.sendFinished();
|
||||
return stats;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,105 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrImageFactory {
|
||||
|
||||
PDDocument document;
|
||||
File documentFile;
|
||||
Path tmpImageDir;
|
||||
GhostScriptService ghostScriptService;
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue;
|
||||
ImageProcessingThread imageProcessingThread;
|
||||
BlockingQueue<OcrImage> imageOutputQueue;
|
||||
List<ImageExtractionThread> imageExtractionThreads;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
Statistics stats;
|
||||
|
||||
|
||||
public OcrImageFactory(PDDocument document,
|
||||
File documentFile,
|
||||
Path tmpImageDir,
|
||||
int numberOfThreads,
|
||||
GhostScriptService ghostScriptService,
|
||||
BlockingQueue<OcrImage> imageOcrQueue,
|
||||
OcrProgressLogger logger,
|
||||
OcrServiceSettings settings,
|
||||
Statistics stats) {
|
||||
|
||||
this.document = document;
|
||||
this.documentFile = documentFile;
|
||||
this.tmpImageDir = tmpImageDir;
|
||||
this.ghostScriptService = ghostScriptService;
|
||||
this.imageOutputQueue = imageOcrQueue;
|
||||
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity());
|
||||
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
|
||||
this.stats = stats;
|
||||
|
||||
this.imageExtractionThreads = new ArrayList<>(numberOfThreads);
|
||||
|
||||
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
|
||||
for (int i = 0; i < balancedPageNumbers.size(); i++) {
|
||||
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
|
||||
}
|
||||
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document);
|
||||
|
||||
log.info("Started {} image extraction threads, with ({}) pages each",
|
||||
imageExtractionThreads.size(),
|
||||
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
|
||||
}
|
||||
|
||||
|
||||
public void start() {
|
||||
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.start();
|
||||
}
|
||||
imageProcessingThread.start();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void join() {
|
||||
|
||||
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
|
||||
imageExtractionThread.join();
|
||||
}
|
||||
|
||||
if (!stitchedPageNumbers.isEmpty()) {
|
||||
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
|
||||
}
|
||||
|
||||
imageProcessingThread.setAllImagesExtracted(true);
|
||||
imageProcessingThread.interrupt();
|
||||
|
||||
imageProcessingThread.join();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,91 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrProgressLogger {
|
||||
|
||||
Set<ImageNumberWithPageNumber> imagesToProcess;
|
||||
Set<ImageNumberWithPageNumber> processedImages;
|
||||
IOcrMessageSender ocrMessageSender;
|
||||
|
||||
String fileId;
|
||||
|
||||
|
||||
public OcrProgressLogger(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId) {
|
||||
|
||||
this.ocrMessageSender = ocrMessageSender;
|
||||
this.fileId = fileId;
|
||||
this.imagesToProcess = Collections.synchronizedSet(new HashSet<>(totalPageCount));
|
||||
for (int i = 0; i < totalPageCount; i++) {
|
||||
imagesToProcess.add(new ImageNumberWithPageNumber(0, i + 1));
|
||||
}
|
||||
this.processedImages = Collections.synchronizedSet(new HashSet<>(totalPageCount));
|
||||
}
|
||||
|
||||
|
||||
public void logImageFinished(OcrImage image, int psm) {
|
||||
|
||||
this.processedImages.add(new ImageNumberWithPageNumber(image.getNumberOnPage(), image.getPageNumber()));
|
||||
|
||||
if (image instanceof ExtractedOcrImage) {
|
||||
log.info("{}/{}: Finished image {} on page {} with rotation {}, used PSM {}, quad-point: {}",
|
||||
processedImages.size(),
|
||||
imagesToProcess.size(),
|
||||
image.getNumberOnPage(),
|
||||
image.getPageNumber(),
|
||||
image.getRotationDegrees(),
|
||||
psm,
|
||||
image.getImageCoordinatesInInitialUserSpace());
|
||||
} else {
|
||||
log.info("{}/{}: Finished page {} as fully rendered page with rotation {}, used PSM {}",
|
||||
processedImages.size(),
|
||||
imagesToProcess.size(),
|
||||
image.getPageNumber(),
|
||||
image.getRotationDegrees(),
|
||||
psm);
|
||||
|
||||
}
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedImages.size(), this.imagesToProcess.size());
|
||||
}
|
||||
|
||||
|
||||
public void logPageSkipped(Integer pageIndex) {
|
||||
|
||||
var pageDummy = new ImageNumberWithPageNumber(0, pageIndex);
|
||||
this.imagesToProcess.remove(pageDummy);
|
||||
log.debug("{}/{}: No images to ocr on page {}", processedImages.size(), imagesToProcess.size(), pageIndex);
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedImages.size(), imagesToProcess.size());
|
||||
}
|
||||
|
||||
|
||||
public void addImagesToProcess(int pageNumber, int imageNumber) {
|
||||
|
||||
this.imagesToProcess.add(new ImageNumberWithPageNumber(imageNumber, pageNumber));
|
||||
}
|
||||
|
||||
|
||||
public void sendFinished() {
|
||||
|
||||
log.info("{}/{}: Finished OCR on all images", processedImages.size(), imagesToProcess.size());
|
||||
ocrMessageSender.sendOcrFinished(fileId, imagesToProcess.size());
|
||||
}
|
||||
|
||||
|
||||
private record ImageNumberWithPageNumber(int imageNumber, int pageNumber) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,251 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrResultWriter {
|
||||
|
||||
public static final Color REGULAR_TEXT_COLOR = Color.BLUE;
|
||||
public static final Color BOLD_TEXT_COLOR = Color.CYAN;
|
||||
|
||||
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED;
|
||||
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED;
|
||||
|
||||
ViewerDocumentService viewerDocumentService;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = new HashMap<>();
|
||||
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = new HashMap<>();
|
||||
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = new HashMap<>();
|
||||
|
||||
try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) {
|
||||
|
||||
for (Integer pageNumber : imagesWithResultsPerPage.keySet()) {
|
||||
|
||||
List<Rectangle2D> textBBoxes = getTextBBoxes(doc.getPage(pageNumber));
|
||||
|
||||
ocrVisualizationsOnPages.put(pageNumber - 1, createVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
|
||||
ocrTextDebugVisualizationsOnPages.put(pageNumber - 1, createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
|
||||
ocrBBoxDebugVisualizationsOnPages.put(pageNumber - 1, createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)));
|
||||
}
|
||||
}
|
||||
|
||||
Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
||||
|
||||
List<Visualizations> debugVisualizations = List.of(visualizations,
|
||||
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
|
||||
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
|
||||
|
||||
viewerDocumentService.addVisualizationsOnPage(document, document, List.of(visualizations));
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD")
|
||||
private List<Rectangle2D> getTextBBoxes(Page page) {
|
||||
|
||||
List<Rectangle2D> textBBoxes = new ArrayList<>();
|
||||
try (var textExtractor = new TextExtractor()) {
|
||||
textExtractor.begin(page);
|
||||
try {
|
||||
|
||||
for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = getNextLine(line)) {
|
||||
for (TextExtractor.Word word = line.getFirstWord(); word.isValid(); word = getNextWord(word)) {
|
||||
textBBoxes.add(Converter.toRectangle2D(word.getBBox()));
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.warn("Could not get word dimension, {}", e.getMessage());
|
||||
}
|
||||
return textBBoxes;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static TextExtractor.Word getNextWord(TextExtractor.Word word) {
|
||||
|
||||
TextExtractor.Word nextWord = word.getNextWord();
|
||||
word.close();
|
||||
return nextWord;
|
||||
}
|
||||
|
||||
|
||||
private static TextExtractor.Line getNextLine(TextExtractor.Line line) {
|
||||
|
||||
TextExtractor.Line newLine = line.getNextLine();
|
||||
line.close();
|
||||
return newLine;
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> ignoreZones) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream()
|
||||
.map(OcrResultToWrite::textPositionInImage)
|
||||
.flatMap(Collection::stream)
|
||||
.filter(word -> ignoreZones.stream()
|
||||
.noneMatch(ignoreZone -> word.getTransformedTextBBox().getBounds2D().intersects(ignoreZone)))
|
||||
.toList();
|
||||
|
||||
List<PlacedText> placedTexts = words.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
Color.BLACK,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.NEITHER)))
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {
|
||||
|
||||
List<TextPositionInImage> wordsToDraw = new ArrayList<>();
|
||||
List<TextPositionInImage> ignoredWords = new ArrayList<>();
|
||||
|
||||
for (OcrResultToWrite ocrResultToWrite : ocrResultsToWrite) {
|
||||
for (TextPositionInImage textPositionInImage : ocrResultToWrite.textPositionInImage()) {
|
||||
if (textBBoxes.stream()
|
||||
.anyMatch(ignoreZone -> textPositionInImage.getTransformedTextBBox().getBounds2D().intersects(ignoreZone))) {
|
||||
ignoredWords.add(textPositionInImage);
|
||||
} else {
|
||||
wordsToDraw.add(textPositionInImage);
|
||||
}
|
||||
}
|
||||
}
|
||||
Stream<PlacedText> placedTexts = wordsToDraw.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_COLOR : BOLD_TEXT_COLOR,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)));
|
||||
|
||||
Stream<PlacedText> placedTexts2 = ignoredWords.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_IN_IGNORE_ZONE : BOLD_TEXT_IN_IGNORE_ZONE,
|
||||
(float) word.getFontSize(),
|
||||
word.getFontMetricsFactory(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)));
|
||||
|
||||
return VisualizationsOnPage.builder()
|
||||
.placedTexts(Stream.of(placedTexts, placedTexts2)
|
||||
.flatMap(Function.identity())
|
||||
.toList())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream()
|
||||
.map(OcrResultToWrite::textPositionInImage)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
List<ColoredLine> coloredLines = Stream.concat(//
|
||||
words.stream()
|
||||
.map(TextPositionInImage::getTransformedTextBBox)
|
||||
.map(this::quadPointAsLines),//
|
||||
ocrResultsToWrite.stream()
|
||||
.map(OcrResultToWrite::imageBoundingBox)
|
||||
.map(this::createGrid)//
|
||||
)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
|
||||
}
|
||||
|
||||
|
||||
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<ColoredLine> createGrid(QuadPoint rect) {
|
||||
|
||||
List<ColoredLine> lines = new LinkedList<>(quadPointAsLines(rect));
|
||||
|
||||
int nRows = 8;
|
||||
int nCols = 8;
|
||||
|
||||
Point2D abStep = new Point2D.Double((rect.b().getX() - rect.a().getX()) / (nRows + 1), (rect.b().getY() - rect.a().getY()) / (nRows + 1));
|
||||
Point2D start = add(rect.a(), abStep);
|
||||
Point2D end = add(rect.d(), abStep);
|
||||
for (int row = 0; row < nRows; ++row) {
|
||||
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f));
|
||||
start = add(start, abStep);
|
||||
end = add(end, abStep);
|
||||
}
|
||||
Point2D adStep = new Point2D.Double((rect.d().getX() - rect.a().getX()) / (nCols + 1), (rect.d().getY() - rect.a().getY()) / (nCols + 1));
|
||||
start = add(rect.a(), adStep);
|
||||
end = add(rect.b(), adStep);
|
||||
for (int col = 0; col < nCols; ++col) {
|
||||
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f));
|
||||
start = add(start, adStep);
|
||||
end = add(end, adStep);
|
||||
}
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
|
||||
private Point2D add(Point2D a, Point2D b) {
|
||||
|
||||
return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,7 +1,10 @@
|
||||
package com.iqser.red.service.ocr.v1.server.utils;
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
@ -24,6 +27,14 @@ public final class OsUtils {
|
||||
return addBackSlashAtEnd(getTemporaryDirectory()) + addBackSlashAtEnd(SERVICE_NAME) + addBackSlashAtEnd(suffix) + addBackSlashAtEnd(fileId);
|
||||
}
|
||||
|
||||
public static File writeFileToTmpFolder(InputStream in, Path tmpDir) throws IOException {
|
||||
|
||||
File pdfFile = tmpDir.resolve("document.pdf").toFile();
|
||||
try (var fileOut = new FileOutputStream(pdfFile)) {
|
||||
fileOut.write(in.readAllBytes());
|
||||
}
|
||||
return pdfFile;
|
||||
}
|
||||
|
||||
private static boolean isWindows() {
|
||||
|
||||
@ -58,8 +69,10 @@ public final class OsUtils {
|
||||
return "/tmp";
|
||||
}
|
||||
|
||||
|
||||
public static String createTmpFileName(String filename, String suffix) {
|
||||
|
||||
return Path.of(OsUtils.getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf");
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,85 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class Statistics {
|
||||
|
||||
List<Long> imageExtraction;
|
||||
List<Long> tesseractDuration;
|
||||
AtomicLong pdf2ImgDuration;
|
||||
AtomicLong writingTextDuration;
|
||||
AtomicLong imageProcessingDuration;
|
||||
AtomicLong fontStyleDetectionDuration;
|
||||
|
||||
|
||||
public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {
|
||||
|
||||
this.imageExtraction = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfExtractThreads, 0L)));
|
||||
this.tesseractDuration = Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfOcrThreads, 0L)));
|
||||
this.fontStyleDetectionDuration = new AtomicLong(0);
|
||||
this.pdf2ImgDuration = new AtomicLong(0);
|
||||
this.writingTextDuration = new AtomicLong(0);
|
||||
this.imageProcessingDuration = new AtomicLong(0);
|
||||
}
|
||||
|
||||
|
||||
public void increaseImageExtraction(int threadId, long duration) {
|
||||
|
||||
imageExtraction.set(threadId, imageExtraction.get(threadId) + duration);
|
||||
}
|
||||
|
||||
|
||||
public void increaseImageProcessing(long duration) {
|
||||
|
||||
imageProcessingDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
public void increaseTesseractDuration(int threadId, long duration) {
|
||||
|
||||
tesseractDuration.set(threadId, tesseractDuration.get(threadId) + duration);
|
||||
}
|
||||
|
||||
|
||||
public void increasePDF2ImgDuration(long duration) {
|
||||
|
||||
pdf2ImgDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
public void increaseWritingTextDuration(long duration) {
|
||||
|
||||
writingTextDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
public void increaseFontStyleDetectionDuration(long duration) {
|
||||
|
||||
fontStyleDetectionDuration.addAndGet(duration);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.format(
|
||||
"imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s, FontstyleDetection=%.2f s",
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) imageExtraction.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).average().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).max().orElse(0) / 1000),
|
||||
((float) tesseractDuration.stream().mapToLong(Long::longValue).min().orElse(0) / 1000),
|
||||
(float) imageProcessingDuration.get() / 1000,
|
||||
(float) pdf2ImgDuration.get() / 1000,
|
||||
(float) writingTextDuration.get() / 1000,
|
||||
(float) fontStyleDetectionDuration.get() / 1000);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,43 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
public interface FontMetricsFactory extends EmbeddableFont {
|
||||
|
||||
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {
|
||||
|
||||
HeightAndDescent heightAndDescent = calculateHeightAndDescent(text);
|
||||
float fontSize = calculateFontSize(text, textWidth);
|
||||
float heightScaling = (float) ((textHeight / (heightAndDescent.height() - heightAndDescent.descent())) * 1000) / fontSize;
|
||||
|
||||
return new FontMetrics((heightAndDescent.descent() / 1000) * fontSize, fontSize, heightScaling);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default float calculateFontSize(String text, double textWidth) {
|
||||
|
||||
float width;
|
||||
try {
|
||||
width = getFont().getStringWidth(text);
|
||||
} catch (IllegalArgumentException e) {
|
||||
// this means, the font has no glyph for this character
|
||||
width = getFont().getAverageFontWidth() * text.length();
|
||||
}
|
||||
return (float) (textWidth / width) * 1000;
|
||||
}
|
||||
|
||||
|
||||
PDFont getFont();
|
||||
|
||||
|
||||
HeightAndDescent calculateHeightAndDescent(String text);
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
/**
 * Font styles that can be assigned to a recognized word.
 */
public enum FontStyle {
    REGULAR,
    BOLD,
    ITALIC
}
|
||||
@ -0,0 +1,140 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.fonts;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.fontbox.ttf.GlyphData;
|
||||
import org.apache.fontbox.ttf.TTFParser;
|
||||
import org.apache.fontbox.ttf.TrueTypeFont;
|
||||
import org.apache.pdfbox.io.RandomAccessReadBuffer;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType0Font;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class Type0FontMetricsFactory implements FontMetricsFactory {
|
||||
|
||||
private final String resourcePath;
|
||||
private PDType0Font type0Font;
|
||||
private TrueTypeFont trueTypeFont;
|
||||
private PDDocument documentThisIsEmbeddedIn;
|
||||
|
||||
// for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
|
||||
private static final Set<Integer> slashGlyphIds = Set.of(18, 63);
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Type0FontMetricsFactory regular(PDDocument document) {
|
||||
|
||||
String resourcePath = "fonts/cmu-regular.ttf";
|
||||
return createFromResourcePath(resourcePath, document);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Type0FontMetricsFactory bold(PDDocument document) {
|
||||
|
||||
String resourcePath = "fonts/cmu-bold.ttf";
|
||||
return createFromResourcePath(resourcePath, document);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@SuppressWarnings("PMD.CloseResource")
|
||||
private static TrueTypeFont readFromResourcePath(String resourcePath) {
|
||||
|
||||
// The ttf is closed with the document, see PDType0Font line 134
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath); var buffer = new RandomAccessReadBuffer(in)) {
|
||||
return new TTFParser().parse(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@SuppressWarnings("PMD.CloseResource")
|
||||
private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {
|
||||
|
||||
TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
|
||||
// since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
|
||||
return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support)
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public HeightAndDescent calculateHeightAndDescent(String text) {
|
||||
|
||||
byte[] bytes;
|
||||
try {
|
||||
bytes = type0Font.encode(text);
|
||||
} catch (IllegalArgumentException e) {
|
||||
log.warn("The string {} could not be parsed, using average height and descent", text);
|
||||
return new HeightAndDescent(800, -50);
|
||||
}
|
||||
|
||||
ByteArrayInputStream in = new ByteArrayInputStream(bytes);
|
||||
|
||||
float descent = 0;
|
||||
float height = 0;
|
||||
while (in.available() > 0) {
|
||||
try {
|
||||
int code = type0Font.readCode(in);
|
||||
int glyphId = type0Font.codeToGID(code);
|
||||
GlyphData glyph = trueTypeFont.getGlyph().getGlyph(glyphId);
|
||||
if (glyph == null || glyph.getBoundingBox() == null) {
|
||||
continue;
|
||||
}
|
||||
if (!slashGlyphIds.contains(glyphId)) {
|
||||
descent = Math.min(descent, glyph.getYMinimum());
|
||||
}
|
||||
height = Math.max(height, glyph.getYMaximum());
|
||||
} catch (Exception e) {
|
||||
log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);
|
||||
}
|
||||
}
|
||||
// some characters like comma or minus return very small height values, while tesseract still returns a normal-sized bounding box and therefore exploding the height scaling factors,
|
||||
// so we need a minimum value. Here, 500 seems optimal for the characters "-", ",", "_"
|
||||
return new HeightAndDescent(Math.max(height, 500), descent);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PDFont getFont() {
|
||||
|
||||
return type0Font;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public PDFont embed(PDDocument document) {
|
||||
|
||||
if (documentThisIsEmbeddedIn.equals(document)) {
|
||||
return getFont();
|
||||
}
|
||||
|
||||
// no need to close, the font will be closed with the document it is embedded in
|
||||
|
||||
this.trueTypeFont = readFromResourcePath(resourcePath);
|
||||
this.type0Font = PDType0Font.load(document, trueTypeFont, true);
|
||||
this.documentThisIsEmbeddedIn = document;
|
||||
return getFont();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void close() {
|
||||
|
||||
trueTypeFont.close();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,158 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.TextPositionAndWordImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.WordImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class FontStyleDetector {
|
||||
|
||||
OcrServiceSettings settings;
|
||||
StrokeWidthCalculator strokeWidthCalculator;
|
||||
|
||||
|
||||
/**
|
||||
* Implementation of the MOBDoB algorithm, refer to the paper here:
|
||||
* <a href="http://mile.ee.iisc.ac.in/publications/softCopy/DocumentAnalysis/Sai_NCVPRIPG2013.pdf">Script Independent Detection of Bold Words in Multi Font-size Documents</a>
|
||||
* <p>
|
||||
* As a high level overview: We cluster all text based on its font size. We determine the cluster with the most words. This is assumed to be regular text.
|
||||
* We then estimate the average stroke width of that cluster by thinning all text to a single pixel and calculating the ratio of remaining pixels.
|
||||
* (<a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>)
|
||||
* For each word we scale this average strokewidth based on its fontsize compared to the most common fontsize.
|
||||
* Using the scaled strokewidth we do an opening operation.
|
||||
* (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
|
||||
* We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
|
||||
* <p>
|
||||
* I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation.
|
||||
* But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
|
||||
* The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
|
||||
* Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
|
||||
* It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me.
|
||||
*/
|
||||
public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {
|
||||
|
||||
FontMetricsFactory fontMetricsFactory = Type0FontMetricsFactory.regular(document);
|
||||
if (!settings.isBoldDetection()) {
|
||||
return OcrResultToWrite.buildOcrResultsToWrite(ocrResults, fontMetricsFactory);
|
||||
}
|
||||
|
||||
Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage = new HashMap<>();
|
||||
|
||||
DBSCANClusterer<TextPositionAndWordImage> clusterer = new DBSCANClusterer<>(0.5, 1);
|
||||
|
||||
FontMetricsFactory boldFontMetricsFactory = Type0FontMetricsFactory.bold(document);
|
||||
|
||||
for (OcrResult result : ocrResults) {
|
||||
FontStyleDetectionModel fontStyleDetectionModel = FontStyleDetectionModel.fromOcrResult(result, fontMetricsFactory, settings);
|
||||
|
||||
List<Cluster<TextPositionAndWordImage>> clusters = clusterer.cluster(fontStyleDetectionModel.getTextPositionsAndWordImages());
|
||||
Optional<Cluster<TextPositionAndWordImage>> largestCluster = clusters.stream().max(Comparator.comparingInt(cluster -> cluster.getPoints().size()));
|
||||
|
||||
if (largestCluster.isEmpty()) {
|
||||
insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
|
||||
continue;
|
||||
}
|
||||
|
||||
List<TextPositionAndWordImage> wordsWithMostCommonTextHeight = largestCluster.get().getPoints();
|
||||
|
||||
double standardTextHeight = calculateStandardTextheight(wordsWithMostCommonTextHeight);
|
||||
double regularStrokeWidth = calculateRegularStrokeWidth(wordsWithMostCommonTextHeight);
|
||||
|
||||
for (TextPositionAndWordImage textPositionsAndWordImage : fontStyleDetectionModel.getTextPositionsAndWordImages()) {
|
||||
decideOnFontStyle(textPositionsAndWordImage, regularStrokeWidth, standardTextHeight, boldFontMetricsFactory);
|
||||
}
|
||||
|
||||
insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
|
||||
fontStyleDetectionModel.dispose();
|
||||
}
|
||||
|
||||
log.info("Finished bold detection");
|
||||
return ocrResultToWritePerPage;
|
||||
}
|
||||
|
||||
|
||||
private static double calculateStandardTextheight(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
|
||||
|
||||
return wordsWithMostCommonTextHeight.stream()
|
||||
.map(TextPositionAndWordImage::getWordImage)
|
||||
.mapToDouble(WordImage::getTextHeight)
|
||||
.filter(Double::isFinite)
|
||||
.average()
|
||||
.orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
private double calculateRegularStrokeWidth(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {
|
||||
|
||||
return wordsWithMostCommonTextHeight.stream()
|
||||
.mapToDouble(textPositionAndWordImage -> strokeWidthCalculator.calculate(textPositionAndWordImage.getWordImage().getImage()))
|
||||
.filter(Double::isFinite)
|
||||
.average()
|
||||
.orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
private static void insertResultIntoMap(int pageNumber, Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage, FontStyleDetectionModel fontStyleDetectionModel) {
|
||||
|
||||
OcrResultToWrite ocrResult = OcrResultToWrite.fromFontStyleDetectionModel(fontStyleDetectionModel);
|
||||
|
||||
ocrResultToWritePerPage.compute(pageNumber, (key, existingList) -> {
|
||||
if (existingList == null) {
|
||||
return List.of(ocrResult);
|
||||
} else {
|
||||
return Stream.concat(existingList.stream(), Stream.of(ocrResult)).toList();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private void decideOnFontStyle(TextPositionAndWordImage textPositionsAndWordImage,
|
||||
double standardStrokeWidth,
|
||||
double standardTextHeight,
|
||||
FontMetricsFactory boldFontMetricsFactory) {
|
||||
|
||||
double scaledStrokeWidth = scaleStrokeWidthByFontSize(textPositionsAndWordImage, standardStrokeWidth, standardTextHeight);
|
||||
|
||||
if (textPositionsAndWordImage.getWordImage().hasLargerStrokeWidth(scaledStrokeWidth)) {
|
||||
textPositionsAndWordImage.getTextPositionInImage().setFontMetricsFactory(boldFontMetricsFactory);
|
||||
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.BOLD);
|
||||
} else {
|
||||
textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.REGULAR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static double scaleStrokeWidthByFontSize(TextPositionAndWordImage textPositionsAndWordImage, double standardStrokeWidth, double standardFontSize) {
|
||||
|
||||
double influenceOfFontSize = 1.0; // the paper states that stroke width scales exactly linearly with font size. This did not seem to be true for me. Maybe some of the preprocessing steps are affecting this.
|
||||
double fontsizeScalingFactor = Math.sqrt(textPositionsAndWordImage.getWordImage().getTextHeight() / standardFontSize);
|
||||
return standardStrokeWidth + (influenceOfFontSize * (fontsizeScalingFactor - 1) * standardStrokeWidth);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,57 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.Sel;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
/**
|
||||
* This code is a good start for detecting italic text, although it has a few issues especially with glyphs which are naturally slanted. E.g. z, 2, 7, /
|
||||
* If we want this maybe we should exclude these glyphs and then it might have less false positives. But in its current state i don't recommend using it.
|
||||
*/
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ItalicDetector {
|
||||
|
||||
|
||||
static String italicKernel = "ooxxooxxooxxoxxooXxooxxoxxooxxooxxoo";
|
||||
Sel italicSel = Leptonica1.selCreateFromString(italicKernel, 9, 4, "italicKernel");
|
||||
Sel brickSel = Leptonica1.selCreateBrick(3, 4, 1, 2, 1);
|
||||
|
||||
|
||||
public boolean isItalic(Pix pix) {
|
||||
|
||||
Pix preprocessed = preprocess(pix);
|
||||
Pix flipped = Leptonica1.pixFlipLR(null, pix);
|
||||
Pix flippedPreprocessed = preprocess(flipped);
|
||||
Leptonica1.pixFlipLR(flippedPreprocessed, flippedPreprocessed);
|
||||
double pixelDensity = ImageProcessingUtils.calculatePixelDensity(preprocessed);
|
||||
double flippedPixelDensity = ImageProcessingUtils.calculatePixelDensity(flippedPreprocessed);
|
||||
LeptUtils.disposePix(preprocessed);
|
||||
LeptUtils.disposePix(flipped);
|
||||
LeptUtils.disposePix(flippedPreprocessed);
|
||||
return flippedPixelDensity / pixelDensity < 0.85;
|
||||
}
|
||||
|
||||
|
||||
private Pix preprocess(Pix pix) {
|
||||
|
||||
Pix eroded = Leptonica1.pixErode(null, pix, italicSel.getPointer());
|
||||
Pix dilated = Leptonica1.pixDilate(null, eroded, brickSel.getPointer());
|
||||
LeptUtils.disposePix(eroded);
|
||||
return dilated;
|
||||
}
|
||||
|
||||
|
||||
public void dispose() {
|
||||
|
||||
LeptUtils.dispose(italicSel);
|
||||
LeptUtils.dispose(brickSel);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,58 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
|
||||
|
||||
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
|
||||
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.Sela;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Service
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class StrokeWidthCalculator {
|
||||
|
||||
Sela thinningSel;
|
||||
|
||||
|
||||
/**
|
||||
* Uses a series of sels to thin all connected lines to a single pixel. Then the pixel ratio is a good estimation of the stroke width in pixels.
|
||||
* <a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>
|
||||
* Since the baseline is a strokewidth of exactly one, we need to add 1 to the result.
|
||||
*
|
||||
* @param input binarized pix with text on it
|
||||
* @return estimated stroke width in pixels
|
||||
*/
|
||||
public double calculate(Pix input) {
|
||||
|
||||
init();
|
||||
|
||||
Pix thinned = Leptonica1.pixThinConnectedBySet(input, L_THIN_FG, thinningSel, 0);
|
||||
|
||||
IntBuffer thinnedPixelCount = IntBuffer.allocate(1);
|
||||
Leptonica1.pixCountPixels(thinned, thinnedPixelCount, null);
|
||||
|
||||
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||
Leptonica1.pixCountPixels(input, pixelCount, null);
|
||||
|
||||
LeptUtils.disposePix(thinned);
|
||||
|
||||
return (double) pixelCount.get() / thinnedPixelCount.get() + 1;
|
||||
}
|
||||
|
||||
|
||||
private void init() {
|
||||
|
||||
if (thinningSel == null) {
|
||||
thinningSel = Leptonica1.selaMakeThinSets(1, 0);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,65 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
/*
|
||||
This just moves the Elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class BlockingQueueFiller extends Thread {
|
||||
|
||||
final BlockingQueue<RenderedPageImageFile> imageInputQueue;
|
||||
final BlockingQueue<UnprocessedImage> imageOutputQueue;
|
||||
|
||||
@Setter
|
||||
boolean allImagesQueued;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
try {
|
||||
while (!allImagesQueued) {
|
||||
final UnprocessedImage image = imageInputQueue.take();
|
||||
try {
|
||||
imageOutputQueue.put(image);
|
||||
} catch (InterruptedException e) {
|
||||
imageOutputQueue.put(image);
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
log.info("All images extracted, emptying processing queue and stopping");
|
||||
}
|
||||
|
||||
// empty the queue
|
||||
try {
|
||||
while (true) {
|
||||
final UnprocessedImage image = imageInputQueue.remove();
|
||||
imageOutputQueue.put(image);
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("No images left in queue, stopping.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,121 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class GhostScriptOutputHandler extends Thread {
|
||||
|
||||
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
|
||||
final InputStream is;
|
||||
final String processName;
|
||||
final Type type;
|
||||
|
||||
final Map<Integer, RenderedPageImageFile> pagesToProcess;
|
||||
final BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput;
|
||||
|
||||
int currentPageNumber;
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler errorHandler(InputStream is) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null);
|
||||
}
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler stdOut(InputStream is,
|
||||
Map<Integer, RenderedPageImageFile> pagesToProcess,
|
||||
BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
while (true) {
|
||||
line = br.readLine();
|
||||
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error(processName + "_" + type.name() + ">" + line);
|
||||
} else {
|
||||
log.debug(processName + "_" + type.name() + ">" + line);
|
||||
addProcessedImageToQueue(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
if (type.equals(Type.STD_OUT)) {
|
||||
queueFinishedPage(currentPageNumber);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addProcessedImageToQueue(String line) {
|
||||
|
||||
/*
|
||||
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
|
||||
*/
|
||||
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
|
||||
if (pageNumberMatcher.find()) {
|
||||
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
|
||||
|
||||
if (currentPageNumber == 0) {
|
||||
currentPageNumber = pageNumber;
|
||||
return;
|
||||
}
|
||||
|
||||
queueFinishedPage(currentPageNumber);
|
||||
currentPageNumber = pageNumber;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void queueFinishedPage(int pageNumber) {
|
||||
|
||||
var imageFile = this.pagesToProcess.get(pageNumber);
|
||||
if (imageFile == null) {
|
||||
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
|
||||
}
|
||||
renderedPageImageFileOutput.add(imageFile);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Kind of stream a handler reads from; it decides how the lines are treated in {@code run()}.
 */
public enum Type {

    /** Lines are logged as errors. */
    ERROR,

    /** Lines are logged on debug level and scanned for page numbers. */
    STD_OUT
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,110 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageExtractionThread extends Thread {
|
||||
|
||||
static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
|
||||
static double IMAGE_ALIGNMENT_THRESHOLD = 1;
|
||||
|
||||
int id;
|
||||
@Getter
|
||||
List<Integer> pageIndices;
|
||||
File documentFile;
|
||||
OcrProgressLogger logger;
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
|
||||
// output is written to these lists
|
||||
BlockingQueue<UnprocessedImage> imageProcessingQueue;
|
||||
List<Integer> stitchedPageNumbers;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
long timestamp;
|
||||
for (Integer pageIndex : pageIndices) {
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
|
||||
timestamp = System.currentTimeMillis();
|
||||
List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
|
||||
stats.increaseImageExtraction(id, System.currentTimeMillis() - timestamp);
|
||||
if (extractedImages.isEmpty()) {
|
||||
logger.logPageSkipped(pageIndex);
|
||||
}
|
||||
|
||||
if (checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1))) {
|
||||
stitchedPageNumbers.add(pageIndex);
|
||||
logger.addImagesToProcess(pageIndex, 0);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (ExtractedImage image : extractedImages) {
|
||||
imageProcessingQueue.put(image);
|
||||
logger.addImagesToProcess(image.pageNumber(), image.numberOnPage());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {
|
||||
|
||||
PDPage page = document.getPage(pageIndex - 1);
|
||||
ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
|
||||
imageStreamEngine.processPage(pageIndex, page);
|
||||
return imageStreamEngine.getImagesOnCurrentPage();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {
|
||||
|
||||
if (imagesOnCurrentPage.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (ExtractedImage imageOnPage : imagesOnCurrentPage) {
|
||||
if (imageOnPage.width() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth() && imageOnPage.height() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox()
|
||||
.getHeight()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
//checking for intersections or direct alignment of images
|
||||
for (int j = 0; j < imagesOnCurrentPage.size(); j++) {
|
||||
for (int i = j + 1; i < imagesOnCurrentPage.size(); i++) {
|
||||
if (imagesOnCurrentPage.get(j)
|
||||
.getImageCoordinatesInInitialUserSpace()
|
||||
.aligns(imagesOnCurrentPage.get(i).getImageCoordinatesInInitialUserSpace(), IMAGE_ALIGNMENT_THRESHOLD)) {
|
||||
// TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,251 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.L_Kernel;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
/*
|
||||
* This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
|
||||
*/
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ImageProcessingThread extends Thread {
|
||||
|
||||
final BlockingQueue<UnprocessedImage> imageInputQueue;
|
||||
final BlockingQueue<OcrImage> imageOutputQueue;
|
||||
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
|
||||
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.0f, 1);
|
||||
final Statistics stats;
|
||||
final OcrServiceSettings settings;
|
||||
final PDDocument document;
|
||||
|
||||
@Setter
|
||||
boolean allImagesExtracted;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
try {
|
||||
while (!allImagesExtracted) {
|
||||
final UnprocessedImage image = imageInputQueue.take();
|
||||
var ocrImage = this.process(image);
|
||||
try {
|
||||
imageOutputQueue.put(ocrImage);
|
||||
} catch (InterruptedException e) {
|
||||
imageOutputQueue.put(ocrImage);
|
||||
}
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
log.info("All images extracted, emptying processing queue and stopping");
|
||||
}
|
||||
|
||||
try {
|
||||
while (true) {
|
||||
final UnprocessedImage image = imageInputQueue.remove();
|
||||
OcrImage ocrImage = this.process(image);
|
||||
imageOutputQueue.put(ocrImage);
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("No images left in processing queue, stopping.");
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIEnd(this.detectionScriptHandle);
|
||||
TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
|
||||
LeptUtils.dispose(gaussianKernel);
|
||||
}
|
||||
|
||||
|
||||
private OcrImage process(UnprocessedImage unprocessedImage) {
|
||||
|
||||
long timestamp = System.currentTimeMillis();
|
||||
|
||||
OcrImage ocrImage;
|
||||
if (unprocessedImage instanceof ExtractedImage extractedImage) {
|
||||
ocrImage = processExtractedImage(extractedImage);
|
||||
} else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) {
|
||||
ocrImage = processRenderedPageImageFile(renderedPageImageFile);
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass()));
|
||||
}
|
||||
|
||||
stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
|
||||
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.CompareObjectsWithEquals")
|
||||
private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {
|
||||
|
||||
Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||
|
||||
OcrImage ocrImage = new RenderedPageOcrImage(pix.h,
|
||||
pix.w,
|
||||
PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)),
|
||||
rotatedPix,
|
||||
orientDegree);
|
||||
|
||||
if (pix != rotatedPix) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
@SuppressWarnings("PMD.CompareObjectsWithEquals")
|
||||
private OcrImage processExtractedImage(ExtractedImage extractedImage) {
|
||||
|
||||
float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));
|
||||
|
||||
Pix pix = processPix(extractedImage.asPix(), imageDPI, settings.getDpi());
|
||||
|
||||
int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
|
||||
Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);
|
||||
|
||||
OcrImage ocrImage = new ExtractedOcrImage(extractedImage.pageNumber(),
|
||||
extractedImage.numberOnPage(),
|
||||
extractedImage.height(),
|
||||
extractedImage.width(),
|
||||
extractedImage.ctm(),
|
||||
rotatedPix,
|
||||
pix.h,
|
||||
pix.w,
|
||||
orientDegree);
|
||||
|
||||
if (pix != rotatedPix) {
|
||||
LeptUtils.disposePix(pix);
|
||||
}
|
||||
return ocrImage;
|
||||
}
|
||||
|
||||
|
||||
public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {
|
||||
|
||||
TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
|
||||
TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);
|
||||
|
||||
IntBuffer orientationDegreeResultBuffer;
|
||||
FloatBuffer orientationDegreeConfidenceBuffer;
|
||||
PointerByReference scriptureNameBuffer;
|
||||
FloatBuffer scriptureConfidenceBuffer;
|
||||
|
||||
orientationDegreeResultBuffer = IntBuffer.allocate(1);
|
||||
orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
scriptureNameBuffer = new PointerByReference(); // Is this memory being freed?
|
||||
scriptureConfidenceBuffer = FloatBuffer.allocate(1);
|
||||
|
||||
int orientationDegree = 0;
|
||||
int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
|
||||
orientationDegreeResultBuffer,
|
||||
orientationDegreeConfidenceBuffer,
|
||||
scriptureNameBuffer,
|
||||
scriptureConfidenceBuffer);
|
||||
if (result == TRUE && orientationDegreeConfidenceBuffer.get() > settings.getMinRotationConfidence()) {
|
||||
orientationDegree = orientationDegreeResultBuffer.get();
|
||||
}
|
||||
|
||||
TessAPI1.TessBaseAPIClear(detectionScriptHandle);
|
||||
|
||||
return orientationDegree;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix processPix(Pix pix, float imageDpi, int targetDpi) {
|
||||
|
||||
Pix grayScale;
|
||||
Pix scaledUp;
|
||||
Pix gaussian;
|
||||
Pix binarized;
|
||||
|
||||
//convert to grayscale
|
||||
if (pix.d == 8) {
|
||||
grayScale = pix;
|
||||
} else if (pix.d == 32) {
|
||||
grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
|
||||
LeptUtils.disposePix(pix);
|
||||
} else if (pix.d == 1) {
|
||||
grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
|
||||
LeptUtils.disposePix(pix);
|
||||
} else {
|
||||
throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", pix.d));
|
||||
}
|
||||
|
||||
// scale up
|
||||
float targetFactor = targetDpi / imageDpi;
|
||||
if (targetFactor > 2.1) {
|
||||
scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
} else if (targetFactor > 1.1) {
|
||||
scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
|
||||
LeptUtils.disposePix(grayScale);
|
||||
} else {
|
||||
scaledUp = grayScale;
|
||||
}
|
||||
|
||||
// remove noise and prep for Otsu
|
||||
gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
|
||||
LeptUtils.disposePix(scaledUp);
|
||||
|
||||
// Threshold to binary
|
||||
if (pix.w < 100 || pix.h < 100) {
|
||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||
} else {
|
||||
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null);
|
||||
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
|
||||
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
|
||||
}
|
||||
}
|
||||
LeptUtils.disposePix(gaussian);
|
||||
|
||||
return binarized;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {
|
||||
|
||||
ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
|
||||
String datapath = System.getenv("TESSDATA_PREFIX");
|
||||
TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
|
||||
TessAPI1.TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
|
||||
return handle;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,136 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.threads;
|
||||
|
||||
import static net.sourceforge.tess4j.ITessAPI.TRUE;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode;
|
||||
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
|
||||
import com.sun.jna.StringArray;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
import net.sourceforge.tess4j.ITessAPI;
|
||||
import net.sourceforge.tess4j.ITesseract;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OCRThread extends Thread {
|
||||
|
||||
int id;
|
||||
BlockingQueue<OcrImage> imageInputQueue;
|
||||
Path tesseractOutputDir;
|
||||
List<OcrResult> results;
|
||||
OcrProgressLogger logger;
|
||||
Statistics stats;
|
||||
OcrServiceSettings settings;
|
||||
Tesseract2 instance;
|
||||
|
||||
|
||||
/**
 * Creates an OCR thread together with its own Tesseract instance.
 *
 * @param id                 id of this thread, used when reporting durations to the statistics
 * @param imageInputQueue    queue the images to recognise are taken from
 * @param tesseractOutputDir directory the Tesseract output files are written to
 * @param results            list that receives one result per processed image
 * @param logger             progress logger notified about every finished image
 * @param stats              statistics collecting the Tesseract durations
 * @param settings           service settings (languages, page segmentation mode override)
 */
public OCRThread(int id,
                 BlockingQueue<OcrImage> imageInputQueue,
                 Path tesseractOutputDir,
                 List<OcrResult> results,
                 OcrProgressLogger logger,
                 Statistics stats,
                 OcrServiceSettings settings) {

    // identity and configuration
    this.id = id;
    // input and output
    this.imageInputQueue = imageInputQueue;
    this.tesseractOutputDir = tesseractOutputDir;
    this.results = results;
    // reporting
    this.logger = logger;
    this.stats = stats;
    // the Tesseract instance is derived from the settings, so these two come last
    this.settings = settings;
    this.instance = createInstance(settings);
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void run() {
|
||||
|
||||
// Interrupting signals that the image extraction has finished
|
||||
while (!isInterrupted()) {
|
||||
try {
|
||||
final OcrImage image = imageInputQueue.take();
|
||||
this.process(image);
|
||||
} catch (InterruptedException e) {
|
||||
// set isInterrupted to true (This exception may only happen during active waiting for queue, and then isInterrupted will not be set!)
|
||||
interrupt();
|
||||
}
|
||||
}
|
||||
// empty the queue
|
||||
try {
|
||||
while (true) {
|
||||
final OcrImage image = imageInputQueue.remove();
|
||||
this.process(image);
|
||||
}
|
||||
} catch (NoSuchElementException e) {
|
||||
log.debug("Executed tesseract on all Images, finishing.");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void process(OcrImage image) {
|
||||
|
||||
long timestamp = System.currentTimeMillis();
|
||||
String tmpOutputFileName = String.format("output_%04d_%04d", image.getPageNumber(), image.getNumberOnPage());
|
||||
String tesseractOutputFileName = tesseractOutputDir.resolve(tmpOutputFileName).toFile().toString();
|
||||
|
||||
int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();
|
||||
|
||||
executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);
|
||||
image.destroyPix();
|
||||
|
||||
results.add(OcrResult.create(image, tesseractOutputFileName));
|
||||
logger.logImageFinished(image, psm);
|
||||
stats.increaseTesseractDuration(id, System.currentTimeMillis() - timestamp);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {
|
||||
|
||||
Leptonica1.pixWrite(tesseractOutputFileName + ".tiff", pix, 5); // write the used image for later bold detection
|
||||
instance.setVariable("user_defined_dpi", String.valueOf(dpi));
|
||||
instance.setPageSegMode(psm);
|
||||
instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Tesseract2 createInstance(OcrServiceSettings settings) {
|
||||
|
||||
Tesseract2 instance = new Tesseract2();
|
||||
instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
|
||||
instance.setOcrEngineMode(1); // set to LSTM based Engine
|
||||
instance.setLanguage(settings.getLanguages());
|
||||
return instance;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,28 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.settings;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@ConfigurationProperties("ocr-service")
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class OcrServiceSettings {
|
||||
|
||||
int ocrThreadCount = 4; // Number of OCR threads
|
||||
int imageExtractThreadCount = 2; // Number of image extraction threads
|
||||
int gsProcessCount = 1; // Number of Ghostscript processes
|
||||
int dpi = 300; // Target DPI for binarized images
|
||||
int psmOverride = -1; // Overrides the page segmentation mode if > 0
|
||||
int minImageHeight = 20; // Minimum height for images to be processed
|
||||
int minImageWidth = 20; // Minimum width for images to be processed
|
||||
float minRotationConfidence = 2; // Sets a lower bound for the confidence rating for rotated pages.
|
||||
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
|
||||
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
|
||||
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR");
|
||||
boolean boldDetection = true; // if true, bold detection will be attempted
|
||||
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
|
||||
}
|
||||
@ -0,0 +1,85 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.AlphaComposite;
|
||||
import java.awt.Color;
|
||||
import java.awt.Graphics;
|
||||
import java.awt.Graphics2D;
|
||||
import java.awt.Transparency;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import net.sourceforge.lept4j.L_Kernel;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@UtilityClass
|
||||
public class ImageProcessingUtils {
|
||||
|
||||
public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {
|
||||
|
||||
BufferedImage image;
|
||||
if (extractedImage.colorSpace() instanceof PDDeviceRGB || extractedImage.colorSpace() instanceof PDDeviceGray) {
|
||||
image = extractedImage.image();
|
||||
} else {
|
||||
BufferedImage pdfImage = extractedImage.image();
|
||||
image = new BufferedImage(pdfImage.getWidth(), pdfImage.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
|
||||
Graphics g = image.getGraphics();
|
||||
g.drawImage(pdfImage, 0, 0, null);
|
||||
g.dispose();
|
||||
}
|
||||
return image;
|
||||
}
|
||||
|
||||
|
||||
public Pix deRotatePix(int orientDegree, Pix pix) {
|
||||
|
||||
return switch (360 - orientDegree) {
|
||||
case 90 -> Leptonica1.pixRotateOrth(pix, 1);
|
||||
case 180 -> Leptonica1.pixRotateOrth(pix, 2);
|
||||
case 270 -> Leptonica1.pixRotateOrth(pix, 3);
|
||||
default -> pix;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public static void setAlphaChannelToWhite(BufferedImage image) {
|
||||
|
||||
if (image.getTransparency() == Transparency.TRANSLUCENT) {
|
||||
// NOTE: For BITMASK images, the color model is likely IndexColorModel,
|
||||
// and this model will contain the "real" color of the transparent parts
|
||||
// which is likely a better fit than unconditionally setting it to white.
|
||||
|
||||
// Fill background with white
|
||||
Graphics2D graphics = image.createGraphics();
|
||||
try {
|
||||
graphics.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
|
||||
graphics.setPaint(Color.WHITE);
|
||||
graphics.fillRect(0, 0, image.getWidth(), image.getHeight());
|
||||
} finally {
|
||||
graphics.dispose();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static double calculatePixelDensity(Pix pix) {
|
||||
|
||||
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||
int result = Leptonica1.pixCountPixels(pix, pixelCount, null);
|
||||
if (result == 0) {
|
||||
return (double) pixelCount.get() / (pix.h * pix.w);
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,73 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import net.sourceforge.lept4j.L_Kernel;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
|
||||
@UtilityClass
|
||||
public class KernelUtils {
|
||||
|
||||
/*
|
||||
-1, -1, -1
|
||||
-1, 8, -1
|
||||
-1, -1, -1
|
||||
*/
|
||||
public L_Kernel createFullLaplacianKernel() {
|
||||
|
||||
L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 8);
|
||||
return laplacianKernel;
|
||||
}
|
||||
|
||||
/*
|
||||
0, 0, -1, 0, 0
|
||||
0, -1, -1, -1, 0
|
||||
-1, -1, 12, -1, -1
|
||||
0, -1, -1, -1, 0
|
||||
0, 0, -1, 0, 0
|
||||
*/
|
||||
public L_Kernel createLaplacianKernel5x5() {
|
||||
|
||||
L_Kernel laplacianKernel = Leptonica1.kernelCreate(5, 5);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 3, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 3, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 4, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 3, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 3, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 3, 3, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 4, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 2, 12);
|
||||
return laplacianKernel;
|
||||
}
|
||||
|
||||
/*
|
||||
0, -1, 0
|
||||
-1, 4, -1
|
||||
0, -1, 0
|
||||
*/
|
||||
public L_Kernel createLaplacianKernel() {
|
||||
|
||||
L_Kernel laplacianKernel = Leptonica1.kernelCreate(3, 3);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 0, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 0, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 2, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 2, 1, -1);
|
||||
Leptonica1.kernelSetElement(laplacianKernel, 1, 1, 4);
|
||||
return laplacianKernel;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,64 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ListSplittingUtils {
|
||||
|
||||
public List<List<Integer>> buildBalancedContinuousSublist(Integer totalNumberOfEntries, int threadCount) {
|
||||
|
||||
return buildBalancedSublist(IntStream.range(0, totalNumberOfEntries).map(i -> i + 1).boxed().toList(), threadCount);
|
||||
}
|
||||
|
||||
|
||||
public <T> List<List<T>> buildBalancedSublist(List<T> entries, int threadCount) {
|
||||
|
||||
List<Integer> balancedEntryCounts = buildBalancedEntryCounts(entries.size(), threadCount);
|
||||
List<List<T>> balancedSublist = new ArrayList<>(threadCount);
|
||||
int startIdx = 0;
|
||||
for (Integer numberOfEntriesPerThread : balancedEntryCounts) {
|
||||
balancedSublist.add(entries.subList(startIdx, startIdx + numberOfEntriesPerThread));
|
||||
startIdx += numberOfEntriesPerThread;
|
||||
}
|
||||
return balancedSublist;
|
||||
}
|
||||
|
||||
|
||||
public <T> List<List<List<T>>> buildBatchedBalancedSublist(List<T> entries, int threadCount, int batchSize) {
|
||||
|
||||
// batches -> threads -> entries
|
||||
List<List<List<T>>> batchedBalancedSubList = new LinkedList<>();
|
||||
List<List<List<T>>> threadsWithBatches = buildBalancedSublist(entries, threadCount).stream().map(list -> buildBalancedSublist(list, batchSize)).toList();
|
||||
// swap first two dimensions
|
||||
for (int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
|
||||
List<List<T>> threadEntriesPerBatch = new ArrayList<>(threadCount);
|
||||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
|
||||
threadEntriesPerBatch.add(threadsWithBatches.get(threadIdx).get(batchIdx));
|
||||
}
|
||||
batchedBalancedSubList.add(threadEntriesPerBatch);
|
||||
|
||||
}
|
||||
return batchedBalancedSubList;
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> buildBalancedEntryCounts(int totalNumberOfEntries, int threadCount) {
|
||||
|
||||
List<Integer> numberOfPagesPerThread = new ArrayList<>(threadCount);
|
||||
for (int i = 0; i < threadCount; i++) {
|
||||
numberOfPagesPerThread.add(0);
|
||||
}
|
||||
int threadIdx;
|
||||
for (int i = 0; i < totalNumberOfEntries; i++) {
|
||||
threadIdx = i % threadCount;
|
||||
numberOfPagesPerThread.set(threadIdx, numberOfPagesPerThread.get(threadIdx) + 1);
|
||||
}
|
||||
return numberOfPagesPerThread;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PdfDpiCalculator {
|
||||
|
||||
public int calculateDpi(QuadPoint imageBounds, AffineTransform imageCTM, double width) {
|
||||
|
||||
QuadPoint transformedImageBounds = imageBounds.getTransformed(imageCTM);
|
||||
double transformedWidth = transformedImageBounds.a().distance(transformedImageBounds.d());
|
||||
double widthInInches = transformedWidth * 1 / 72;
|
||||
return (int) Math.ceil(width / widthInInches);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,73 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
|
||||
public class PdfDraw {
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawGrid(ElementWriter writer, Page page) {
|
||||
|
||||
try (var eb = new ElementBuilder()) {
|
||||
double dX = 15;
|
||||
double dY = 15;
|
||||
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
||||
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
||||
for (int row = 0; row < nRows; ++row) {
|
||||
for (int col = 0; col < nCols; ++col) {
|
||||
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
||||
cell.setPathStroke(true);
|
||||
cell.getGState().setLineWidth(1);
|
||||
cell.getGState().setStrokeOpacity(0.1);
|
||||
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
if (row == 0 && col == 0) {
|
||||
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
||||
cell.setPathFill(true);
|
||||
cell.getGState().setFillOpacity(0.8);
|
||||
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
||||
} else {
|
||||
cell.setPathFill(false);
|
||||
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
||||
}
|
||||
writer.writePlacedElement(cell);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
|
||||
|
||||
try (var colorPt = new ColorPt(1, 0, 0); var eb = new ElementBuilder()) {
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
try(var r = rectCollection.getRectAt(i)) {
|
||||
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
||||
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setLineWidth(5);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
|
||||
rect.setPathFill(true);
|
||||
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setFillColor(colorPt);
|
||||
rect.getGState().setFillOpacity(0.5);
|
||||
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,141 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.Rectangle;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.sun.jna.Pointer;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.tess4j.OCRResult;
|
||||
import net.sourceforge.tess4j.TessAPI1;
|
||||
import net.sourceforge.tess4j.Tesseract1;
|
||||
import net.sourceforge.tess4j.TesseractException;
|
||||
import net.sourceforge.tess4j.Word;
|
||||
|
||||
@Slf4j
|
||||
/**
|
||||
* Overriden version only so I can use Tesseract1 with Pixs instead of BufferedImages. All Functions are copied and then the BufferedImage -> Pix conversion deleted.
|
||||
*/
|
||||
public class Tesseract2 extends Tesseract1 {
|
||||
|
||||
|
||||
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
|
||||
|
||||
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
|
||||
TessResultRendererBeginDocument(renderer, title);
|
||||
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
|
||||
TessResultRendererEndDocument(renderer);
|
||||
|
||||
// if (result == ITessAPI.FALSE) {
|
||||
// throw new TesseractException("Error during processing page.");
|
||||
// }
|
||||
|
||||
return TessBaseAPIMeanTextConf(getHandle());
|
||||
}
|
||||
|
||||
|
||||
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
|
||||
|
||||
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
|
||||
if (!results.isEmpty()) {
|
||||
return results.get(0);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
|
||||
|
||||
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
|
||||
throw new RuntimeException("The three arrays must match in length.");
|
||||
}
|
||||
|
||||
init();
|
||||
setVariables();
|
||||
|
||||
List<OCRResult> results = new ArrayList<OCRResult>();
|
||||
|
||||
try {
|
||||
for (int i = 0; i < pixs.length; i++) {
|
||||
try {
|
||||
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
|
||||
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
|
||||
TessDeleteResultRenderer(renderer);
|
||||
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
|
||||
results.add(new OCRResult(meanTextConfidence, words));
|
||||
} catch (Exception e) {
|
||||
// skip the problematic image file
|
||||
log.warn(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
dispose();
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
private List<Word> getRecognizedWords(int pageIteratorLevel) {
|
||||
|
||||
List<Word> words = new ArrayList<>();
|
||||
|
||||
try {
|
||||
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
|
||||
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
|
||||
TessPageIteratorBegin(pi);
|
||||
|
||||
do {
|
||||
Pointer ptr = TessResultIteratorGetUTF8Text(ri, pageIteratorLevel);
|
||||
if (ptr == null) {
|
||||
continue;
|
||||
}
|
||||
String text = ptr.getString(0);
|
||||
TessAPI1.TessDeleteText(ptr);
|
||||
float confidence = TessResultIteratorConfidence(ri, pageIteratorLevel);
|
||||
IntBuffer leftB = IntBuffer.allocate(1);
|
||||
IntBuffer topB = IntBuffer.allocate(1);
|
||||
IntBuffer rightB = IntBuffer.allocate(1);
|
||||
IntBuffer bottomB = IntBuffer.allocate(1);
|
||||
TessPageIteratorBoundingBox(pi, pageIteratorLevel, leftB, topB, rightB, bottomB);
|
||||
int left = leftB.get();
|
||||
int top = topB.get();
|
||||
int right = rightB.get();
|
||||
int bottom = bottomB.get();
|
||||
Word word = new Word(text, confidence, new Rectangle(left, top, right - left, bottom - top));
|
||||
words.add(word);
|
||||
} while (TessPageIteratorNext(pi, pageIteratorLevel) == TRUE);
|
||||
// TessPageIteratorDelete(pi);
|
||||
TessResultIteratorDelete(ri);
|
||||
} catch (Exception e) {
|
||||
log.warn(e.getMessage(), e);
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
|
||||
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
|
||||
|
||||
TessResultRenderer renderer = null;
|
||||
|
||||
for (RenderedFormat format : formats) {
|
||||
switch (format) {
|
||||
|
||||
case HOCR:
|
||||
if (renderer == null) {
|
||||
renderer = TessHOcrRendererCreate(outputbase);
|
||||
} else {
|
||||
TessResultRendererInsert(renderer, TessHOcrRendererCreate(outputbase));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return renderer;
|
||||
}
|
||||
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,29 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@SuppressWarnings("PMD")
|
||||
class Type0FontMetricsFactoryTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testStringWidth() {
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsFactoryTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
|
||||
Type0FontMetricsFactory metricsFactory = Type0FontMetricsFactory.regular(document);
|
||||
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import static net.sourceforge.lept4j.ILeptonica.IFF_PNG;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@Disabled
|
||||
class ImageProcessingUtilsTest {
|
||||
|
||||
@BeforeEach
|
||||
public void loadLeptonica() {
|
||||
|
||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testRotation() {
|
||||
|
||||
Pix pix = Leptonica1.pixRead("/home/kschuettler/Downloads/painHarold.webp");
|
||||
Pix pix2 = ImageProcessingUtils.deRotatePix(0, pix);
|
||||
Leptonica1.pixWrite("/tmp/0.png", pix2, IFF_PNG);
|
||||
Pix pix3 = ImageProcessingUtils.deRotatePix(90, pix);
|
||||
Leptonica1.pixWrite("/tmp/90.png", pix3, IFF_PNG);
|
||||
Pix pix4 = ImageProcessingUtils.deRotatePix(180, pix);
|
||||
Leptonica1.pixWrite("/tmp/180.png", pix4, IFF_PNG);
|
||||
Pix pix5 = ImageProcessingUtils.deRotatePix(270, pix);
|
||||
Leptonica1.pixWrite("/tmp/270.png", pix5, IFF_PNG);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class ListSplittingUtilsTest {
|
||||
|
||||
@Test
|
||||
public void testBalancedListSplitting() {
|
||||
|
||||
int threadCount = 18;
|
||||
int numberOfPages = 48;
|
||||
var balancedList = ListSplittingUtils.buildBalancedContinuousSublist(numberOfPages, threadCount);
|
||||
assertEquals(threadCount, balancedList.size());
|
||||
assertEquals(numberOfPages, balancedList.stream().mapToLong(Collection::size).sum());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,83 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.File;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
// YOU NEED GHOSTSCRIPT INSTALLED TO RUN THIS TEST!!!!
|
||||
@Disabled
|
||||
public class Pdf2ImgTest {
|
||||
|
||||
private static final int DPI = 150;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@Disabled
|
||||
public void testPDFBox() {
|
||||
|
||||
String outputDir = OsUtils.getTemporaryDirectory("imageOutput", "");
|
||||
new File(outputDir).mkdirs();
|
||||
ClassPathResource resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
try (PDDocument document = Loader.loadPDF(resource.getFile())) {
|
||||
PDFRenderer renderer = new PDFRenderer(document);
|
||||
for (int pageNumber = 0; pageNumber < document.getNumberOfPages(); pageNumber++) {
|
||||
BufferedImage image = renderer.renderImageWithDPI(pageNumber, DPI);
|
||||
boolean written = ImageIOUtil.writeImage(image, "tif", new File(outputDir + String.format("page%04d", pageNumber)).getAbsolutePath(), DPI);
|
||||
System.out.printf("%d: %s%n", pageNumber, written);
|
||||
}
|
||||
}
|
||||
FileSystemUtils.deleteRecursively(new File(outputDir));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGhostScriptParallel() {
|
||||
|
||||
int numOfProcesses = 5;
|
||||
ClassPathResource resource = new ClassPathResource("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
String outputDir = "/tmp/ghostscript_out/";
|
||||
List<Process> processes = IntStream.range(0, numOfProcesses).boxed().parallel().map(i -> buildCmdArgs(i, outputDir, resource)).map(Pdf2ImgTest::executeProcess).toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
for (Process process : processes) {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
System.out.println("Ghostscripts finished with exit codes " + processExitCodes);
|
||||
FileSystemUtils.deleteRecursively(new File(outputDir));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static Process executeProcess(String[] cmdArgs) {
|
||||
|
||||
return Runtime.getRuntime().exec(cmdArgs);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static String[] buildCmdArgs(Integer i, String outputDir, ClassPathResource resource) {
|
||||
|
||||
String outDir = outputDir + "/" + i + "/";
|
||||
new File(outDir).mkdirs();
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiffgray", "-r" + DPI, "-sOutputFile=" + outDir + "page%04d", resource.getFile().toString(), "-c", "quit"};
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,160 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>ocr-service-v1</artifactId>
|
||||
<version>3.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>ocr-service-server-v1</artifactId>
|
||||
|
||||
<properties>
|
||||
<tennat-commons.version>0.10.0</tennat-commons.version>
|
||||
<persistence-service.version>2.93.0</persistence-service.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>tenant-commons</artifactId>
|
||||
<version>${tennat-commons.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>persistence-service-internal-api-v1</artifactId>
|
||||
<version>${persistence-service.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.service</groupId>
|
||||
<artifactId>ocr-service-api-v1</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>storage-commons</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>pdftron-logic-commons</artifactId>
|
||||
<version>2.5.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>spring-commons</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>metric-commons</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.cloud</groupId>
|
||||
<artifactId>spring-cloud-starter-openfeign</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.pdftron</groupId>
|
||||
<artifactId>PDFNet</artifactId>
|
||||
<version>10.1.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-amqp</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.amazonaws</groupId>
|
||||
<artifactId>aws-java-sdk-kms</artifactId>
|
||||
<version>1.12.440</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Test -->
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>test-commons</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.amqp</groupId>
|
||||
<artifactId>spring-rabbit-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-tomcat</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<!-- generate git.properties for exposure in /info -->
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>revision</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<generateGitPropertiesFile>true</generateGitPropertiesFile>
|
||||
<gitDescribe>
|
||||
<tags>true</tags>
|
||||
</gitDescribe>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<annotationProcessors>
|
||||
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
|
||||
</annotationProcessors>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<!-- repackages the generated jar into a runnable fat-jar and makes it
|
||||
executable -->
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>repackage</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<executable>true</executable>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>pdftron</id>
|
||||
<name>PDFNet Maven</name>
|
||||
<url>https://pdftron.com/maven/release</url>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
|
||||
</project>
|
||||
@ -1,10 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.client;
|
||||
|
||||
import org.springframework.cloud.openfeign.FeignClient;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.internal.resources.FileStatusProcessingUpdateResource;
|
||||
|
||||
@FeignClient(name = "FileStatusProcessingUpdateResource", url = "${persistence-service.url}")
|
||||
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {
|
||||
|
||||
}
|
||||
@ -1,45 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.configuration;
|
||||
|
||||
import org.springframework.amqp.core.Queue;
|
||||
import org.springframework.amqp.core.QueueBuilder;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Configuration
|
||||
@RequiredArgsConstructor
|
||||
public class MessagingConfiguration {
|
||||
|
||||
public static final String OCR_QUEUE = "ocrQueue";
|
||||
public static final String OCR_DLQ = "ocrDLQ";
|
||||
|
||||
public static final String X_DEAD_LETTER_EXCHANGE = "x-dead-letter-exchange";
|
||||
public static final String X_DEAD_LETTER_ROUTING_KEY = "x-dead-letter-routing-key";
|
||||
public static final String X_MAX_PRIORITY = "x-max-priority";
|
||||
|
||||
public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";
|
||||
|
||||
public static final String X_ERROR_INFO_HEADER = "x-error-message";
|
||||
public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp";
|
||||
|
||||
|
||||
@Bean
|
||||
public Queue ocrQueue() {
|
||||
|
||||
return QueueBuilder.durable(OCR_QUEUE)
|
||||
.withArgument(X_DEAD_LETTER_EXCHANGE, "")
|
||||
.withArgument(X_DEAD_LETTER_ROUTING_KEY, OCR_DLQ)
|
||||
.withArgument(X_MAX_PRIORITY, 2)
|
||||
.maxPriority(2)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public Queue ocrDeadLetterQueue() {
|
||||
|
||||
return QueueBuilder.durable(OCR_DLQ).build();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,31 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.initializer;
|
||||
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
public class PDFNetInitializer {
|
||||
|
||||
@Value("${pdftron.license:}")
|
||||
private String pdftronLicense;
|
||||
|
||||
@Value("${pdftron.ocrmodule.path:/tmp}")
|
||||
private String ocrModulePath;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@PostConstruct
|
||||
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
|
||||
public void init() {
|
||||
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.addResourceSearchPath(ocrModulePath);
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
}
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model;
|
||||
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class ImagePosition {
|
||||
|
||||
private Rectangle rectangle;
|
||||
private boolean hasTransparency;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Classification {
|
||||
|
||||
private Map<String, Float> probabilities = new HashMap<>();
|
||||
private String label;
|
||||
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class FilterGeometry {
|
||||
|
||||
private ImageSize imageSize;
|
||||
private ImageFormat imageFormat;
|
||||
|
||||
}
|
||||
@ -1,12 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Filters {
|
||||
|
||||
private FilterGeometry geometry;
|
||||
private Probability probability;
|
||||
private boolean allPassed;
|
||||
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Geometry {
|
||||
|
||||
private float width;
|
||||
private float height;
|
||||
|
||||
}
|
||||
@ -1,12 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageFormat {
|
||||
|
||||
private float quotient;
|
||||
private boolean tooTall;
|
||||
private boolean tooWide;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageMetadata {
|
||||
|
||||
private Classification classification;
|
||||
private Position position;
|
||||
private Geometry geometry;
|
||||
private Filters filters;
|
||||
private boolean alpha;
|
||||
|
||||
}
|
||||
@ -1,26 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonAlias;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageServiceResponse {
|
||||
|
||||
private String dossierId;
|
||||
private String fileId;
|
||||
|
||||
@JsonProperty(value = "imageMetadata")
|
||||
@JsonAlias("data")
|
||||
private List<ImageMetadata> data = new ArrayList<>();
|
||||
|
||||
|
||||
@JsonProperty(value = "imageMetadata")
|
||||
@JsonAlias("data")
|
||||
public void setData(List<ImageMetadata> data) {this.data = data;}
|
||||
|
||||
}
|
||||
@ -1,12 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class ImageSize {
|
||||
|
||||
private float quotient;
|
||||
private boolean tooLarge;
|
||||
private boolean tooSmall;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Position {
|
||||
|
||||
private float x1;
|
||||
private float x2;
|
||||
private float y1;
|
||||
private float y2;
|
||||
private int pageNumber;
|
||||
|
||||
}
|
||||
@ -1,10 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model.image;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class Probability {
|
||||
|
||||
private boolean unconfident;
|
||||
|
||||
}
|
||||
@ -1,70 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class FileStorageService {
|
||||
|
||||
private final StorageService storageService;
|
||||
|
||||
|
||||
public static String getStorageId(String dossierId, String fileId, FileType fileType) {
|
||||
|
||||
return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public byte[] getOriginalFile(String dossierId, String fileId) {
|
||||
|
||||
return IOUtils.toByteArray(storageService.getObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN)).getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public InputStream getOriginalFileAsStream(String dossierId, String fileId) {
|
||||
|
||||
return storageService.getObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN)).getInputStream();
|
||||
}
|
||||
|
||||
|
||||
public void storeOriginalFile(String dossierId, String fileId, InputStream stream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), stream);
|
||||
}
|
||||
|
||||
|
||||
public boolean untouchedFileExists(String dossierId, String fileId) {
|
||||
|
||||
return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED));
|
||||
}
|
||||
|
||||
|
||||
public void storeUntouchedFile(String dossierId, String fileId, byte[] data) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ImageServiceResponse getImageServiceResponse(String dossierId, String fileId) {
|
||||
|
||||
return storageService.readJSONObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.IMAGE_INFO), ImageServiceResponse.class);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,196 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class ImagePositionRetrievalService {
|
||||
|
||||
private static final double TOLERANCE = 1e-1;
|
||||
|
||||
// any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf
|
||||
private static final int PIXEL_THRESHOLD = 10;
|
||||
|
||||
|
||||
/**
|
||||
* Iterates over all elements in a PDF Document and retrieves the bounding box for each image, that is larger than the pixel threshold of 10 in either dimension.
|
||||
* Then it adjusts the bounding boxes for the page rotation.
|
||||
* If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule.
|
||||
*
|
||||
* @param pdfDoc a PDF File as PDFTron PDFDoc class
|
||||
* @param mirrorY if this flag is set, all coordinates are calculated with upper left corner as (0,0), else initial user space
|
||||
* @return a map with the page indices as keys and the image bounding boxes on that page as a RectCollection
|
||||
*/
|
||||
@SneakyThrows
|
||||
public Map<Integer, RectCollection> getImagePositionPerPage(PDFDoc pdfDoc, boolean mirrorY) {
|
||||
|
||||
Map<Integer, RectCollection> pageIdToImagePositions = new HashMap<>();
|
||||
ElementReader reader = new ElementReader();
|
||||
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
|
||||
RectCollection imagePositions = new RectCollection();
|
||||
|
||||
reader.begin(pdfDoc.getPage(pageId));
|
||||
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
||||
imagePositions = mergeOverlappingRects(imagePositions);
|
||||
reader.end();
|
||||
|
||||
if (imagePositions.getNumRects() > 0) {
|
||||
pageIdToImagePositions.put(pageId, imagePositions);
|
||||
}
|
||||
}
|
||||
reader.destroy();
|
||||
return pageIdToImagePositions;
|
||||
}
|
||||
|
||||
|
||||
private void findImagePositionsOnPage(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
Element element;
|
||||
while ((element = reader.next()) != null) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> {
|
||||
// see everyPointInDashedLineIsImage.pdf TestFile
|
||||
if (element.getImageHeight() > PIXEL_THRESHOLD || element.getImageWidth() > PIXEL_THRESHOLD) {
|
||||
imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
|
||||
}
|
||||
}
|
||||
case Element.e_form -> {
|
||||
reader.formBegin();
|
||||
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
|
||||
|
||||
if (imagePositions.getNumRects() < 2) {
|
||||
return imagePositions;
|
||||
}
|
||||
|
||||
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
|
||||
|
||||
mergeRectangleList(rectangleList);
|
||||
|
||||
return toRectCollection(rectangleList);
|
||||
}
|
||||
|
||||
|
||||
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
|
||||
private void mergeRectangleList(List<Rectangle2D> rectangleList) {
|
||||
|
||||
for (int idx = 0; rectangleList.size() >= idx + 2; ) {
|
||||
|
||||
var rect1 = rectangleList.get(idx);
|
||||
var rect2 = rectangleList.get(idx + 1);
|
||||
|
||||
if (intersects(rect1, rect2) && isAlignedXOrY(rect1, rect2)) {
|
||||
rectangleList.remove(idx + 1);
|
||||
rectangleList.remove(idx);
|
||||
rectangleList.add(idx, rect1.createUnion(rect2));
|
||||
} else {
|
||||
++idx;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean intersects(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
return rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
||||
}
|
||||
|
||||
|
||||
private boolean isAlignedXOrY(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
||||
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
||||
|
||||
return isAlignedX || isAlignedY;
|
||||
}
|
||||
|
||||
|
||||
private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
int rotation = page.getRotation();
|
||||
double height = page.getPageHeight();
|
||||
double width = page.getPageWidth();
|
||||
|
||||
// Even though PDFTron almost always has the origin in the lower left corner, for some reason, the OCRModule's addTextZonesForPage() uses the upper left corner as origin...
|
||||
Matrix2D mirrorMatrix;
|
||||
if (mirrorY) {
|
||||
mirrorMatrix = new Matrix2D(1, 0, 0, -1, 0, height);
|
||||
} else {
|
||||
mirrorMatrix = new Matrix2D();
|
||||
}
|
||||
|
||||
// We need to rotate the rects to fit to the page rotation
|
||||
Matrix2D rotationMatrix = switch (rotation) {
|
||||
case 1 -> new Matrix2D(0, -1, 1, 0, 0, height);
|
||||
case 2 -> new Matrix2D(-1, 0, 0, -1, width, height);
|
||||
case 3 -> new Matrix2D(0, 1, -1, 0, width, 0);
|
||||
default -> new Matrix2D();
|
||||
};
|
||||
|
||||
Matrix2D finalMatrix = mirrorMatrix.multiply(rotationMatrix);
|
||||
|
||||
Point2D.Double p1 = finalMatrix.multPoint(bbox.getX1(), bbox.getY1());
|
||||
Point2D.Double p2 = finalMatrix.multPoint(bbox.getX2(), bbox.getY2());
|
||||
|
||||
// PDFTron Rect *needs* lower left and upper right coordinates to calculate width and height correctly, even though the documentation states otherwise
|
||||
Point2D.Double lowerLeft = new Point2D.Double(Math.min(p1.x, p2.x), Math.min(p1.y, p2.y));
|
||||
Point2D.Double upperRight = new Point2D.Double(Math.max(p1.x, p2.x), Math.max(p1.y, p2.y));
|
||||
|
||||
return new Rect(lowerLeft.x, lowerLeft.y, upperRight.x, upperRight.y);
|
||||
}
|
||||
|
||||
|
||||
private RectCollection toRectCollection(List<Rectangle2D> rectangleList) {
|
||||
|
||||
RectCollection rectCollection = new RectCollection();
|
||||
rectangleList.forEach(r -> {
|
||||
try {
|
||||
rectCollection.addRect(new Rect(r.getMinX(), r.getMinY(), r.getMaxX(), r.getMaxY()));
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
return rectCollection;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<Rectangle2D> toSortedRectangleList(RectCollection rectCollection) {
|
||||
|
||||
List<Rectangle2D> list = new LinkedList<>();
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
Rect r = rectCollection.getRectAt(i);
|
||||
list.add(new Rectangle2D.Double(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()));
|
||||
}
|
||||
list.sort(Comparator.comparingDouble(RectangularShape::getMinY).thenComparing(RectangularShape::getMinX));
|
||||
return list;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,196 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import io.micrometer.core.annotation.Timed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class OCRService {
|
||||
|
||||
public static final String ENGLISH = "eng";
|
||||
|
||||
private final FileStorageService fileStorageService;
|
||||
private final OcrServiceSettings settings;
|
||||
|
||||
private final RabbitTemplate rabbitTemplate;
|
||||
|
||||
private final WatermarkRemovalService watermarkRemovalService;
|
||||
|
||||
private final InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
private final ImagePositionRetrievalService imagePositionRetrievalService;
|
||||
|
||||
|
||||
/**
|
||||
* First loads the PDF Document from storage.
|
||||
* Then removes all invisible Elements from the PDF, check InvisibleElementRemovalService for details.
|
||||
* Then gets Image Position Information, check ImagePositionRetrievalService for details.
|
||||
* Then runs OCR page by page, exclusively on pages which have images on them. It does so, by creating a new PDFDoc and inserting a single page at a time.
|
||||
* This is because PDFTron OCROptions overlays all regions where OCR should not be run with white images. It does not check for empty pages.
|
||||
* For Documents with many pages but few Images this results in major performance improvements.
|
||||
* It then re-adds the OCRed Pages to the original document and saves it.
|
||||
*
|
||||
* @param dossierId The dossier id
|
||||
* @param fileId The file id
|
||||
* @param out OutputStream to write the file to
|
||||
*/
|
||||
@Timed("redactmanager_runOcrOnDocument")
|
||||
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) throws IOException {
|
||||
|
||||
ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream();
|
||||
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
|
||||
try {
|
||||
if (settings.isRemoveWatermark()) {
|
||||
watermarkRemovalService.removeWatermarks(fileStream, transferOutputStream);
|
||||
fileStream.close();
|
||||
fileStream = new ByteArrayInputStream(transferOutputStream.toByteArray());
|
||||
transferOutputStream.close();
|
||||
transferOutputStream = new ByteArrayOutputStream();
|
||||
}
|
||||
|
||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||
|
||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
runOcr(transferInputStream, out, fileId);
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||
}
|
||||
|
||||
} finally {
|
||||
fileStream.close();
|
||||
transferOutputStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void runOcr(InputStream fileStream, OutputStream out, String fileId) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).build());
|
||||
|
||||
// Optimization:
|
||||
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
|
||||
// So, we need to remove pages without images.
|
||||
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
|
||||
// Therefore, we create a new Document with a single page for every page that contains an image.
|
||||
// For some reason, if we insert the OCR singlePageDoc into the original PDFDoc inside the loop, for some documents, the RAM Usage increases exponentially with every page.
|
||||
// This is why, we replace the OCRed Pages outside the main loop.
|
||||
int numProcessedPages = 0;
|
||||
Map<Integer, PDFDoc> pageIdToSinglePagePdfDoc = new HashMap<>();
|
||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||
try {
|
||||
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
|
||||
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
|
||||
|
||||
log.info("{}/{} Page {} done, OCR regions {}",
|
||||
numProcessedPages,
|
||||
pageIdToRectCollection.size(),
|
||||
pageId,
|
||||
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
||||
|
||||
pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc);
|
||||
++numProcessedPages;
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).numberOfOCRedPages(numProcessedPages).build());
|
||||
|
||||
} catch (PDFNetException e) {
|
||||
log.error("Failed to process page {}", pageId);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size());
|
||||
pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc));
|
||||
Optimizer.optimize(pdfDoc);
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).numberOfOCRedPages(numProcessedPages).ocrFinished(true).build());
|
||||
|
||||
try {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
log.error("Processed File with fileId {} could not be saved", fileId);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
pdfDoc.close();
|
||||
}
|
||||
|
||||
|
||||
private void processOcr(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId, PDFDoc singlePagePdfDoc) throws PDFNetException {
|
||||
|
||||
OCROptions options = new OCROptions();
|
||||
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
|
||||
options.addLang(ENGLISH);
|
||||
options.addDPI(settings.getOcrDPI());
|
||||
|
||||
OCRModule.processPDF(singlePagePdfDoc, options);
|
||||
}
|
||||
|
||||
|
||||
private static PDFDoc extractSinglePagePdfDoc(PDFDoc pdfDoc, Integer pageId) throws PDFNetException {
|
||||
|
||||
PDFDoc singlePagePdfDoc = new PDFDoc();
|
||||
Page page = pdfDoc.getPage(pageId);
|
||||
page.setMediaBox(page.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron, see TestFile MediaBoxBiggerThanCropBox.pdf
|
||||
singlePagePdfDoc.pagePushBack(page);
|
||||
return singlePagePdfDoc;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc singlePagePdfDoc) {
|
||||
|
||||
Page ocrPage = singlePagePdfDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
|
||||
singlePagePdfDoc.close();
|
||||
}
|
||||
|
||||
|
||||
private static StringBuilder getAllOcrTextZonesAsString(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId) throws PDFNetException {
|
||||
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
|
||||
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
|
||||
zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
return zonesString;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,102 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration.X_ERROR_INFO_HEADER;
|
||||
import static com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
|
||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||
import org.springframework.amqp.core.Message;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo;
|
||||
|
||||
import feign.FeignException;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class OcrMessageReceiver {
|
||||
|
||||
private final FileStorageService fileStorageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||
private final OCRService ocrService;
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
||||
public void receiveOcr(Message in) throws IOException {
|
||||
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class);
|
||||
|
||||
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
|
||||
try {
|
||||
setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
|
||||
if (!fileStorageService.untouchedFileExists(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId())) {
|
||||
byte[] originalFile = fileStorageService.getOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
|
||||
}
|
||||
|
||||
try (var transferStream = new ByteArrayOutputStream()) {
|
||||
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream);
|
||||
try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) {
|
||||
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
} catch (Exception e) {
|
||||
log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(X_ERROR_INFO_HEADER, e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = MessagingConfiguration.OCR_DLQ, concurrency = "1")
|
||||
public void receiveOcrDQL(Message failedMessage) throws IOException {
|
||||
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(failedMessage.getBody(), DocumentRequest.class);
|
||||
log.info("OCR DQL received: {}", ocrRequestMessage);
|
||||
String errorMessage = failedMessage.getMessageProperties().getHeader(X_ERROR_INFO_HEADER);
|
||||
OffsetDateTime timestamp = failedMessage.getMessageProperties().getHeader(X_ERROR_INFO_TIMESTAMP_HEADER);
|
||||
timestamp = timestamp != null ? timestamp : OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS);
|
||||
fileStatusProcessingUpdateClient.ocrFailed(ocrRequestMessage.getDossierId(),
|
||||
ocrRequestMessage.getFileId(),
|
||||
new FileErrorInfo(errorMessage, MessagingConfiguration.OCR_DLQ, "ocr-service", timestamp));
|
||||
}
|
||||
|
||||
|
||||
private void setStatusOcrProcessing(String dossierId, String fileId) {
|
||||
|
||||
try {
|
||||
fileStatusProcessingUpdateClient.ocrProcessing(dossierId, fileId);
|
||||
} catch (FeignException e) {
|
||||
if (e.status() == HttpStatus.CONFLICT.value()) {
|
||||
throw new AmqpRejectAndDontRequeueException(e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,125 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class WatermarkRemovalService {
|
||||
|
||||
/**
|
||||
* !!!Warning!! This logic is definitive wrong and should NEVER run in production,
|
||||
* however it was used in second DocuMine (SCM) prototype and we currently need it to compare the results.
|
||||
*
|
||||
* @param pdfFile the file as Inputstream.
|
||||
* @param transferOutputStream the resulting file as Outputstream.
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeWatermarks(InputStream pdfFile, OutputStream transferOutputStream) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
this.execute(pdfDoc);
|
||||
|
||||
try {
|
||||
pdfDoc.save(transferOutputStream, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception var10) {
|
||||
log.error("File could not be saved after watermark removal");
|
||||
throw new RuntimeException(var10);
|
||||
} finally {
|
||||
pdfDoc.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void execute(PDFDoc pdfDoc) {
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader();
|
||||
Set<Integer> visited = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
Page page = iterator.next();
|
||||
removeOverlapText(page, reader, writer, visited);
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
writer.destroy();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {
|
||||
|
||||
visited.add((int) page.getSDFObj().getObjNum());
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(reader, writer, visited, false);
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> processImage(element, writer, isInForm);
|
||||
case Element.e_form -> processForm(reader, writer, element, visited);
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {
|
||||
|
||||
writer.writeElement(element);
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visited.contains((int) formObj.getObjNum())) {
|
||||
visited.add((int) formObj.getObjNum());
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
reader.formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
|
||||
processElements(reader, formWriter, visited, true);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processImage(Element element, ElementWriter writer, boolean isInForm) {
|
||||
|
||||
// !!! Warning, this will also remove none watermark images form files.
|
||||
// Idea: Remove watermarks by comparing (hash values) images. Watermarks to remove should be uploaded in dossier/dossierTemplate.
|
||||
// Removing watermarks should be done in preprocessing, not at ocr.
|
||||
if (!isInForm) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.settings;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@ConfigurationProperties("ocr-service")
|
||||
public class OcrServiceSettings {
|
||||
|
||||
private int ocrDPI = 300;
|
||||
private boolean removeWatermark;
|
||||
|
||||
}
|
||||
@ -1,133 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
|
||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
protected ObjectMapper objectMapper;
|
||||
|
||||
@Autowired
|
||||
private OCRService ocrService;
|
||||
|
||||
@Autowired
|
||||
private PrometheusMeterRegistry registry;
|
||||
|
||||
@BeforeEach
|
||||
@SneakyThrows
|
||||
public void assertOCRModuleIsLoaded() {
|
||||
assert OCRModule.isModuleAvailable();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOCRMetrics() {
|
||||
|
||||
testOCR("Watermark");
|
||||
testOCR("Watermark");
|
||||
testOCR("Watermark");
|
||||
|
||||
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
|
||||
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
|
||||
PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get();
|
||||
assertThat(timer.count()).isEqualTo(3);
|
||||
assertThat(timer.mean(TimeUnit.SECONDS)).isGreaterThan(0.1);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOcr() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("StitchedImagesMultiPage");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testManyRotatedImages() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("manyRotatedImages");
|
||||
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testMergeImages() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("merge_images");
|
||||
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
|
||||
"Control",
|
||||
"mg/g day",
|
||||
"10 mg/kg/day",
|
||||
"20 mg/kg/",
|
||||
"Days",
|
||||
"50",
|
||||
"-200",
|
||||
"—250",
|
||||
"150",
|
||||
"200",
|
||||
"250",
|
||||
"—150");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOCRWatermark() {
|
||||
|
||||
assertThat(testOCR("Watermark")).contains("syngenta");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOCRInvisibleText() {
|
||||
|
||||
String text = testOCR("InvisibleText");
|
||||
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE");
|
||||
assertThat(text).doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance");
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private String testOCR(String fileName) {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
|
||||
try (var fileStream = pdfFileResource.getInputStream()) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
|
||||
}
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
ocrService.runOcrOnDocument("dossier", "file", out);
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
}
|
||||
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
return extractAllTextFromDocument(fileStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,165 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawGrid;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawRectCollection;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
|
||||
class ImagePositionRetrievalServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private ImagePositionRetrievalService imagePositionRetrievalService;
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImagePositionRetrievalForRotateTestFileWithImages() {
|
||||
|
||||
String fileName = "RotateTestFileWithImages";
|
||||
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
|
||||
assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
|
||||
new int[]{54, 279, 301, 428},
|
||||
new int[]{360, 173, 509, 419},
|
||||
new int[]{362, 522, 511, 768},
|
||||
new int[]{459, 354, 608, 600},
|
||||
new int[]{145, 404, 392, 553},
|
||||
new int[]{151, 111, 398, 260},
|
||||
new int[]{457, 5, 606, 251},
|
||||
new int[]{395, 480, 545, 726},
|
||||
new int[]{393, 130, 542, 377},
|
||||
new int[]{88, 236, 334, 386},
|
||||
new int[]{82, 530, 328, 679},
|
||||
new int[]{465, 11, 614, 257},
|
||||
new int[]{159, 117, 406, 266},
|
||||
new int[]{467, 360, 617, 607},
|
||||
new int[]{153, 410, 400, 559});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImagePositionRetrievalForRotateTestFileWithImagesExtremeCropbox() {
|
||||
|
||||
String fileName = "RotateTestFileWithImagesExtremeCropbox";
|
||||
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
|
||||
new int[]{362, 522, 511, 768},
|
||||
new int[]{360, 173, 509, 419},
|
||||
new int[]{54, 279, 301, 428},
|
||||
new int[]{145, 192, 392, 341},
|
||||
new int[]{459, 142, 608, 388},
|
||||
new int[]{457, -207, 606, 39},
|
||||
new int[]{151, -101, 398, 48},
|
||||
new int[]{-30, 238, 216, 387},
|
||||
new int[]{283, 188, 433, 434},
|
||||
new int[]{281, -162, 430, 85},
|
||||
new int[]{-24, -56, 222, 94},
|
||||
new int[]{-39, 410, 208, 559},
|
||||
new int[]{275, 360, 425, 607},
|
||||
new int[]{273, 11, 422, 257},
|
||||
new int[]{-33, 117, 214, 266});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMergeImages() {
|
||||
|
||||
String fileName = "merge_images";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords).contains(new int[]{90, 284, 398, 770});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testStitchedImagesMultiPage() {
|
||||
|
||||
String fileName = "StitchedImagesMultiPage";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords.size()).isEqualTo(48);
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testEveryPointInDashedLineIsImage() {
|
||||
String fileName = "everyPointInDashedLineIsImage";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords.size()).isEqualTo(0);
|
||||
}
|
||||
|
||||
|
||||
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
||||
|
||||
try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
|
||||
try {
|
||||
writer.begin(pdfDoc.getPage(pageId));
|
||||
drawRectCollection(writer, rectCollection);
|
||||
drawGrid(writer, pdfDoc.getPage(pageId));
|
||||
writer.end();
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
|
||||
var r = rectCollection.getRectAt(j);
|
||||
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
|
||||
// Check visually for red Rectangles to match images in the saved pdf file
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
|
||||
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||
}
|
||||
pdfDoc.close();
|
||||
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
|
||||
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
|
||||
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<int[]> toRoundedCoordinateArrayList(RectCollection rectCollection) {
|
||||
|
||||
List<int[]> coords = new ArrayList<>(rectCollection.getNumRects());
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
var r = rectCollection.getRectAt(i);
|
||||
coords.add(new int[]{(int) Math.round(r.getX1()), (int) Math.round(r.getY1()), (int) Math.round(r.getX2()), (int) Math.round(r.getY2())});
|
||||
}
|
||||
return coords;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,51 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testRemoveInvisibleText() {
|
||||
|
||||
String fileName = "InvisibleText";
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
|
||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false);
|
||||
}
|
||||
|
||||
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true);
|
||||
}
|
||||
|
||||
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
|
||||
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
|
||||
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
String[] text = extractAllTextFromDocument(fileStream).split("\n");
|
||||
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user