Compare commits
375 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef23ee0ade | ||
|
|
af31f52b47 | ||
|
|
b5152112ee | ||
|
|
85ea4ef455 | ||
|
|
01f8c01fff | ||
|
|
0b6a292c75 | ||
|
|
e24020589c | ||
|
|
c619b845e8 | ||
|
|
ed0371ca11 | ||
|
|
89b5be8d67 | ||
|
|
077ce60c9d | ||
|
|
ab171be6e2 | ||
|
|
664b47b4c3 | ||
|
|
8005c1f25f | ||
|
|
42185a95a0 | ||
|
|
51b42efaf6 | ||
|
|
6a50d45947 | ||
|
|
073ac12cf7 | ||
|
|
84b054a4cc | ||
|
|
905b65a5fa | ||
|
|
7617c1f308 | ||
|
|
2b3936c09b | ||
|
|
6e5b1f1978 | ||
|
|
cf846d18bc | ||
|
|
25c46f16ac | ||
|
|
96acefed78 | ||
|
|
366241e6c6 | ||
|
|
7f472ccc52 | ||
|
|
6f807c7d94 | ||
|
|
6e04c15f3d | ||
|
|
1384584e2f | ||
|
|
e58011e111 | ||
|
|
a821570065 | ||
|
|
7ee1f9e360 | ||
|
|
f9b25c8157 | ||
|
|
c90874da7a | ||
|
|
4683c696a5 | ||
|
|
95c02ce3cf | ||
|
|
b2d62e32fe | ||
|
|
65c1f03ea3 | ||
|
|
2219519a2b | ||
|
|
af05218e37 | ||
|
|
736f531df3 | ||
|
|
c64445d54b | ||
|
|
af29233b10 | ||
|
|
5f04b45554 | ||
|
|
6c41533f0b | ||
|
|
9d2596e5ef | ||
|
|
e7b01161ac | ||
|
|
7b073eb4f3 | ||
|
|
4b0c041d84 | ||
|
|
6c7442ac6d | ||
|
|
23e23328ee | ||
|
|
9d1ffdd779 | ||
|
|
3109a30ae1 | ||
|
|
fe2ed1807e | ||
|
|
31de229fa5 | ||
|
|
8a80abfff1 | ||
|
|
7c08905eda | ||
|
|
4f40c9dbc9 | ||
|
|
32381b4472 | ||
|
|
469da38952 | ||
|
|
0f8c4674b3 | ||
|
|
8e165a41d7 | ||
|
|
ed7a701ad9 | ||
|
|
393103e074 | ||
|
|
bd02066e2c | ||
|
|
fec19f4afb | ||
|
|
c726a643f0 | ||
|
|
519e95735c | ||
|
|
b52af2637f | ||
|
|
46ea7edc4c | ||
|
|
9650195afd | ||
|
|
ce628a99f7 | ||
|
|
b66afe135c | ||
|
|
dc892d0fec | ||
|
|
af45f2cd8c | ||
|
|
befb6b1df6 | ||
|
|
61efb4cae9 | ||
|
|
4a06059258 | ||
|
|
292e5b215e | ||
|
|
7c2db6c3c5 | ||
|
|
4395074b21 | ||
|
|
8e14b74da2 | ||
|
|
3b91639ea9 | ||
|
|
c5178ea5c2 | ||
|
|
cf39d4dfcc | ||
|
|
bb40345f79 | ||
|
|
e3e9d16145 | ||
|
|
f6ca5a3c17 | ||
|
|
15e3dced35 | ||
|
|
933054b332 | ||
|
|
ab86714cb3 | ||
|
|
8626b106d0 | ||
|
|
52e948e66c | ||
|
|
3b33405cbf | ||
|
|
b2fa14dde2 | ||
|
|
62e07686d7 | ||
|
|
3eb97d614f | ||
|
|
81469413b0 | ||
|
|
2993676a6f | ||
|
|
8e115dcd8a | ||
|
|
173911b840 | ||
|
|
b0ae00aa02 | ||
|
|
00bf9f279e | ||
|
|
d16377a24a | ||
|
|
81179ee744 | ||
|
|
1953b5924f | ||
|
|
6f6e8d5d4e | ||
|
|
69bcd4f68d | ||
|
|
b900cfaf31 | ||
|
|
cdc2081785 | ||
|
|
a9287ec406 | ||
|
|
5b6a706c28 | ||
|
|
28d8ad0a3f | ||
|
|
0c1583c1be | ||
|
|
7633566d9b | ||
|
|
cc4f09711e | ||
|
|
370165dc59 | ||
|
|
8c052c38d7 | ||
|
|
ea18d3d307 | ||
|
|
2726fc3fe1 | ||
|
|
033279e261 | ||
|
|
ec0dd032c9 | ||
|
|
598fa7f1c7 | ||
|
|
65b1f7d179 | ||
|
|
3173610be5 | ||
|
|
e920eb5a78 | ||
|
|
7e4baea7e5 | ||
|
|
66d3433e04 | ||
|
|
a2f559af51 | ||
|
|
39f527a57c | ||
|
|
5c2844fe31 | ||
|
|
b216f02e15 | ||
|
|
2e2f30ba35 | ||
|
|
9f7ed974ec | ||
|
|
570a348a77 | ||
|
|
859dba2ecf | ||
|
|
1c5d755111 | ||
|
|
133e06460f | ||
|
|
da91fcff97 | ||
|
|
79795e408a | ||
|
|
b719db86ab | ||
|
|
797602e373 | ||
|
|
3d2f66cf10 | ||
|
|
e304a9f2d7 | ||
|
|
c05f67cf44 | ||
|
|
9ecf9ca19f | ||
|
|
3a2ee903af | ||
|
|
072a8aa3da | ||
|
|
b5cfa7b63d | ||
|
|
5f5a6258c5 | ||
|
|
ac0e83725a | ||
|
|
5d33ad570e | ||
|
|
fd698a78fc | ||
|
|
c3edeb3c7d | ||
|
|
fc06dba2ce | ||
|
|
b6742c1e89 | ||
|
|
efb1a748af | ||
|
|
9be672c728 | ||
|
|
23985b14be | ||
|
|
48b7a22e2b | ||
|
|
546341ee75 | ||
|
|
0ed1481517 | ||
|
|
b2a47f66ae | ||
|
|
3835d03036 | ||
|
|
a5fcebce30 | ||
|
|
b867deb9f9 | ||
|
|
8648ed0952 | ||
|
|
53f786b539 | ||
|
|
40465e8778 | ||
|
|
a76b2ace3f | ||
|
|
aeaca2f278 | ||
|
|
f1dbcc24a2 | ||
|
|
fda25852d1 | ||
|
|
471fadbcca | ||
|
|
87001090d5 | ||
|
|
ea355429c2 | ||
|
|
6a65d7f9fc | ||
|
|
e935cc7b14 | ||
|
|
07733d0855 | ||
|
|
abb249e966 | ||
|
|
bcd1eb9afa | ||
|
|
60acbac53f | ||
|
|
a3decd292d | ||
|
|
b6f0a21886 | ||
|
|
d61cac8b4f | ||
|
|
ae46c5f1ca | ||
|
|
f0a70a5242 | ||
|
|
15ea385f4d | ||
|
|
08be18db2d | ||
|
|
64209255cb | ||
|
|
4761d2e1a2 | ||
|
|
1916e626df | ||
|
|
e4663ac8db | ||
|
|
6a691183dc | ||
|
|
3dd215288a | ||
|
|
6fb1a0bef3 | ||
|
|
4e7c3f584b | ||
|
|
84bdb4d1ed | ||
|
|
75ab4df592 | ||
|
|
8442e60055 | ||
|
|
0ef67fc07b | ||
|
|
ea02f31a84 | ||
|
|
58acbab85f | ||
|
|
d38d023485 | ||
|
|
c1afe9b11f | ||
|
|
bdcb9aeda4 | ||
|
|
6a86036a78 | ||
|
|
a358d7565e | ||
|
|
069a6c0b49 | ||
|
|
683f7f1fb8 | ||
|
|
7eab3a4088 | ||
|
|
970fc99ed1 | ||
|
|
48c54f63a0 | ||
|
|
20e4e5ddff | ||
|
|
b53930328a | ||
|
|
c947d552d2 | ||
|
|
6b1b5eab84 | ||
|
|
cc9816c8cb | ||
|
|
f256f9b30f | ||
|
|
6167e3fb57 | ||
|
|
a78fb0244a | ||
|
|
8099a00bb6 | ||
|
|
9bb0468b2b | ||
|
|
c4d9c5df02 | ||
|
|
976f408237 | ||
|
|
319268c53d | ||
|
|
014eba9fc3 | ||
|
|
9bd8419770 | ||
|
|
c13ff7fbf6 | ||
|
|
5d3826e9b9 | ||
|
|
0c3194276a | ||
|
|
e302d9784e | ||
|
|
f185b13f2b | ||
|
|
990c376ce6 | ||
|
|
bf6a0d770b | ||
|
|
f18bda1d4e | ||
|
|
0a11992361 | ||
|
|
456b8fe4a1 | ||
|
|
9778ece992 | ||
|
|
8bd0de6263 | ||
|
|
5c1708f97f | ||
|
|
a35d77be2e | ||
|
|
631160eb22 | ||
|
|
8e7e588d26 | ||
|
|
ac850c2626 | ||
|
|
1d765a6baa | ||
|
|
c55984aa67 | ||
|
|
27aa418029 | ||
|
|
c4edff4696 | ||
|
|
92fd1a72de | ||
|
|
0d3d25e7d7 | ||
|
|
956fbff872 | ||
|
|
2488009af1 | ||
|
|
16be2467fd | ||
|
|
f4cae8a7dc | ||
|
|
dfc23955d7 | ||
|
|
d6e3d6fe22 | ||
|
|
bef23e38b5 | ||
|
|
65ab7a1912 | ||
|
|
d80231e4a9 | ||
|
|
56c07a4491 | ||
|
|
0b4ad29dcb | ||
|
|
0ad0cd45d6 | ||
|
|
d659fe7234 | ||
|
|
cb9127b4f3 | ||
|
|
05523585c0 | ||
|
|
4ced572949 | ||
|
|
79239b751d | ||
|
|
f146beeb44 | ||
|
|
f8a4ccfff0 | ||
|
|
a6ba501fa8 | ||
|
|
7dfb3b2b52 | ||
|
|
c324d3815e | ||
|
|
74f55a5cbf | ||
|
|
e7bf607663 | ||
|
|
f4d789311c | ||
|
|
9817eae897 | ||
|
|
477f6af886 | ||
|
|
2c171b6a9e | ||
|
|
71477dabde | ||
|
|
a927cbd9dc | ||
|
|
a1521877d7 | ||
|
|
f4b6386e1c | ||
|
|
1d64028158 | ||
|
|
0979a267d4 | ||
|
|
cc77d19500 | ||
|
|
fa048b2fe0 | ||
|
|
bdf1161c91 | ||
|
|
b4a225144d | ||
|
|
903b1c1fd4 | ||
|
|
c3e7582ee3 | ||
|
|
cfc5db45cd | ||
|
|
fbd0196719 | ||
|
|
3c9049dc8a | ||
|
|
015984891f | ||
|
|
66fcb62833 | ||
|
|
48824f56a8 | ||
|
|
785628537f | ||
|
|
23eb0c40a3 | ||
|
|
1b4aaf4454 | ||
|
|
e4f3557b36 | ||
|
|
9be3c86297 | ||
|
|
88855de2da | ||
|
|
368a75e985 | ||
|
|
12344d57b2 | ||
|
|
9e854379e7 | ||
|
|
b779c72041 | ||
|
|
760a809900 | ||
|
|
ba1c7c07ab | ||
|
|
ca0cbbcb49 | ||
|
|
da2cdc288e | ||
|
|
68da328889 | ||
|
|
711548d1a7 | ||
|
|
2bddcdafee | ||
|
|
750ccf4ce2 | ||
|
|
57b5d3f48e | ||
|
|
d8c9659469 | ||
|
|
30f060e36c | ||
|
|
53a5824e6c | ||
|
|
e2bcf971c9 | ||
|
|
dacc2f7f43 | ||
|
|
144a9591a2 | ||
|
|
207d9dec97 | ||
|
|
09ee90222e | ||
|
|
1316a067fe | ||
|
|
e203210ade | ||
|
|
b25d46291a | ||
|
|
84148d3b6e | ||
|
|
a6ba66b1aa | ||
|
|
c3e69b2cdf | ||
|
|
f69331e7d8 | ||
|
|
01493dc033 | ||
|
|
459e0c8be7 | ||
|
|
1b1f777706 | ||
|
|
0e0a811f9d | ||
|
|
efa3d75479 | ||
|
|
9abdc6d44d | ||
|
|
3bab61c446 | ||
|
|
d17517d3c3 | ||
|
|
567cbc178b | ||
|
|
3c53772765 | ||
|
|
8647cf5a18 | ||
|
|
310c07b200 | ||
|
|
daba0bf8a6 | ||
|
|
3839de215c | ||
|
|
b4d68594f1 | ||
|
|
99ed331a1e | ||
|
|
f2c0991987 | ||
|
|
b8ef55e6e2 | ||
|
|
5792ff4a93 | ||
|
|
621c3f269d | ||
|
|
8dba392904 | ||
|
|
306a53ea79 | ||
|
|
754fd8f933 | ||
|
|
28ec4c9ccb | ||
|
|
aed4a55787 | ||
|
|
f87e2d75b5 | ||
|
|
de6760abc1 | ||
|
|
261ef4c367 | ||
|
|
11ba9c6bb9 | ||
|
|
b7c3d02978 | ||
|
|
bcf0bcbaf4 | ||
|
|
84cde2a3db | ||
|
|
6f2dd4f823 | ||
|
|
a909724217 | ||
|
|
67a981e7a8 | ||
|
|
0e93fdd515 | ||
|
|
d464239f9b | ||
|
|
88a20924b9 | ||
|
|
f89243472c | ||
|
|
ad3612acd4 | ||
|
|
630eee6bd7 | ||
|
|
a951911ec8 |
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@ -42,3 +42,5 @@ gradlew.bat
|
||||
gradlew
|
||||
gradle.properties
|
||||
gradle/
|
||||
.DS_Store
|
||||
.DS_Store/
|
||||
|
||||
@ -1,3 +1,7 @@
|
||||
variables:
|
||||
# SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
|
||||
GIT_SUBMODULE_STRATEGY: recursive
|
||||
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
||||
include:
|
||||
- project: 'gitlab/gitlab'
|
||||
ref: 'main'
|
||||
@ -10,12 +14,13 @@ deploy:
|
||||
script:
|
||||
- echo "Building with gradle version ${BUILDVERSION}"
|
||||
- gradle -Pversion=${BUILDVERSION} publish
|
||||
- gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
|
||||
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
|
||||
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
|
||||
artifacts:
|
||||
reports:
|
||||
dotenv: version.env
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
|
||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||
- if: $CI_COMMIT_TAG
|
||||
- if: $CI_COMMIT_TAG
|
||||
|
||||
8
.gitmodules
vendored
Normal file
8
.gitmodules
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
|
||||
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
|
||||
update = merge
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
|
||||
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
|
||||
update = merge
|
||||
88
README.md
88
README.md
@ -1 +1,89 @@
|
||||
# PDF Layout Parser Micro-Service: layout-parser
|
||||
|
||||
## Introduction
|
||||
The layout-parser micro-service is a powerful tool designed to efficiently extract structured information from PDF documents. Written in Java and utilizing Spring Boot 3, Apache PDFBox, and RabbitMQ, this micro-service excels at parsing PDFs and organizing their content into a meaningful and coherent layout structure. Notably, the layout-parser micro-service distinguishes itself by relying solely on advanced algorithms, rather than machine learning techniques.
|
||||
|
||||
### Key Steps in the PDF Layout Parsing Process:
|
||||
|
||||
* **Text Position Extraction:**
|
||||
The micro-service leverages Apache PDFBox to extract precise text positions for each individual character within the PDF document.
|
||||
|
||||
* **Word Segmentation and Text Block Formation:**
|
||||
Employing an array of diverse algorithms, the micro-service initially identifies and segments words, creating distinct text blocks.
|
||||
|
||||
* **Text Block Classification:**
|
||||
The segmented text blocks are then subjected to classification algorithms. These algorithms categorize the text blocks based on their content and visual properties, distinguishing between sections, subsections, headlines, paragraphs, images, tables, table cells, headers, and footers.
|
||||
|
||||
* **Layout Coherence Establishment:**
|
||||
The classified text blocks are subsequently orchestrated into a cohesive layout structure. This process involves arranging sections, subsections, paragraphs, images, and other elements in a logical and structured manner.
|
||||
|
||||
* **Output Generation in Various Formats:**
|
||||
Once the layout structure is established, the micro-service generates output in multiple formats. These formats are designed for seamless integration with downstream micro-services. The supported formats include JSON, XML, and others, ensuring flexibility in downstream data consumption.
|
||||
|
||||
### Optional Enhancements:
|
||||
|
||||
* **ML-Based Table Extraction:**
|
||||
For enhanced results, users have the option to incorporate machine learning-based table extraction. This feature can be activated by providing ML-generated results as a JSON file, which are then integrated seamlessly into the layout structure.
|
||||
|
||||
* **Image Classification using ML:**
|
||||
Additionally, for more accurate image classification, users can optionally feed ML-generated image classification results into the micro-service. Similar to the table extraction option, the micro-service processes the pre-parsed results in JSON format, thus optimizing the accuracy of image content identification.
|
||||
|
||||
In conclusion, the layout-parser micro-service is a versatile PDF layout parsing solution crafted entirely around advanced algorithms, without reliance on machine learning. It proficiently extracts text positions, segments content into meaningful blocks, classifies these blocks, arranges them coherently, and outputs structured data for downstream micro-services. Optional integration with ML-generated table extractions and image classifications further enhances its capabilities.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Before building and using the layout-parser micro-service, please ensure you have the following software and tools installed:
|
||||
|
||||
Java Development Kit (JDK) 17 or later
|
||||
Gradle build tool (preinstalled)
|
||||
Build and Test
|
||||
To build and test the micro-service, follow these steps:
|
||||
|
||||
### Clone the Repository:
|
||||
|
||||
bash
|
||||
```
|
||||
git clone ssh://git@git.knecon.com:22222/fforesight/layout-parser.git
|
||||
cd layout-parser
|
||||
```
|
||||
### Build the Project:
|
||||
Use the following command to build the project using Gradle:
|
||||
|
||||
```
|
||||
gradle clean build
|
||||
```
|
||||
### Run Tests:
|
||||
Run the test suite using the following command:
|
||||
```
|
||||
gradle test
|
||||
```
|
||||
## Building a Custom Docker Image
|
||||
To create a custom Docker image for the layout-parser micro-service, execute the provided script:
|
||||
|
||||
### Ensure Docker is Installed:
|
||||
Ensure that Docker is installed and running on your system.
|
||||
|
||||
### Run the Image Building Script:
|
||||
Execute the publish-custom-image script in the project directory:
|
||||
|
||||
```
|
||||
./publish-custom-image
|
||||
```
|
||||
## Publishing to Internal Maven Repository
|
||||
To publish the layout-parser micro-service to your internal Maven repository, execute the following command:
|
||||
|
||||
```
|
||||
gradle -Pversion=buildVersion publish
|
||||
```
|
||||
Replace buildVersion with the desired version number.
|
||||
|
||||
## Additional Notes
|
||||
Make sure to configure any necessary application properties before deploying the micro-service.
|
||||
For advanced usage and configurations, refer to Kilian or Dom or preferably the source code.
|
||||
|
||||
@ -8,6 +8,8 @@ plugins {
|
||||
|
||||
group = "com.knecon.fforesight"
|
||||
|
||||
val documentVersion by rootProject.extra { "4.433.0" }
|
||||
|
||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||
java.targetCompatibility = JavaVersion.VERSION_17
|
||||
|
||||
@ -24,6 +26,8 @@ tasks.named<Test>("test") {
|
||||
reports {
|
||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||
}
|
||||
minHeapSize = "512m"
|
||||
maxHeapSize = "2048m"
|
||||
}
|
||||
|
||||
tasks.test {
|
||||
@ -40,6 +44,19 @@ tasks.jacocoTestReport {
|
||||
}
|
||||
|
||||
allprojects {
|
||||
|
||||
tasks.withType<Javadoc> {
|
||||
options {
|
||||
this as StandardJavadocDocletOptions
|
||||
addBooleanOption("Xdoclint:none", true)
|
||||
addStringOption("Xmaxwarns", "1")
|
||||
}
|
||||
}
|
||||
|
||||
pmd {
|
||||
setConsoleOutput(true)
|
||||
}
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
create<MavenPublication>(name) {
|
||||
@ -62,6 +79,7 @@ java {
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
|
||||
repositories {
|
||||
mavenLocal()
|
||||
mavenCentral()
|
||||
|
||||
@ -9,12 +9,13 @@
|
||||
</description>
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="DataflowAnomalyAnalysis"/>
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
</ruleset>
|
||||
|
||||
|
||||
@ -10,14 +10,14 @@
|
||||
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="DataflowAnomalyAnalysis"/>
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||
<exclude name="AvoidFieldNameMatchingTypeName"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="TestClassWithoutTestCases"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
</ruleset>
|
||||
|
||||
@ -1,6 +1,10 @@
|
||||
plugins {
|
||||
id("com.knecon.fforesight.java-conventions")
|
||||
id("io.freefair.lombok") version "8.2.2"
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
|
||||
description = "layoutparser-service-internal-api"
|
||||
|
||||
dependencies {
|
||||
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
|
||||
}
|
||||
|
||||
@ -1,20 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DocumentData {
|
||||
|
||||
DocumentPage[] documentPages;
|
||||
DocumentTextData[] documentTextData;
|
||||
DocumentPositionData[] documentPositions;
|
||||
DocumentStructure documentStructure;
|
||||
|
||||
}
|
||||
@ -1,22 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class DocumentPage {
|
||||
|
||||
int number;
|
||||
int height;
|
||||
int width;
|
||||
int rotation;
|
||||
|
||||
}
|
||||
@ -1,21 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class DocumentPositionData {
|
||||
|
||||
Long id;
|
||||
int[] stringIdxToPositionIdx;
|
||||
float[][] positions;
|
||||
|
||||
}
|
||||
@ -1,127 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class DocumentStructure {
|
||||
|
||||
EntryData root;
|
||||
|
||||
|
||||
public static class TableProperties {
|
||||
|
||||
public static final String NUMBER_OF_ROWS = "numberOfRows";
|
||||
public static final String NUMBER_OF_COLS = "numberOfCols";
|
||||
|
||||
}
|
||||
|
||||
public static class ImageProperties {
|
||||
|
||||
public static final String TRANSPARENT = "transparent";
|
||||
public static final String IMAGE_TYPE = "imageType";
|
||||
public static final String POSITION = "position";
|
||||
public static final String ID = "id";
|
||||
|
||||
}
|
||||
|
||||
public static class TableCellProperties {
|
||||
|
||||
public static final String B_BOX = "bBox";
|
||||
public static final String ROW = "row";
|
||||
public static final String COL = "col";
|
||||
public static final String HEADER = "header";
|
||||
|
||||
}
|
||||
|
||||
public static final String RECTANGLE_DELIMITER = ";";
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER)).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
public EntryData get(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
EntryData entry = root.children.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<EntryData> streamAllEntries() {
|
||||
|
||||
return Stream.concat(Stream.of(root), root.children.stream()).flatMap(DocumentStructure::flatten);
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<EntryData> flatten(EntryData entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentStructure::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public static class EntryData {
|
||||
|
||||
NodeType type;
|
||||
int[] treeId;
|
||||
Long[] atomicBlockIds;
|
||||
Long[] pageNumbers;
|
||||
Map<String, String> properties;
|
||||
List<EntryData> children;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("[");
|
||||
for (int i : treeId) {
|
||||
sb.append(i);
|
||||
sb.append(",");
|
||||
}
|
||||
sb.delete(sb.length() - 1, sb.length());
|
||||
sb.append("]: ");
|
||||
|
||||
sb.append(type);
|
||||
sb.append(" atbs = ");
|
||||
sb.append(atomicBlockIds.length);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,27 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class DocumentTextData {
|
||||
|
||||
Long id;
|
||||
Long page;
|
||||
String searchText;
|
||||
int numberOnPage;
|
||||
int start;
|
||||
int end;
|
||||
int[] lineBreaks;
|
||||
|
||||
}
|
||||
@ -1,21 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
public enum NodeType {
|
||||
DOCUMENT,
|
||||
SECTION,
|
||||
HEADLINE,
|
||||
PARAGRAPH,
|
||||
TABLE,
|
||||
TABLE_CELL,
|
||||
IMAGE,
|
||||
HEADER,
|
||||
FOOTER;
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
|
||||
}
|
||||
}
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
@ -9,9 +10,12 @@ import lombok.NoArgsConstructor;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure Section class.")
|
||||
public class SimplifiedSectionText {
|
||||
|
||||
private int sectionNumber;
|
||||
@Schema(description = "The number of this Section. This is used to map the simplified section text back to the original Section.")
|
||||
private String sectionNumber;
|
||||
@Schema(description = "The text in this Section.")
|
||||
private String text;
|
||||
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
@ -12,9 +13,22 @@ import lombok.NoArgsConstructor;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure.")
|
||||
public class SimplifiedText {
|
||||
|
||||
@Schema(description = "Number of pages in the entire document.")
|
||||
private int numberOfPages;
|
||||
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
|
||||
@Builder.Default
|
||||
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
|
||||
@Schema(description = "A list of the main section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> mainSectionNumbers = new ArrayList<>();
|
||||
@Schema(description = "A list of the header section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> headerSectionNumbers = new ArrayList<>();
|
||||
@Schema(description = "A list of the footer section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> footerSectionNumbers = new ArrayList<>();
|
||||
|
||||
}
|
||||
|
||||
@ -2,20 +2,29 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer.")
|
||||
public class ParagraphData {
|
||||
|
||||
@Schema(description = "The text of this Semantic Node, without any linebreaks.", example = "This is some text.")
|
||||
private String text;
|
||||
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is bold.", example = "[0, 15]")
|
||||
List<Range> boldTextBoundaries;
|
||||
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is italic.", example = "[0, 15]")
|
||||
List<Range> italicTextBoundaries;
|
||||
@Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
|
||||
List<Integer> linebreaks;
|
||||
@Schema(description = "The classification of this Paragraph.", allowableValues = "{paragraph, headline, header, footer}")
|
||||
private String classification;
|
||||
|
||||
@Schema(description = "Describes the text orientation of this semantic node. Any semantic node only has a single text orientation.", allowableValues = "{ZERO, QUARTER_CIRCLE, HALF_CIRCLE, THREE_QUARTER_CIRCLE}")
|
||||
private String orientation;
|
||||
@Schema(description = "Describes the text direction in degrees of this semantic node. Any semantic node only has a single text direction.", minimum = "0", maximum = "359")
|
||||
private int textDirection;
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
|
||||
@Schema(description = "Object specifying the start and end offsets of a text range in string offsets.")
|
||||
public record Range(int start, int end) {
|
||||
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
@ -9,8 +10,12 @@ import lombok.Data;
|
||||
@Builder
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing a simplified version of the document structure. This simplified form only knows Paragraphs and Tables. The Paragraph Objects might be a Paragraph, Headline, Header or Footer.")
|
||||
public class ResearchDocumentData {
|
||||
|
||||
@Schema(description = "File name of the original uploaded file.")
|
||||
String originalFile;
|
||||
@Schema(description = "A List of all paragraphs/headline or table objects, that have been parsed in this document.")
|
||||
List<StructureObject> structureObjects;
|
||||
|
||||
}
|
||||
|
||||
@ -2,14 +2,19 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing information about a Table Row.")
|
||||
public class RowData {
|
||||
|
||||
@Schema(description = "Boolean indicating whether this table row is classified as a header row.")
|
||||
boolean header;
|
||||
@Schema(description = "A list of Objects containing information about the text in each cell of this row.")
|
||||
List<ParagraphData> cellText;
|
||||
@Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.")
|
||||
float[] bBox;
|
||||
}
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
@ -7,13 +10,22 @@ import lombok.Data;
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing information about either a Paragraph/Headline/Header/Footer or a Table.")
|
||||
public class StructureObject {
|
||||
|
||||
@Schema(description = "The ID of this StructureObject.")
|
||||
Integer structureObjectNumber;
|
||||
@Schema(description = "The Tree ID of this StructureObject.")
|
||||
List<Integer> treeId;
|
||||
@Schema(description = "This value indicates the start of the string offsets in this Object, with respect to the reading order.")
|
||||
int page;
|
||||
@Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.")
|
||||
int stringOffset;
|
||||
@Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.", example = "[100, 100, 50, 50]")
|
||||
float[] boundingBox;
|
||||
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer. Either this or table is null.")
|
||||
ParagraphData paragraph;
|
||||
@Schema(description = "Object containing information about a Table. Either this or paragraph is null.")
|
||||
TableData table;
|
||||
|
||||
}
|
||||
|
||||
@ -2,15 +2,20 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing information about a Table.")
|
||||
public class TableData {
|
||||
|
||||
@Schema(description = "A list of Objects containing information about all rows in this table.")
|
||||
List<RowData> rowData;
|
||||
@Schema(description = "Number of columns in this table.")
|
||||
Integer numberOfCols;
|
||||
@Schema(description = "Number of rows in this table.")
|
||||
Integer numberOfRows;
|
||||
|
||||
}
|
||||
|
||||
@ -2,9 +2,26 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.Builder;
|
||||
|
||||
@Builder
|
||||
public record LayoutParsingFinishedEvent(Map<String, String> identifier, long duration, int numberOfPages, String message) {
|
||||
@Schema(description = "Object containing information about the layout parsing.")
|
||||
public record LayoutParsingFinishedEvent(
|
||||
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
|
||||
Map<String, String> identifier,
|
||||
|
||||
@Schema(description = "The duration of a single layout parsing in ms.") //
|
||||
long duration,
|
||||
|
||||
@Schema(description = "The number of pages of the parsed document.") //
|
||||
int numberOfPages,
|
||||
|
||||
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
|
||||
String message,
|
||||
|
||||
@Schema(description = "The app version of the layout parser.") //
|
||||
String layoutParserVersion
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
public class LayoutParsingQueueNames {
|
||||
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
|
||||
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_dead_letter_queue";
|
||||
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request";
|
||||
public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange";
|
||||
public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
|
||||
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
|
||||
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
|
||||
}
|
||||
|
||||
@ -3,23 +3,45 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.Builder;
|
||||
import lombok.NonNull;
|
||||
|
||||
@Builder
|
||||
@Schema(description = "Object containing all storage paths the service needs to know.")
|
||||
public record LayoutParsingRequest(
|
||||
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
|
||||
@NonNull LayoutParsingType layoutParsingType,
|
||||
|
||||
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
|
||||
Map<String, String> identifier,
|
||||
@NonNull String originFileStorageId,
|
||||
Optional<String> tablesFileStorageId,
|
||||
Optional<String> imagesFileStorageId,
|
||||
@NonNull String structureFileStorageId,
|
||||
String researchDocumentStorageId,
|
||||
@NonNull String textBlockFileStorageId,
|
||||
@NonNull String positionBlockFileStorageId,
|
||||
@NonNull String pageFileStorageId,
|
||||
@NonNull String simplifiedTextStorageId,
|
||||
@NonNull String viewerDocumentStorageId,
|
||||
@NonNull String sectionGridStorageId) {
|
||||
|
||||
@Schema(description = "Path to the original PDF file.")//
|
||||
@NonNull String originFileStorageId,//
|
||||
|
||||
@Schema(description = "Optional Path to the table extraction file.")//
|
||||
Optional<String> tablesFileStorageId,//
|
||||
@Schema(description = "Optional Path to the image classification file.")//
|
||||
Optional<String> imagesFileStorageId,//
|
||||
|
||||
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
|
||||
|
||||
@Schema(description = "Path where the Document Structure File will be stored.")//
|
||||
@NonNull String structureFileStorageId,//
|
||||
@Schema(description = "Path where the Research Data File will be stored.")//
|
||||
String researchDocumentStorageId,//
|
||||
@Schema(description = "Path where the Document Text File will be stored.")//
|
||||
@NonNull String textBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Positions File will be stored.")//
|
||||
@NonNull String positionBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Pages File will be stored.")//
|
||||
@NonNull String pageFileStorageId,//
|
||||
@Schema(description = "Path where the Document Markdown File will be stored.")//
|
||||
Optional<String> documentMarkdownFileStorageId,//
|
||||
@Schema(description = "Path where the Simplified Text File will be stored.")//
|
||||
@NonNull String simplifiedTextStorageId,//
|
||||
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
|
||||
@NonNull String viewerDocumentStorageId
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -2,6 +2,11 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
public enum LayoutParsingType {
|
||||
REDACT_MANAGER,
|
||||
TAAS,
|
||||
DOCUMINE
|
||||
REDACT_MANAGER_OLD,
|
||||
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
DOCUMINE,
|
||||
DOCUMINE_OLD,
|
||||
CLARIFYND,
|
||||
CLARIFYND_PARAGRAPH_DEBUG
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
plugins {
|
||||
id("com.knecon.fforesight.java-conventions")
|
||||
id("io.freefair.lombok") version "8.2.2"
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
|
||||
description = "layoutparser-service-processor"
|
||||
@ -8,19 +8,33 @@ description = "layoutparser-service-processor"
|
||||
val jacksonVersion = "2.15.2"
|
||||
val pdfBoxVersion = "3.0.0"
|
||||
|
||||
|
||||
dependencies {
|
||||
implementation(project(":layoutparser-service-internal-api"))
|
||||
implementation(project(":viewer-doc-processor"))
|
||||
|
||||
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.140.0") {
|
||||
implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
|
||||
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
|
||||
exclude("org.springframework.boot", "spring-boot-starter-security")
|
||||
exclude("org.springframework.boot", "spring-boot-starter-validation")
|
||||
}
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.10.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.38.0")
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
|
||||
exclude("com.iqser.red.commons", "storage-commons")
|
||||
}
|
||||
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||
|
||||
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
|
||||
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:3.1.2")
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
||||
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
||||
implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||
implementation("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
||||
implementation("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
||||
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
||||
implementation("org.commonmark:commonmark:0.22.0")
|
||||
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||
implementation("com.pdftron:PDFNet:10.11.0")
|
||||
implementation("org.apache.commons:commons-text:1.12.0")
|
||||
}
|
||||
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Configuration
|
||||
@ConfigurationProperties("layoutparser")
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutParserSettings {
|
||||
|
||||
boolean debug;
|
||||
LayoutParsingType layoutParsingTypeOverride;
|
||||
}
|
||||
@ -2,146 +2,276 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@SuppressWarnings("PMD.CloseResource")
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutParsingPipeline {
|
||||
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final SectionGridCreatorService sectionGridCreatorService;
|
||||
private final TaasClassificationService taasClassificationService;
|
||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||
private final DocuMineClassificationService docuMineClassificationService;
|
||||
private final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final TaasBlockificationService taasBlockificationService;
|
||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
private final ViewerDocumentService viewerDocumentService;
|
||||
final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
final LayoutParsingStorageService layoutParsingStorageService;
|
||||
final SectionsBuilderService sectionsBuilderService;
|
||||
final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
final RulingCleaningService rulingCleaningService;
|
||||
final TableExtractionService tableExtractionService;
|
||||
final DocuMineBlockificationService docuMineBlockificationService;
|
||||
final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
final BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
final DocstrumBlockificationService docstrumBlockificationService;
|
||||
final LayoutGridService layoutGridService;
|
||||
final ObservationRegistry observationRegistry;
|
||||
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
final GraphicExtractorService graphicExtractorService;
|
||||
final OutlineExtractorService outlineExtractorService;
|
||||
final SectionTreeBuilderService sectionTreeBuilderService;
|
||||
final SectionTreeEnhancementService sectionTreeEnhancementService;
|
||||
final LayoutParserSettings settings;
|
||||
final ClassificationService classificationService;
|
||||
|
||||
@Value("${LAYOUT_PARSER_VERSION:}")
|
||||
private String layoutParserVersion;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
}
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
||||
.orElse(new VisualLayoutParsingResponse());
|
||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||
.map(layoutParsingStorageService::getImagesFile)
|
||||
.orElse(new ImageServiceResponse());
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile)
|
||||
.orElse(new TableServiceResponse());
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
||||
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
visualLayoutParsingResponse,
|
||||
layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
try (var out = new ByteArrayOutputStream()) {
|
||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
||||
}
|
||||
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);
|
||||
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(numberOfPages)
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId()))
|
||||
.build();
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
|
||||
if (layoutParsingRequest.researchDocumentStorageId() != null) {
|
||||
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
if (!viewerDocumentFile.equals(originFile)) {
|
||||
assert !viewerDocumentFile.exists() || viewerDocumentFile.delete();
|
||||
}
|
||||
assert !originFile.exists() || originFile.delete();
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(documentWithVisualization.document().getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.layoutParserVersion(layoutParserVersion)
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
|
||||
|
||||
AtomicReference<DocumentWithVisualization> documentReference = new AtomicReference<>();
|
||||
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||
.contextualName("build-document-graph")
|
||||
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
|
||||
|
||||
return documentReference.get();
|
||||
}
|
||||
|
||||
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout")
|
||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
File originFile,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
TableServiceResponse tableServiceResponse,
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse,
|
||||
Map<String, String> identifier) {
|
||||
|
||||
PDDocument originDocument = openDocument(originFile);
|
||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
|
||||
if (settings.isDebug() || identifier.containsKey("debug")) {
|
||||
classificationDocument.getLayoutDebugLayer().setActive(true);
|
||||
}
|
||||
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
originDocument.setAllSecurityToBeRemoved(true);
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % 100 == 0) {
|
||||
// re-open document every once in a while to save on RAM. This has no significant performance impact.
|
||||
// This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
|
||||
originDocument.close();
|
||||
originDocument = openDocument(originFile);
|
||||
}
|
||||
|
||||
if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
|
||||
log.info("Extracting text on Page {} for {}", pageNumber, identifier);
|
||||
}
|
||||
|
||||
classificationDocument.setPages(classificationPages);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
||||
@ -149,80 +279,173 @@ public class LayoutParsingPipeline {
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
|
||||
stripper.setSortByPosition(true);
|
||||
}
|
||||
stripper.getText(originDocument);
|
||||
List<Word> words = stripper.getWords();
|
||||
|
||||
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
||||
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
||||
words = TextPositionOperations.sortWords(lines);
|
||||
}
|
||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
List<Ruling> rulings = stripper.getRulings();
|
||||
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
|
||||
ImageType.GRAPHIC,
|
||||
false,
|
||||
stripper.getPageNumber(),
|
||||
""))
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS ->
|
||||
taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE ->
|
||||
docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
};
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
|
||||
|
||||
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
|
||||
|
||||
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
|
||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||
if (pdfImages.containsKey(pageNumber)) {
|
||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(cleanRulings, classificationPage, layoutParsingType);
|
||||
if (signatures.containsKey(pageNumber)) {
|
||||
if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
|
||||
classificationPage.setImages(signatures.get(pageNumber));
|
||||
} else {
|
||||
classificationPage.getImages().addAll(signatures.get(pageNumber));
|
||||
}
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
|
||||
classificationPages.add(classificationPage);
|
||||
}
|
||||
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
originDocument.close();
|
||||
|
||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||
|
||||
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
|
||||
classificationDocument.setSectionTree(sectionTree);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||
default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
|
||||
}
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
||||
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
|
||||
private static void updateClassificationPage(PDPage pdPage,
|
||||
PDRectangle pdr,
|
||||
ClassificationPage classificationPage,
|
||||
CleanRulings cleanRulings,
|
||||
int pageNumber,
|
||||
PageInformation pageInformation) {
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth((float) pageInformation.width());
|
||||
classificationPage.setPageHeight((float) pageInformation.height());
|
||||
}
|
||||
|
||||
|
||||
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
|
||||
|
||||
for (TextDirection dir : TextDirection.values()) {
|
||||
double averageRotation = words.stream()
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Character::getTextPosition)
|
||||
.filter(pos -> pos.getDir().equals(dir))
|
||||
.mapToDouble(RedTextPosition::getExactDir).average()
|
||||
.orElse(0);
|
||||
|
||||
if (averageRotation == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
|
||||
|
||||
for (Word word : words) {
|
||||
if (!dir.equals(word.getDir())) {
|
||||
continue;
|
||||
}
|
||||
word.transform(rotateInstance);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||
|
||||
if (observationRegistry.getCurrentObservation() != null) {
|
||||
observationRegistry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||
observationRegistry.getCurrentObservation().highCardinalityKeyValue("fileSize", String.valueOf(size));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private PDDocument openDocument(File originFile) {
|
||||
|
||||
PDDocument document = Loader.loadPDF(originFile);
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
|
||||
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||
return markedContentBboxes;
|
||||
}
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
@ -234,10 +457,10 @@ public class LayoutParsingPipeline {
|
||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
||||
if (((TextPageBlock) textBlock).getWords() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
||||
for (Word word : ((TextPageBlock) textBlock).getWords()) {
|
||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||
classificationPage.getFontCounter().add(word.getFont());
|
||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||
|
||||
@ -1,10 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
|
||||
@Configuration
|
||||
@ComponentScan
|
||||
public class LayoutParsingServiceProcessorConfiguration {
|
||||
|
||||
@Bean
|
||||
@Autowired
|
||||
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
||||
|
||||
return new PDFTronViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,32 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.core.task.TaskExecutor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -39,21 +41,38 @@ public class LayoutParsingStorageService {
|
||||
private final StorageService storageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
private final TaskExecutor taskExecutor;
|
||||
|
||||
public PDDocument getOriginFile(String storageId) throws IOException {
|
||||
|
||||
try (var originDocumentInputStream = getObject(storageId)) {
|
||||
File tempFile = createTempFile("document", ".pdf");
|
||||
try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
|
||||
IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
|
||||
originDocumentInputStream.close();
|
||||
}
|
||||
return Loader.loadPDF(tempFile);
|
||||
}
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||
public File getOriginFile(String storageId) throws IOException {
|
||||
|
||||
File tempFile = createTempFile("document", ".pdf");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
|
||||
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file")
|
||||
public Optional<File> getViewerDocFile(String storageId) throws IOException {
|
||||
|
||||
if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
|
||||
return Optional.empty();
|
||||
}
|
||||
File tempFile = createTempFile("viewerDocument", ".pdf");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
|
||||
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
|
||||
assert tempFile.delete();
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(tempFile);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ImageServiceResponse getImagesFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
|
||||
@ -64,7 +83,8 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
||||
@SneakyThrows
|
||||
public TableServiceResponse getTablesFile(String storageId) {
|
||||
|
||||
try (var tableClassificationStream = getObject(storageId)) {
|
||||
|
||||
@ -75,18 +95,44 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
@SneakyThrows
|
||||
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void storeSectionGrid(LayoutParsingRequest layoutParsingRequest, SectionGrid sectionGrid) {
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.sectionGridStorageId(), sectionGrid);
|
||||
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
documentData.getDocumentStructure());
|
||||
|
||||
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
documentData.getDocumentTextData());
|
||||
|
||||
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
documentData.getDocumentPositionData());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
documentData.getDocumentPages());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
|
||||
|
||||
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
|
||||
}
|
||||
|
||||
|
||||
@ -121,6 +167,7 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text")
|
||||
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText);
|
||||
@ -138,12 +185,24 @@ public class LayoutParsingStorageService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, ByteArrayOutputStream out) {
|
||||
|
||||
try (var in = new ByteArrayInputStream(out.toByteArray())) {
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document")
|
||||
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
|
||||
|
||||
try (var in = new FileInputStream(out)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
|
||||
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {
|
||||
|
||||
try (InputStream inputStream = new ByteArrayInputStream(markdownContent.getBytes(StandardCharsets.UTF_8))) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, inputStream);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,98 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumSegmentationService {
|
||||
|
||||
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
private final ZoneBuilderService zoneBuilderService;
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
|
||||
|
||||
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
|
||||
|
||||
List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
|
||||
directionCounts.put(TextDirection.ZERO, newZones.size());
|
||||
List<Zone> zones = new ArrayList<>(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
|
||||
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
|
||||
zones.addAll(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
|
||||
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
|
||||
zones.addAll(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
|
||||
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
|
||||
zones.addAll(newZones);
|
||||
|
||||
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
|
||||
}
|
||||
|
||||
|
||||
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
|
||||
|
||||
int total = directionCounts.values()
|
||||
.stream()
|
||||
.mapToInt(i -> i).sum();
|
||||
|
||||
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
||||
|
||||
List<Character> characters = textPositions.stream()
|
||||
.filter(t -> t.getDir() == direction)
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||
|
||||
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
public class AngleFilter {
|
||||
|
||||
protected double lowerAngle;
|
||||
protected double upperAngle;
|
||||
|
||||
|
||||
public AngleFilter(double lowerAngle, double upperAngle) {
|
||||
|
||||
this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle;
|
||||
this.upperAngle = upperAngle >= Math.PI / 2 ? upperAngle - Math.PI : upperAngle;
|
||||
}
|
||||
|
||||
|
||||
public boolean matches(Neighbor neighbor) {
|
||||
|
||||
return matches(neighbor.getAngle());
|
||||
}
|
||||
|
||||
|
||||
public boolean matches(double angle) {
|
||||
|
||||
if (lowerAngle <= upperAngle) {
|
||||
return lowerAngle <= angle && angle < upperAngle;
|
||||
} else {
|
||||
return lowerAngle <= angle || angle < upperAngle;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,279 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Data
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
public abstract class BoundingBox {
|
||||
|
||||
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
|
||||
// should be used when determining reading order or other tasks which require coordinates in a harmonized system.
|
||||
protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.
|
||||
|
||||
// PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
|
||||
// This rotates completely in 90 degree steps with page rotation.
|
||||
// Needs to be used when writing to a PDF.
|
||||
// Also, these are definitely correct and should be used whenever possible.
|
||||
protected Rectangle2D bBoxPdf;
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
|
||||
|
||||
public double getX() {
|
||||
|
||||
return bBox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double getY() {
|
||||
|
||||
return bBox.getY();
|
||||
}
|
||||
|
||||
|
||||
public double getMinX() {
|
||||
|
||||
return bBox.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public double getMinY() {
|
||||
|
||||
return bBox.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMinX() {
|
||||
|
||||
return bBoxPdf.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxX() {
|
||||
|
||||
return bBoxPdf.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMinY() {
|
||||
|
||||
return bBoxPdf.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxY() {
|
||||
|
||||
return bBoxPdf.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return bBox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return bBox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxX() {
|
||||
|
||||
return bBox.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxY() {
|
||||
|
||||
return bBox.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return (bBox.getHeight() * bBox.getWidth());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(BoundingBox contained) {
|
||||
|
||||
return contains(contained, 0);
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(BoundingBox contained, double tolerance) {
|
||||
|
||||
return getPdfMinX() <= contained.getPdfMinX() + tolerance
|
||||
&& getPdfMinY() <= contained.getPdfMinY() + tolerance
|
||||
&& getPdfMaxX() >= contained.getPdfMaxX() - tolerance
|
||||
&& getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(BoundingBox other) {
|
||||
|
||||
return this.intersectsX(other) && this.intersectsY(other);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.intersectsX(other, xThreshold) && this.intersectsY(other, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
|
||||
}
|
||||
|
||||
|
||||
/** Intersection test in PDF space: overlap required on both axes. */
public boolean intersectsPdf(BoundingBox other) {
    return this.intersectsXPdf(other) && this.intersectsYPdf(other);
}
|
||||
|
||||
|
||||
/** PDF-space intersection test with per-axis slack; note parameter order is (y, x). */
public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) {
    return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold);
}
|
||||
|
||||
|
||||
/** Y-overlap test in PDF space (touching edges count as intersecting). */
public boolean intersectsYPdf(BoundingBox other) {
    return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
}
|
||||
|
||||
|
||||
/** Y-overlap test in display space (touching edges count as intersecting). */
public boolean intersectsY(BoundingBox other) {
    return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
}
|
||||
|
||||
|
||||
/** Y-overlap test in display space, widened by {@code threshold} on both sides. */
public boolean intersectsY(BoundingBox other, float threshold) {
    return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY();
}
|
||||
|
||||
|
||||
/** Y-overlap test in PDF space, widened by {@code threshold} on both sides. */
public boolean intersectsYPdf(BoundingBox other, float threshold) {
    return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
}
|
||||
|
||||
|
||||
/** X-overlap test in PDF space (touching edges count as intersecting). */
public boolean intersectsXPdf(BoundingBox other) {
    return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
}
|
||||
|
||||
|
||||
/** X-overlap test in display space (touching edges count as intersecting). */
public boolean intersectsX(BoundingBox other) {
    // NOTE(review): mixes getX() and getMinX() accessors — presumably equivalent
    // for this class (other call sites use getX()); confirm against BoundingBox's fields.
    return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
}
|
||||
|
||||
|
||||
public boolean intersectsXPdf(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Replaces both bounding boxes of this object with the union of the
 * corresponding boxes of {@code components} (display-space and PDF-space
 * unions are computed independently).
 */
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
    this.bBox = components.stream()
            .map(BoundingBox::getBBox)
            .collect(RectangleTransformations.collectBBox());
    this.bBoxPdf = components.stream()
            .map(BoundingBox::getBBoxPdf)
            .collect(RectangleTransformations.collectBBox());
}
|
||||
|
||||
|
||||
/**
 * Length of the overlap of the two boxes' Y-ranges in PDF space;
 * 0 when they do not overlap vertically.
 */
public double verticalOverlap(BoundingBox other) {
    return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
}
|
||||
|
||||
|
||||
/**
 * Reading-order-ish comparator: boxes that overlap vertically by more than
 * VERTICAL_COMPARISON_THRESHOLD of their average height are treated as being
 * on the same "line" and ordered left-to-right by PDF min-X; otherwise they
 * are ordered by PDF max-Y. As the name warns, this is NOT a total order
 * consistent with transitivity in all cases — use only where that is acceptable.
 */
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
    if (o1.equals(o2)) {
        return 0;
    }
    if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
        return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
    } else {
        return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
    }
};
|
||||
|
||||
|
||||
public double horizontalDistance(BoundingBox other) {
|
||||
|
||||
double rect1Right = getMaxX();
|
||||
double rect1Left = getMinX();
|
||||
double rect2Right = other.getMaxX();
|
||||
double rect2Left = other.getMinX();
|
||||
|
||||
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(BoundingBox other) {
|
||||
|
||||
double rect1Top = getMaxY();
|
||||
double rect1Bottom = getMinY();
|
||||
double rect2Top = other.getMaxY();
|
||||
double rect2Bottom = other.getMinY();
|
||||
|
||||
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** True when the boxes overlap vertically and {@code other} ends at or before this box starts on X. */
public boolean rightOf(BoundingBox other) {
    return this.intersectsY(other) && other.getMaxX() <= this.getMinX();
}
|
||||
|
||||
|
||||
/** True when the boxes overlap vertically and {@code other} starts at or after this box ends on X. */
public boolean leftOf(BoundingBox other) {
    return this.intersectsY(other) && other.getMinX() >= this.getMaxX();
}
|
||||
|
||||
|
||||
/**
 * True when the boxes overlap horizontally and {@code other} starts at or after
 * this box ends on Y. "Above" here is in display-space Y direction (Y grows
 * downwards in display space — confirm against bBox's origin convention).
 */
public boolean isAbove(BoundingBox other) {
    return this.intersectsX(other) && other.getMinY() >= this.getMaxY();
}
|
||||
|
||||
|
||||
/** Mirror of {@link #isAbove}: horizontal overlap and this box starts at or after {@code other} ends on Y. */
public boolean isBelow(BoundingBox other) {
    return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,86 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Setter;
|
||||
|
||||
/**
 * A single glyph used as a node in the docstrum nearest-neighbour analysis.
 * (x, y) is the centre of the glyph's direction-adjusted bounding box.
 * Equality is based on the centre coordinates only.
 */
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Character {

    @EqualsAndHashCode.Include
    private final double x; // centre X of the direction-adjusted bbox
    @EqualsAndHashCode.Include
    private final double y; // centre Y of the direction-adjusted bbox
    private final RedTextPosition textPosition;

    // Nearest neighbours; populated externally after construction.
    @Setter
    private List<Neighbor> neighbors = new ArrayList<>();


    public Character(RedTextPosition chunk) {

        this.x = chunk.getBBoxDirAdj().getCenterX();
        this.y = chunk.getBBoxDirAdj().getCenterY();
        this.textPosition = chunk;
    }


    /** @return the direction-adjusted height of the underlying text position. */
    public double getHeight() {

        return textPosition.getHeightDirAdj();
    }


    /** Euclidean distance between the two glyph centres. */
    public double distance(Character character) {

        double dx = getX() - character.getX();
        double dy = getY() - character.getY();
        return Math.sqrt(dx * dx + dy * dy);
    }


    /** Absolute X distance between the two glyph centres. */
    public double horizontalDistance(Character character) {

        return Math.abs(getX() - character.getX());
    }


    /** Absolute Y distance between the two glyph centres. */
    public double verticalDistance(Character character) {

        return Math.abs(getY() - character.getY());
    }


    /**
     * Signed X-extent overlap of the two glyph boxes after rotating by 0 radians:
     * positive when the projected extents overlap, negative gap size otherwise.
     * NOTE(review): sin(-0)/cos(-0) makes the rotation a no-op (s = -0.0, c = 1.0);
     * presumably a placeholder for a page/text rotation angle — confirm.
     */
    public double overlappingDistance(Character other) {

        double[] xs = new double[4];
        double s = Math.sin(-0);
        double c = Math.cos(-0);
        xs[0] = c * x - s * y;
        xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDirAdj());
        xs[2] = c * other.x - s * other.y;
        xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDirAdj());
        // Overlap must be decided before sorting destroys the pairing.
        boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
        Arrays.sort(xs);
        // After sorting, xs[1]..xs[2] is the middle interval: overlap length or gap size.
        return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
    }


    /**
     * Angle of the segment between the two centres, normalised so the
     * left-most glyph is always the origin (keeps angles in a half-plane).
     */
    public double angle(Character character) {

        if (getX() > character.getX()) {
            return FastAtan2.fastAtan2(getY() - character.getY(), getX() - character.getX());
        } else {
            return FastAtan2.fastAtan2(character.getY() - getY(), character.getX() - getX());
        }
    }

}
|
||||
@ -0,0 +1,324 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/*
|
||||
WIP, mostly working, needs to be tested a bit more
|
||||
*/
|
||||
public class ColumnDetector {
|
||||
|
||||
public static final double MAX_VALUE_THRESHOLD = 0.5;
|
||||
final static int bins_num = 512;
|
||||
final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there
|
||||
final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those.
|
||||
public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
|
||||
public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
|
||||
public static final double NEAR_GLOBAL_THRESHOLD = 0.5;
|
||||
double minY;
|
||||
double maxY;
|
||||
double midY;
|
||||
double[] histogram;
|
||||
double min;
|
||||
double max;
|
||||
double resolution;
|
||||
double sum;
|
||||
int N;
|
||||
|
||||
|
||||
public ColumnDetector(double min, double max, double minY, double maxY) {
|
||||
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
this.minY = minY;
|
||||
this.maxY = maxY;
|
||||
this.midY = maxY - minY;
|
||||
this.resolution = (max - min) / bins_num;
|
||||
this.histogram = new double[bins_num];
|
||||
}
|
||||
|
||||
|
||||
public void add(BoundingBox zone) {
|
||||
|
||||
N++;
|
||||
double weight = computeWeight(zone);
|
||||
int start = (int) ((zone.getMinX() - min) / resolution);
|
||||
int end = (int) ((zone.getMaxX() - min) / resolution);
|
||||
for (int i = start; i < end; i++) {
|
||||
histogram[i] += weight;
|
||||
sum += histogram[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private double computeWeight(BoundingBox zone) {
|
||||
|
||||
double areaWeight = zone.getBBox().getHeight();
|
||||
|
||||
double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());
|
||||
|
||||
double distanceWeight;
|
||||
if (relativeDistance < 0.6) {
|
||||
distanceWeight = 1;
|
||||
} else if (relativeDistance < 0.8) {
|
||||
distanceWeight = 0.8;
|
||||
} else {
|
||||
distanceWeight = 0.1;
|
||||
}
|
||||
|
||||
return areaWeight * distanceWeight;
|
||||
}
|
||||
|
||||
|
||||
private double relativeDistanceToMiddle(double y) {
|
||||
|
||||
double range = (maxY - minY) / 2;
|
||||
double mid = minY + range;
|
||||
|
||||
return Math.abs(y - mid) / range;
|
||||
}
|
||||
|
||||
|
||||
public double[] computeDerivative() {
|
||||
|
||||
int length = histogram.length;
|
||||
double[] derivative = new double[length];
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (i == 0) {
|
||||
derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
|
||||
} else if (i == length - 1) {
|
||||
derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
|
||||
} else {
|
||||
derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
|
||||
}
|
||||
}
|
||||
|
||||
return derivative;
|
||||
}
|
||||
|
||||
|
||||
public double calcMean(double[] arr, int start, int end) {
|
||||
|
||||
if (start == end) {
|
||||
return 0;
|
||||
}
|
||||
double sum = 0;
|
||||
for (int i = start; i < end; i++) {
|
||||
sum += arr[i];
|
||||
}
|
||||
return sum / (end - start);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values.
|
||||
For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider.
|
||||
Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative.
|
||||
*/
|
||||
public List<Double> determineColumnsWithDerivative(double[] derivative) {
|
||||
|
||||
assert derivative.length == histogram.length;
|
||||
|
||||
Set<Integer> columnIndices = new HashSet<>();
|
||||
double mean = calcMean(histogram, 0, histogram.length);
|
||||
double maxDvValue = calcMax(derivative);
|
||||
double minDvValue = calcMin(derivative);
|
||||
|
||||
if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
|
||||
Collections.emptyList();
|
||||
}
|
||||
|
||||
Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);
|
||||
|
||||
List<Integer> columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean);
|
||||
columnIndices.addAll(columnsRightOfMinima);
|
||||
|
||||
List<Integer> columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean);
|
||||
columnIndices.addAll(columnsLeftOfMaxima);
|
||||
|
||||
return columnIndices.stream()
|
||||
.sorted(Comparator.naturalOrder())
|
||||
.map(this::calculateXCoordinateFromIdx)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {
|
||||
|
||||
List<Integer> columnsLeftOfMaxima = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < derivativeMaxima.size(); i++) {
|
||||
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||
boolean maximumFound = false;
|
||||
int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
|
||||
int endIdx = (int) Math.max(globalStartIdx,
|
||||
Math.min(maximaIdx - 1,
|
||||
maximaIdx - 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the left edge;
|
||||
|
||||
for (int j = maximaIdx; j >= endIdx; j--) {
|
||||
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||
maximumFound = true;
|
||||
consecutiveZeroes.add(j);
|
||||
} else if (maximumFound) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (maximumFound) {
|
||||
int midIdx = consecutiveZeroes.size() / 2;
|
||||
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||
columnsLeftOfMaxima.add(middleMinimumIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
return columnsLeftOfMaxima;
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {
|
||||
|
||||
List<Integer> columnIndixes = new LinkedList<>();
|
||||
for (int i = 0; i < derivativeMinima.size(); i++) {
|
||||
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||
boolean minimumFound = false;
|
||||
int minimaIdx = derivativeMinima.get(i) + 1; // the highest derivative will always be at least one step earlier than the lowest value.
|
||||
int endIdx = (int) Math.min(globalEndIdx,
|
||||
Math.max(minimaIdx + 1,
|
||||
minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge;
|
||||
|
||||
for (int j = minimaIdx; j < endIdx; j++) {
|
||||
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||
minimumFound = true;
|
||||
consecutiveZeroes.add(j);
|
||||
} else if (minimumFound) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (minimumFound) {
|
||||
int midIdx = consecutiveZeroes.size() / 2;
|
||||
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||
columnIndixes.add(middleMinimumIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
return columnIndixes;
|
||||
}
|
||||
|
||||
|
||||
private double calcMax(double[] array) {
|
||||
|
||||
double max = Double.NEGATIVE_INFINITY;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
if (array[i] > max) {
|
||||
max = array[i];
|
||||
}
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
|
||||
private double calcMin(double[] array) {
|
||||
|
||||
double min = Double.POSITIVE_INFINITY;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
if (array[i] < min) {
|
||||
min = array[i];
|
||||
}
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
|
||||
private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {
|
||||
|
||||
List<Integer> nearGlobalDvMaximaIdx = new LinkedList<>();
|
||||
List<Integer> nearGlobalDvMinimaIdx = new LinkedList<>();
|
||||
for (int i = globalStartIdx; i < globalEndIdx; i++) {
|
||||
if (derivative[i] <= minDvValue * NEAR_GLOBAL_THRESHOLD) {
|
||||
nearGlobalDvMinimaIdx.add(i);
|
||||
}
|
||||
if (derivative[i] >= maxDvValue * NEAR_GLOBAL_THRESHOLD) {
|
||||
nearGlobalDvMaximaIdx.add(i);
|
||||
}
|
||||
}
|
||||
|
||||
nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx);
|
||||
nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx);
|
||||
|
||||
return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx);
|
||||
}
|
||||
|
||||
|
||||
private record Extrema(List<Integer> maxima, List<Integer> minima) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Double calculateXCoordinateFromIdx(int globalMinIdx) {
|
||||
|
||||
return min + ((globalMinIdx + 1) * resolution);
|
||||
}
|
||||
|
||||
|
||||
public static List<Integer> removeConsecutive(List<Integer> numbers) {
|
||||
|
||||
List<Integer> result = new ArrayList<>();
|
||||
if (numbers == null || numbers.isEmpty()) {
|
||||
return result;
|
||||
}
|
||||
|
||||
result.add(numbers.get(0)); // Add the first number
|
||||
|
||||
for (int i = 1; i < numbers.size(); i++) {
|
||||
if (numbers.get(i) != numbers.get(i - 1) + 1) {
|
||||
result.add(numbers.get(i)); // Add non-consecutive numbers
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public void kernelSmooth(double[] kernel) {
|
||||
|
||||
double[] newFrequencies = new double[histogram.length];
|
||||
int shift = (kernel.length - 1) / 2;
|
||||
for (int i = 0; i < kernel.length; i++) {
|
||||
int jStart = Math.max(0, i - shift);
|
||||
int jEnd = Math.min(histogram.length, histogram.length + i - shift);
|
||||
for (int j = jStart; j < jEnd; j++) {
|
||||
newFrequencies[j - i + shift] += kernel[i] * histogram[j];
|
||||
}
|
||||
}
|
||||
histogram = newFrequencies;
|
||||
}
|
||||
|
||||
|
||||
public double[] createGaussianKernel(int length, double stdDeviation) {
|
||||
|
||||
int r = length / 2;
|
||||
|
||||
int size = 2 * r + 1;
|
||||
double[] kernel = new double[size];
|
||||
double sum = 0;
|
||||
double b = 2 * (stdDeviation) * (stdDeviation);
|
||||
double a = 1 / Math.sqrt(Math.PI * b);
|
||||
for (int i = 0; i < size; i++) {
|
||||
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
|
||||
sum += kernel[i];
|
||||
}
|
||||
for (int i = 0; i < size; i++) {
|
||||
kernel[i] /= sum;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,90 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
/**
 * Fixed-resolution 1-D histogram with Gaussian smoothing, used by the
 * docstrum analysis to find dominant spacings/angles. Bin edges are padded
 * by EPSILON so minValue and maxValue both fall strictly inside the range.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;
    private final double min; // padded lower edge of bin 0
    private final double resolution; // effective width of one bin after padding
    private double[] frequencies;


    public Histogram(double minValue, double maxValue, double resolution) {

        this.min = minValue - EPSILON;
        double delta = maxValue - minValue + 2 * EPSILON;
        // At least one bin; the requested resolution is only approximate because
        // the padded range is redistributed evenly over the computed bin count.
        int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        this.resolution = delta / size;
        this.frequencies = new double[size];
    }


    /** Convolves the frequencies with {@code kernel} in place (edges truncated). */
    public void kernelSmooth(double[] kernel) {

        double[] newFrequencies = new double[frequencies.length];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < kernel.length; i++) {
            int jStart = Math.max(0, i - shift);
            int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
            for (int j = jStart; j < jEnd; j++) {
                newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
            }
        }
        frequencies = newFrequencies;
    }


    /**
     * Builds a normalised Gaussian kernel; {@code length} and {@code stdDeviation}
     * are given in value units and converted to bin units via the resolution.
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int r = (int) Math.round(length / resolution) / 2;

        int size = 2 * r + 1;
        double[] kernel = new double[size];
        double sum = 0;
        double b = 2 * (stdDeviation / resolution) * (stdDeviation / resolution);
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
            sum += kernel[i];
        }
        // Normalise so the kernel sums to exactly 1 (preserves total frequency mass).
        for (int i = 0; i < size; i++) {
            kernel[i] /= sum;
        }
        return kernel;
    }


    /** Convenience: smooth with a Gaussian kernel of the given window and sigma. */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /** Adds one observation; {@code value} must lie within [minValue, maxValue]. */
    public void add(double value) {

        frequencies[(int) ((value - min) / resolution)] += 1.0;
    }


    /** @return the number of bins. */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Value (in original units) at the centre of the tallest peak. Plateaus of
     * near-equal bins immediately after the peak are averaged into the centre.
     */
    public double getPeakValue() {

        int peakIndex = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[peakIndex]) {
                peakIndex = i;
            }
        }
        int peakEndIndex = peakIndex + 1;
        final double EPS = 0.0001;
        while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < EPS) {
            peakEndIndex++;
        }
        return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
    }

}
|
||||
@ -0,0 +1,194 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD_ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.ITALIC;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.STANDARD;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
/**
 * A text line detected by the docstrum algorithm: a baseline segment
 * (x0,y0)-(x1,y1) fitted through its characters, split into {@link Word}s.
 * Equality is based on the segment endpoints only.
 */
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox {

    // Fraction of the page-level word spacing used as the split threshold in computeWords.
    private static final double WORD_DISTANCE_MULTIPLIER = 0.17;

    @EqualsAndHashCode.Include
    private final double x0;
    @EqualsAndHashCode.Include
    private final double y0;

    @EqualsAndHashCode.Include
    private final double x1;
    @EqualsAndHashCode.Include
    private final double y1;

    private FontStyle fontStyle;

    private final List<Word> words;


    /**
     * Builds a line from ordered characters: fits a least-squares baseline
     * (or a short synthetic segment for a single character), splits into words,
     * and derives the bbox and dominant font style.
     */
    public Line(List<Character> characters, double wordSpacing) {

        if (characters.size() >= 2) {
            // linear regression
            double sx = 0.0;
            double sxx = 0.0;
            double sxy = 0.0;
            double sy = 0.0;
            for (Character character : characters) {
                sx += character.getX();
                sxx += character.getX() * character.getX();
                sxy += character.getX() * character.getY();
                sy += character.getY();
            }
            // y = a + b*x, evaluated at the first and last character's X.
            double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
            double a = (sy - b * sx) / characters.size();

            this.x0 = characters.get(0).getX();
            this.y0 = a + b * this.x0;
            this.x1 = characters.get(characters.size() - 1).getX();
            this.y1 = a + b * this.x1;
        } else {
            Character character = characters.get(0);
            double dx = character.getTextPosition().getWidthDirAdj() / 3;
            // NOTE(review): tan(0) is always 0, so dy is 0 and the synthetic segment
            // is horizontal — presumably a placeholder for a text angle; confirm.
            double dy = dx * Math.tan(0);
            this.x0 = character.getX() - dx;
            this.x1 = character.getX() + dx;
            this.y0 = character.getY() - dy;
            this.y1 = character.getY() + dy;
        }
        this.words = new ArrayList<>();
        computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER);
        buildBBox();
        computeFontStyle();
    }


    /** Builds a line directly from pre-computed words; endpoints come from the union bbox. */
    public Line(List<Word> words) {

        this.words = words;
        buildBBox();
        x0 = getMinX();
        y0 = getMinY();
        x1 = getMaxX();
        y1 = getMaxY();
        computeFontStyle();
    }


    /** Sets fontStyle to the style carried by the most words (REGULAR on a tie/empty line). */
    private void computeFontStyle() {

        EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
        for (FontStyle fontStyle : FontStyle.values()) {
            fontStyleCounter.put(fontStyle, new AtomicInteger(0));
        }
        for (Word word : words) {
            // Word reports style as the String constants statically imported above.
            switch (word.getFontStyle()) {
                case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
                case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
                case ITALIC -> fontStyleCounter.get(FontStyle.ITALIC).getAndIncrement();
                case BOLD_ITALIC -> fontStyleCounter.get(FontStyle.BOLD_ITALIC).getAndIncrement();
            }
        }
        fontStyle = fontStyleCounter.entrySet()
                .stream()
                .max(Comparator.comparing(entry -> entry.getValue().get()))
                .map(Map.Entry::getKey).orElse(FontStyle.REGULAR);
    }


    /** Baseline angle in radians. */
    public double getAngle() {

        return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
    }


    /** Euclidean length of the baseline segment. */
    public double getLength() {

        return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
    }


    /** Smallest angle between the two baselines, folded into [0, PI/2]. */
    public double angularDifference(Line j) {

        double diff = Math.abs(getAngle() - j.getAngle());
        if (diff <= Math.PI / 2) {
            return diff;
        } else {
            return Math.PI - diff;
        }
    }


    /**
     * Signed X-extent distance between the two baselines: positive overlap
     * length when the X-ranges overlap, negative gap size otherwise.
     */
    public double horizontalDistance(Line other) {

        double[] xs = new double[4];
        xs[0] = x0;
        xs[1] = x1;
        xs[2] = other.x0;
        xs[3] = other.x1;
        // Overlap must be decided before sorting destroys the pairing.
        boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
        Arrays.sort(xs);
        return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
    }


    /** Absolute distance between the vertical midpoints of the two baselines. */
    public double verticalDistance(Line other) {

        double ym = (y0 + y1) / 2;
        double yn = (other.y0 + other.y1) / 2;
        return Math.abs(ym - yn);
    }


    /** Splits the ordered characters into words wherever the inter-glyph gap exceeds wordSpacing. */
    private void computeWords(List<Character> characters, double wordSpacing) {

        // Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
        // If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
        // I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
        Word word = new Word();
        Character previous = null;
        for (Character current : characters) {
            if (previous != null) {
                double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
                if (dist > wordSpacing) {
                    words.add(word);
                    word = new Word();
                }
            }
            word.add(current);
            previous = current;
        }
        words.add(word);
    }


    /** Recomputes this line's bounding boxes as the union of its words'. */
    private void buildBBox() {

        this.setToBBoxOfComponents(words);
    }


    /** @return the line's text: its words joined by single spaces. */
    public String toString() {

        StringBuilder sb = new StringBuilder();
        words.forEach(word -> sb.append(word.toString()).append(" "));
        return sb.toString().trim();
    }

}
|
||||
|
||||
@ -0,0 +1,43 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class Neighbor {
|
||||
|
||||
@Getter
|
||||
private final double distance;
|
||||
private Double angle;
|
||||
private final Character originCharacter;
|
||||
@Getter
|
||||
private final Character character;
|
||||
|
||||
|
||||
public Neighbor(Character neighbor, Character origin) {
|
||||
|
||||
this.distance = neighbor.distance(origin);
|
||||
this.character = neighbor;
|
||||
this.originCharacter = origin;
|
||||
}
|
||||
|
||||
|
||||
public double getHorizontalDistance() {
|
||||
|
||||
return character.horizontalDistance(originCharacter);
|
||||
}
|
||||
|
||||
|
||||
public double getVerticalDistance() {
|
||||
|
||||
return character.verticalDistance(originCharacter);
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
if (angle != null) {
|
||||
return angle;
|
||||
}
|
||||
return this.character.angle(this.originCharacter);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,180 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = false)
|
||||
public abstract class TextBoundingBox extends BoundingBox {
|
||||
|
||||
protected Rectangle2D bBoxDirAdj;
|
||||
|
||||
protected TextDirection dir;
|
||||
|
||||
|
||||
@Override
|
||||
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
|
||||
|
||||
super.setToBBoxOfComponents(components);
|
||||
this.bBoxDirAdj = components.stream()
|
||||
.filter(c -> c instanceof TextBoundingBox)
|
||||
.map(c -> (TextBoundingBox) c)
|
||||
.map(TextBoundingBox::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
Set<TextDirection> textDirections = components.stream()
|
||||
.filter(c -> c instanceof TextBoundingBox)
|
||||
.map(c -> (TextBoundingBox) c)
|
||||
.map(TextBoundingBox::getDir)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
if (textDirections.isEmpty()) {
|
||||
dir = TextDirection.ZERO;
|
||||
} else if (textDirections.size() > 1) {
|
||||
throw new IllegalArgumentException("More than one text direction found");
|
||||
} else {
|
||||
dir = textDirections.iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double getXDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getX();
|
||||
}
|
||||
|
||||
|
||||
public double getYDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidthDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double getHeightDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxXDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxYDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getCenterYDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getCenterY();
|
||||
}
|
||||
|
||||
|
||||
public double getCenterXDirAdj() {
|
||||
|
||||
return this.bBoxDirAdj.getCenterX();
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistanceDirAdj(TextBoundingBox other) {
|
||||
|
||||
double rect1Right = getMaxXDirAdj();
|
||||
double rect1Left = getXDirAdj();
|
||||
double rect2Right = other.getMaxXDirAdj();
|
||||
double rect2Left = other.getXDirAdj();
|
||||
|
||||
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistanceDirAdj(TextBoundingBox other) {
|
||||
|
||||
double rect1Top = getMaxYDirAdj();
|
||||
double rect1Bottom = getYDirAdj();
|
||||
double rect2Top = other.getMaxYDirAdj();
|
||||
double rect2Bottom = other.getYDirAdj();
|
||||
|
||||
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** True when the direction-adjusted boxes overlap on both axes. */
public boolean intersectsDirAdj(TextBoundingBox other) {
    return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other);
}
|
||||
|
||||
|
||||
/**
 * True when the direction-adjusted boxes overlap on both axes after expanding
 * this box by {@code xThreshold} / {@code yThreshold} on each side.
 */
public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {
    return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold);
}
|
||||
|
||||
|
||||
/** X-axis overlap test (direction-adjusted), with this box widened by {@code threshold} on both sides. */
public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {
    return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
}
|
||||
|
||||
|
||||
/** X-axis overlap test in direction-adjusted coordinates (touching counts as overlap). */
public boolean intersectsXDirAdj(TextBoundingBox other) {
    return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj();
}
|
||||
|
||||
|
||||
/** Y-axis overlap test in direction-adjusted coordinates (touching counts as overlap). */
public boolean intersectsYDirAdj(TextBoundingBox other) {
    return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj();
}
|
||||
|
||||
|
||||
/** Y-axis overlap test (direction-adjusted), with this box widened by {@code threshold} on both sides. */
public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {
    return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
}
|
||||
|
||||
|
||||
public boolean isAboveDirAdj(TextBoundingBox other) {
|
||||
|
||||
return other.isBelow(this);
|
||||
}
|
||||
|
||||
|
||||
/**
 * True when the boxes overlap horizontally and this box's min Y is at or
 * beyond {@code other}'s max Y, both in direction-adjusted coordinates.
 * NOTE(review): "below" here assumes the dir-adjusted y axis grows downward
 * (PDFBox convention) — confirm against TextPosition usage.
 */
public boolean isBelowDirAdj(TextBoundingBox other) {
    return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj();
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,37 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
|
||||
|
||||
public UnionFind(Set<T> elements) {
|
||||
|
||||
super(elements);
|
||||
}
|
||||
|
||||
|
||||
public Collection<Set<T>> getGroups() {
|
||||
|
||||
Map<T, Set<T>> setRep = new LinkedHashMap<>();
|
||||
for (T t : getParentMap().keySet()) {
|
||||
T representative = find(t);
|
||||
if (!setRep.containsKey(representative)) {
|
||||
setRep.put(representative, new LinkedHashSet<>());
|
||||
}
|
||||
setRep.get(representative).add(t);
|
||||
}
|
||||
|
||||
return setRep.values();
|
||||
}
|
||||
|
||||
|
||||
public Collection<T> getElements() {
|
||||
|
||||
return getParentMap().keySet();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,33 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = false)
|
||||
public class Zone extends TextBoundingBox {
|
||||
|
||||
private List<Line> lines;
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||
public Zone(List<Line> lines) {
|
||||
|
||||
this.lines = lines;
|
||||
setToBBoxOfComponents(lines);
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
lines.forEach(line -> sb.append(line.toString()).append("\n"));
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,58 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
|
||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
|
||||
private static final double ANGLE_TOLERANCE = Math.toRadians(5);
|
||||
|
||||
|
||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;
|
||||
|
||||
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
|
||||
|
||||
AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors()
|
||||
.forEach(neighbor -> {
|
||||
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
|
||||
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|
||||
|| !angleFilter.matches(neighbor) //
|
||||
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|
||||
|| rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
|
||||
return;
|
||||
}
|
||||
|
||||
unionFind.union(character, neighbor.getCharacter());
|
||||
});
|
||||
});
|
||||
|
||||
return unionFind.getGroups()
|
||||
.stream()
|
||||
.map(lineCharacters -> lineCharacters.stream()
|
||||
.sorted(Comparator.comparingDouble(Character::getX))
|
||||
.toList())
|
||||
.map(lineCharacters -> new Line(lineCharacters, characterSpacing))
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,36 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.tinspin.index.Index;
|
||||
import org.tinspin.index.kdtree.KDTree;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class NearestNeighbourService {
|
||||
|
||||
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||
|
||||
|
||||
public void findNearestNeighbors(List<Character> characters) {
|
||||
KDTree<Character> kdTree = KDTree.create(2);
|
||||
characters.forEach(c -> kdTree.insert(new double[]{c.getX(), c.getY()}, c));
|
||||
|
||||
for(Character c : characters) {
|
||||
Index.PointIteratorKnn<Character> iterator = kdTree.queryKnn(new double[]{c.getX(), c.getY()}, NUMBER_OF_NEIGHBOURS + 1);
|
||||
// skip the first as this is identity
|
||||
if(iterator.hasNext()) {
|
||||
iterator.next();
|
||||
}
|
||||
while(iterator.hasNext()) {
|
||||
c.getNeighbors().add(new Neighbor(iterator.next().value(), c));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,192 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
|
||||
@Service
|
||||
public class ReadingOrderService {
|
||||
|
||||
private static final double THRESHOLD = 5;
|
||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||
|
||||
private static final Comparator<TextBoundingBox> COMPARATOR = //
|
||||
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
|
||||
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
|
||||
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
|
||||
|
||||
if (zones.isEmpty() || zones.size() == 1) {
|
||||
return zones;
|
||||
}
|
||||
|
||||
if (xyReadingOrder) {
|
||||
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
|
||||
}
|
||||
|
||||
Map<Long, Integer> histogram = new HashMap<>();
|
||||
for (Zone zone : zones) {
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
long minY = Math.round(bbox.getMinY());
|
||||
long maxY = Math.round(bbox.getMaxY());
|
||||
for (long i = minY; i <= maxY; i++) {
|
||||
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (histogram.values()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue).average()
|
||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
|
||||
} else {
|
||||
|
||||
return resolveMultiColumnReadingOder(zones, useDirAdjCoords);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||
|
||||
if (useDirAdjCoords) {
|
||||
return zones.stream()
|
||||
.collect(Collectors.groupingBy(TextBoundingBox::getDir)).values()
|
||||
.stream()
|
||||
.flatMap(words -> words.stream()
|
||||
.sorted(COMPARATOR_DIR_ADJ))
|
||||
.toList();
|
||||
}
|
||||
|
||||
zones.sort(COMPARATOR);
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||
|
||||
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
||||
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
||||
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Zone zone : zones) {
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
if (bbox.getX() < minX) {
|
||||
minX = zone.getXDirAdj();
|
||||
}
|
||||
if (bbox.getMaxX() > maxX) {
|
||||
maxX = zone.getMaxXDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
double midLineXCoordinate = (minX + maxX) / 2;
|
||||
|
||||
List<Zone> leftOf = new ArrayList<>();
|
||||
List<Zone> rightOf = new ArrayList<>();
|
||||
List<Zone> middle = new ArrayList<>();
|
||||
|
||||
for (Zone zone : zones) {
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
|
||||
leftOf.add(zone);
|
||||
} else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) {
|
||||
rightOf.add(zone);
|
||||
} else {
|
||||
middle.add(zone);
|
||||
}
|
||||
}
|
||||
|
||||
if (useDirAdjCoords) {
|
||||
leftOf.sort(COMPARATOR_DIR_ADJ);
|
||||
rightOf.sort(COMPARATOR_DIR_ADJ);
|
||||
middle.sort(COMPARATOR_DIR_ADJ);
|
||||
} else {
|
||||
leftOf.sort(COMPARATOR);
|
||||
rightOf.sort(COMPARATOR);
|
||||
middle.sort(COMPARATOR);
|
||||
}
|
||||
/*
|
||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||
for (Zone leftZone : leftOf) {
|
||||
boolean intersects = false;
|
||||
for (Zone rightZone : rightOf) {
|
||||
if (leftZone.intersectsY(rightZone)) {
|
||||
intersects = true;
|
||||
break;
|
||||
}
|
||||
// early stopping
|
||||
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!intersects) {
|
||||
leftNotIntersecting.add(leftZone);
|
||||
}
|
||||
}
|
||||
|
||||
List<Zone> rightNotIntersecting = new ArrayList<>();
|
||||
for (Zone rightZone : rightOf) {
|
||||
boolean intersects = false;
|
||||
for (Zone leftZone : leftOf) {
|
||||
if (rightZone.intersectsY(leftZone)) {
|
||||
intersects = true;
|
||||
break;
|
||||
}
|
||||
// early stopping
|
||||
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!intersects) {
|
||||
rightNotIntersecting.add(rightZone);
|
||||
}
|
||||
}
|
||||
|
||||
leftOf.removeAll(leftNotIntersecting);
|
||||
rightOf.removeAll(rightNotIntersecting);
|
||||
|
||||
middle.addAll(leftNotIntersecting);
|
||||
middle.addAll(rightNotIntersecting);
|
||||
*/
|
||||
List<Zone> sortedZones = new ArrayList<>();
|
||||
sortedZones.addAll(leftOf);
|
||||
sortedZones.addAll(rightOf);
|
||||
|
||||
ListIterator<Zone> itty = middle.listIterator();
|
||||
|
||||
while (itty.hasNext()) {
|
||||
Zone current = itty.next();
|
||||
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
|
||||
for (int i = 0; i < sortedZones.size(); i++) {
|
||||
if (bbox.getY() < sortedZones.get(i).getY()) {
|
||||
sortedZones.add(i, current);
|
||||
itty.remove();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sortedZones.addAll(middle);
|
||||
|
||||
return sortedZones;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,56 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Histogram;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
|
||||
|
||||
@Service
|
||||
public class SpacingService {
|
||||
|
||||
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
|
||||
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
|
||||
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public double computeCharacterSpacing(List<Character> characters) {
|
||||
|
||||
return computeSpacing(characters, 0);
|
||||
}
|
||||
|
||||
|
||||
public double computeLineSpacing(List<Character> characters) {
|
||||
|
||||
return computeSpacing(characters, Math.PI / 2);
|
||||
}
|
||||
|
||||
|
||||
private double computeSpacing(List<Character> characters, double angle) {
|
||||
|
||||
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Character character : characters) {
|
||||
for (Neighbor neighbor : character.getNeighbors()) {
|
||||
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
|
||||
AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||
for (Character character : characters) {
|
||||
for (Neighbor neighbor : character.getNeighbors()) {
|
||||
if (angleFilter.matches(neighbor)) {
|
||||
histogram.add(neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
|
||||
return histogram.getPeakValue();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,126 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
@Service
|
||||
public class ZoneBuilderService {
|
||||
|
||||
private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
|
||||
private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;
|
||||
|
||||
private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7;
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;
|
||||
|
||||
private static final double MIN_LINE_SIZE_SCALE = 0.9;
|
||||
|
||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||
|
||||
private static final double ANGLE_TOLERANCE = Math.toRadians(5);
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||
|
||||
|
||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||
|
||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||
double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;
|
||||
|
||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
||||
|
||||
double meanHeight = calculateMeanHeight(lines);
|
||||
|
||||
lines.forEach(outerLine -> {
|
||||
lines.forEach(innerLine -> {
|
||||
|
||||
if (innerLine == outerLine //
|
||||
|| unionFind.inSameSet(outerLine, innerLine)//
|
||||
|| outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// if (!innerLine.getFontStyle().equals(outerLine.getFontStyle()) //
|
||||
// && !outerLine.intersectsY(innerLine, -2f)) {
|
||||
// return;
|
||||
// }
|
||||
|
||||
double horizontalScale = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight;
|
||||
horizontalScale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(horizontalScale, MAX_LINE_SIZE_SCALE));
|
||||
double verticalScale = horizontalScale;
|
||||
|
||||
// if (innerLine.toString().endsWith(":")
|
||||
// || outerLine.toString().endsWith(":")
|
||||
// || numericalIdentifierPattern.matcher(innerLine.toString()).matches()
|
||||
// || numericalIdentifierPattern.matcher(outerLine.toString()).matches()) {
|
||||
//
|
||||
// horizontalScale *= 5;
|
||||
// verticalScale /= 10;
|
||||
// }
|
||||
|
||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / horizontalScale;
|
||||
double verticalDistance = outerLine.verticalDistance(innerLine) / verticalScale;
|
||||
|
||||
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
|
||||
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (rulings.lineBetween(outerLine, innerLine)) {
|
||||
return;
|
||||
}
|
||||
|
||||
unionFind.union(outerLine, innerLine);
|
||||
|
||||
});
|
||||
});
|
||||
|
||||
return unionFind.getGroups()
|
||||
.stream()
|
||||
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private double calculateMeanHeight(List<Line> lines) {
|
||||
|
||||
double meanHeight = 0.0;
|
||||
double weights = 0.0;
|
||||
for (Line line : lines) {
|
||||
double weight = line.getLength();
|
||||
meanHeight += line.getHeightDirAdj() * weight;
|
||||
weights += weight;
|
||||
}
|
||||
meanHeight /= weights;
|
||||
return meanHeight;
|
||||
}
|
||||
|
||||
|
||||
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
Set<Word> words = lines.stream()
|
||||
.map(Line::getWords)
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toSet());
|
||||
Collection<Set<Word>> groupedLines = TextPositionOperations.groupByLine(words);
|
||||
|
||||
List<Line> sortedLines = TextPositionOperations.sortLines(groupedLines);
|
||||
return new Zone(sortedLines);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
|
||||
|
||||
/** Helpers for tolerance-based comparison of doubles. */
public final class DoubleUtils {

    private DoubleUtils() {
        // utility class, no instances
    }

    /**
     * Compares two doubles, treating values closer than {@code precision} as
     * equal. NaN operands are ordered via {@link Double#compare} (NaN sorts
     * above every other value).
     *
     * <p>NOTE(review): with a non-zero precision this relation is not
     * transitive, so comparators built on it technically violate the
     * {@link java.util.Comparator} contract; acceptable here for near-sorted
     * layout coordinates, but do not use it with sorts that assume strict
     * ordering guarantees.
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Double.isNaN(d1) || Double.isNaN(d2)) {
            return Double.compare(d1, d2);
        }

        if (Math.abs(d1 - d2) < precision) {
            return 0;
        }

        return Double.compare(d1, d2);
    }

}
|
||||
@ -0,0 +1,76 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
|
||||
|
||||
/**
 * Table-based approximation of {@link Math#atan2(double, double)}.
 *
 * <p>Precomputes atan values for {@code SIZE_AC} slope steps in the first
 * octant; a lookup reduces any (y, x) pair to that octant via sign and
 * ratio-inversion symmetry. Error is on the order of 1e-3 rad.
 */
public final class FastAtan2 {

    private static final int SIZE_AC = 1000;
    private static final int SIZE_AR = SIZE_AC + 1;
    // Bug fix: PI was previously written as (float) Math.PI, truncating the
    // constant through float and costing precision in every table entry.
    private static final double PI = Math.PI;
    private static final double PI_H = PI / 2;

    private static final double[] ATAN2 = new double[SIZE_AR];
    private static final double[] ATAN2_PM = new double[SIZE_AR];
    private static final double[] ATAN2_MP = new double[SIZE_AR];
    private static final double[] ATAN2_MM = new double[SIZE_AR];

    private static final double[] ATAN2_R = new double[SIZE_AR];
    private static final double[] ATAN2_RPM = new double[SIZE_AR];
    private static final double[] ATAN2_RMP = new double[SIZE_AR];
    private static final double[] ATAN2_RMM = new double[SIZE_AR];

    static {
        for (int i = 0; i <= SIZE_AC; i++) {
            double slope = (double) i / SIZE_AC;
            double v = Math.atan2(slope, 1);
            ATAN2[i] = v;            // y >= 0, x >= 0, |y| <= |x|
            ATAN2_PM[i] = PI - v;    // mirrored across the y axis
            ATAN2_MP[i] = -v;        // mirrored across the x axis
            ATAN2_MM[i] = -PI + v;   // mirrored across both axes

            ATAN2_R[i] = PI_H - v;   // same quadrants with the ratio inverted (|y| > |x|)
            ATAN2_RPM[i] = PI_H + v;
            ATAN2_RMP[i] = -PI_H + v;
            ATAN2_RMM[i] = -PI_H - v;
        }
    }

    private FastAtan2() {
        // static lookup table only
    }

    /**
     * Approximate atan2 in the range (-pi, pi]. Uses local absolute values
     * instead of reassigning the parameters (old code suppressed
     * ParameterAssignment for that).
     */
    public static double fastAtan2(double y, double x) {

        if (y < 0) {
            if (x < 0) {
                if (y < x) { // equivalent to -y > -x, i.e. |y| > |x|
                    return ATAN2_RMM[(int) (x / y * SIZE_AC)];
                }
                return ATAN2_MM[(int) (y / x * SIZE_AC)];
            }
            double ay = -y;
            if (ay > x) {
                return ATAN2_RMP[(int) (x / ay * SIZE_AC)];
            }
            return ATAN2_MP[(int) (ay / x * SIZE_AC)];
        }

        if (x < 0) {
            double ax = -x;
            if (y > ax) {
                return ATAN2_RPM[(int) (ax / y * SIZE_AC)];
            }
            return ATAN2_PM[(int) (y / ax * SIZE_AC)];
        }

        if (y > x) {
            return ATAN2_R[(int) (x / y * SIZE_AC)];
        }
        return ATAN2[(int) (y / x * SIZE_AC)];
    }

}
|
||||
@ -1,28 +1,29 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public abstract class AbstractPageBlock {
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public abstract class AbstractPageBlock extends BoundingBox {
|
||||
|
||||
@JsonIgnore
|
||||
protected float minX;
|
||||
@JsonIgnore
|
||||
protected float maxX;
|
||||
@JsonIgnore
|
||||
protected float minY;
|
||||
@JsonIgnore
|
||||
protected float maxY;
|
||||
@JsonIgnore
|
||||
protected PageBlockType classification;
|
||||
|
||||
Set<LayoutEngine> engines = new HashSet<>();
|
||||
|
||||
@JsonIgnore
|
||||
protected int page;
|
||||
|
||||
@ -39,42 +40,6 @@ public abstract class AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public boolean containsBlock(TextPageBlock other) {
|
||||
|
||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(AbstractPageBlock other) {
|
||||
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
|
||||
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractPageBlock atc) {
|
||||
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
}
|
||||
public abstract boolean isEmpty();
|
||||
|
||||
}
|
||||
|
||||
@ -3,9 +3,11 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -23,9 +25,12 @@ public class ClassificationDocument {
|
||||
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
private LayoutDebugLayer layoutDebugLayer = new LayoutDebugLayer();
|
||||
private boolean headlines;
|
||||
|
||||
private SectionGrid sectionGrid = new SectionGrid();
|
||||
private long rulesVersion;
|
||||
|
||||
private OutlineObjectTree outlineObjectTree;
|
||||
private SectionTree sectionTree;
|
||||
|
||||
}
|
||||
|
||||
@ -8,21 +8,26 @@ import java.util.Map;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
|
||||
public class ClassificationPage {
|
||||
|
||||
@NonNull
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<OutlineObject> outlineObjects = new ArrayList<>();
|
||||
|
||||
private List<AbstractPageBlock> headlines = new ArrayList<>();
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private Rectangle bodyTextFrame;
|
||||
@ -40,8 +45,8 @@ public class ClassificationPage {
|
||||
private float pageWidth;
|
||||
private float pageHeight;
|
||||
|
||||
CleanRulings cleanRulings;
|
||||
private CleanRulings cleanRulings;
|
||||
|
||||
private Map<String, Rectangle2D> markedContentBboxPerType = new HashMap<>();
|
||||
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();
|
||||
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
@ -11,6 +12,7 @@ import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@Deprecated
|
||||
public class ClassificationSection {
|
||||
|
||||
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||
@ -29,4 +31,10 @@ public class ClassificationSection {
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
public List<AbstractPageBlock> getNonEmptyPageBlocks() {
|
||||
|
||||
return pageBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
/**
 * Pairs a parsed document with the debug-visualization layer produced while
 * building it.
 */
public record DocumentWithVisualization(Document document, LayoutDebugLayer layoutDebugLayer) {

    /** Counts the document's semantic sub-nodes, grouped by node type. */
    public Map<NodeType, Long> buildSemanticNodeCounts() {
        return document.streamAllSubNodes()
                .collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
    }

}
|
||||
@ -1,6 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -12,10 +11,14 @@ import lombok.Getter;
|
||||
@Getter
|
||||
public class FloatFrequencyCounter {
|
||||
|
||||
Map<Float, Integer> countPerValue = new HashMap<>();
|
||||
Map<Double, Integer> countPerValue = new HashMap<>();
|
||||
boolean changed;
|
||||
Double mostPopularCache;
|
||||
|
||||
|
||||
public void add(float value) {
|
||||
public void add(double value) {
|
||||
|
||||
changed = true;
|
||||
|
||||
if (!countPerValue.containsKey(value)) {
|
||||
countPerValue.put(value, 1);
|
||||
@ -25,9 +28,11 @@ public class FloatFrequencyCounter {
|
||||
}
|
||||
|
||||
|
||||
public void addAll(Map<Float, Integer> otherCounter) {
|
||||
public void addAll(Map<Double, Integer> otherCounter) {
|
||||
|
||||
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
|
||||
changed = true;
|
||||
|
||||
for (Map.Entry<Double, Integer> entry : otherCounter.entrySet()) {
|
||||
if (countPerValue.containsKey(entry.getKey())) {
|
||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
||||
} else {
|
||||
@ -37,36 +42,36 @@ public class FloatFrequencyCounter {
|
||||
}
|
||||
|
||||
|
||||
public Float getMostPopular() {
|
||||
public Double getMostPopular() {
|
||||
|
||||
Map.Entry<Float, Integer> mostPopular = null;
|
||||
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
if (changed || mostPopularCache == null) {
|
||||
Map.Entry<Double, Integer> mostPopular = null;
|
||||
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
|
||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
||||
mostPopular = entry;
|
||||
}
|
||||
}
|
||||
mostPopularCache = mostPopular != null ? mostPopular.getKey() : 0;
|
||||
changed = false;
|
||||
}
|
||||
return mostPopular != null ? mostPopular.getKey() : null;
|
||||
|
||||
return mostPopularCache;
|
||||
}
|
||||
|
||||
|
||||
public List<Float> getHighterThanMostPopular() {
|
||||
public List<Double> getValuesInReverseOrder() {
|
||||
|
||||
Float mostPopular = getMostPopular();
|
||||
List<Float> higher = new ArrayList<>();
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
if (value > mostPopular) {
|
||||
higher.add(value);
|
||||
}
|
||||
}
|
||||
|
||||
return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
|
||||
return countPerValue.keySet()
|
||||
.stream()
|
||||
.sorted(Collections.reverseOrder())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public Float getHighest() {
|
||||
public Double getHighest() {
|
||||
|
||||
Float highest = null;
|
||||
for (Float value : countPerValue.keySet()) {
|
||||
Double highest = null;
|
||||
for (Double value : countPerValue.keySet()) {
|
||||
if (highest == null || value > highest) {
|
||||
highest = value;
|
||||
}
|
||||
|
||||
@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
|
||||
public class LineInformation {
|
||||
|
||||
List<Rectangle2D> lineBBox;
|
||||
List<List<TextPositionSequence>> sequencesByLines;
|
||||
List<List<Word>> sequencesByLines;
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||
List<List<List<Word>>> sequencesWithGapsByLines;
|
||||
|
||||
}
|
||||
|
||||
@ -9,12 +9,14 @@ public enum PageBlockType {
|
||||
H6,
|
||||
HEADER,
|
||||
FOOTER,
|
||||
TITLE,
|
||||
PARAGRAPH,
|
||||
PARAGRAPH_BOLD,
|
||||
PARAGRAPH_ITALIC,
|
||||
PARAGRAPH_UNKNOWN,
|
||||
OTHER,
|
||||
TABLE_OF_CONTENTS_HEADLINE,
|
||||
TABLE_OF_CONTENTS_ITEM,
|
||||
LIST_ITEM,
|
||||
TABLE;
|
||||
|
||||
|
||||
@ -31,8 +33,21 @@ public enum PageBlockType {
|
||||
}
|
||||
|
||||
|
||||
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||
|
||||
return switch (pageBlockType) {
|
||||
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
|
||||
case H2 -> 2;
|
||||
case H3 -> 3;
|
||||
case H4 -> 4;
|
||||
case H5 -> 5;
|
||||
default -> 6;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public boolean isHeadline() {
|
||||
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
|
||||
}
|
||||
}
|
||||
|
||||
@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -15,7 +15,7 @@ import lombok.Getter;
|
||||
@AllArgsConstructor
|
||||
public class PageContents {
|
||||
|
||||
List<TextPositionSequence> sortedTextPositionSequences;
|
||||
List<Word> sortedWords;
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
List<Ruling> rulings;
|
||||
|
||||
@ -3,26 +3,32 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
|
||||
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
|
||||
|
||||
private enum Format {
|
||||
public enum Format {
|
||||
EMPTY,
|
||||
NUMERICAL,
|
||||
ALPHANUMERIC,
|
||||
DOCUMENT
|
||||
}
|
||||
|
||||
@Getter
|
||||
Format format;
|
||||
@Getter
|
||||
String identifierString;
|
||||
List<Integer> identifiers;
|
||||
boolean asChild;
|
||||
@ -38,6 +44,10 @@ public class SectionIdentifier {
|
||||
if (numericalIdentifierMatcher.find()) {
|
||||
return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
|
||||
}
|
||||
Matcher alphanumericIdentifierMatcher = alphanumericIdentifierPattern.matcher(headline);
|
||||
if (alphanumericIdentifierMatcher.find()) {
|
||||
return buildAlphanumericSectionIdentifier(headline, alphanumericIdentifierMatcher);
|
||||
}
|
||||
// more formats here
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
@ -72,7 +82,36 @@ public class SectionIdentifier {
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
}
|
||||
return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false);
|
||||
return new SectionIdentifier(Format.NUMERICAL,
|
||||
identifierString,
|
||||
identifiers.stream()
|
||||
.toList(),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
private static SectionIdentifier buildAlphanumericSectionIdentifier(String headline, Matcher alphanumericIdentifierMatcher) {
|
||||
|
||||
String identifierString = headline.substring(alphanumericIdentifierMatcher.start(), alphanumericIdentifierMatcher.end());
|
||||
|
||||
String alphanumericIdentifier = alphanumericIdentifierMatcher.group(0).substring(0, 1).toUpperCase(Locale.ENGLISH);
|
||||
int mappedCharacterValue = alphanumericIdentifier.charAt(0) - 'A' + 1;
|
||||
List<Integer> identifiers = new LinkedList<>();
|
||||
identifiers.add(mappedCharacterValue);
|
||||
|
||||
for (int i = 1; i <= 3; i++) {
|
||||
String numericalIdentifier = alphanumericIdentifierMatcher.group(i);
|
||||
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
|
||||
break;
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
}
|
||||
|
||||
return new SectionIdentifier(Format.ALPHANUMERIC,
|
||||
identifierString,
|
||||
identifiers.stream()
|
||||
.toList(),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
@ -120,4 +159,22 @@ public class SectionIdentifier {
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
|
||||
public boolean isEmpty() {
|
||||
|
||||
return this.format.equals(Format.EMPTY);
|
||||
}
|
||||
|
||||
|
||||
public int level() {
|
||||
|
||||
return identifiers.size();
|
||||
}
|
||||
|
||||
|
||||
protected List<Integer> getIdentifiers() {
|
||||
|
||||
return identifiers;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,144 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Setter;
|
||||
|
||||
@Setter
|
||||
@EqualsAndHashCode
|
||||
public class Boundary implements Comparable<Boundary> {
|
||||
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
public Boundary(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
|
||||
public int length() {
|
||||
|
||||
return end - start;
|
||||
}
|
||||
|
||||
|
||||
public int start() {
|
||||
|
||||
return start;
|
||||
}
|
||||
|
||||
|
||||
public int end() {
|
||||
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Boundary boundary) {
|
||||
|
||||
return start <= boundary.start() && boundary.end() <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(Boundary boundary) {
|
||||
|
||||
return boundary.contains(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return this.start <= start && end <= this.end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
}
|
||||
return start <= this.start && this.end <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(int index) {
|
||||
|
||||
return start <= index && index < end;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(Boundary boundary) {
|
||||
|
||||
return boundary.start() < this.end && this.start < boundary.end();
|
||||
}
|
||||
|
||||
|
||||
public List<Boundary> split(List<Integer> splitIndices) {
|
||||
|
||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||
}
|
||||
List<Boundary> splitBoundaries = new LinkedList<>();
|
||||
int previousIndex = start;
|
||||
for (int splitIndex : splitIndices) {
|
||||
|
||||
// skip split if it would produce a boundary of length 0
|
||||
if (splitIndex == previousIndex) {
|
||||
continue;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
|
||||
previousIndex = splitIndex;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, end));
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
public IntStream intStream() {
|
||||
|
||||
return IntStream.range(start, end);
|
||||
}
|
||||
|
||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||
|
||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
||||
return new Boundary(minStart, maxEnd);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return format("Boundary [%d|%d)", start, end);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Boundary boundary) {
|
||||
|
||||
if (end < boundary.end() && start < boundary.start()) {
|
||||
return -1;
|
||||
}
|
||||
if (start > boundary.start() && end > boundary.end()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,217 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode
|
||||
public class DocumentTree {
|
||||
|
||||
private final Entry root;
|
||||
|
||||
|
||||
public DocumentTree(Document document) {
|
||||
|
||||
root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
|
||||
}
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {
|
||||
|
||||
return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
|
||||
}
|
||||
|
||||
|
||||
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
|
||||
private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {
|
||||
|
||||
if (!entryExists(parentId)) {
|
||||
throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
|
||||
}
|
||||
|
||||
Entry parent = getEntryById(parentId);
|
||||
List<Integer> newId = new LinkedList<>(parentId);
|
||||
newId.add(parent.children.size());
|
||||
parent.children.add(Entry.builder().treeId(newId).node(node).build());
|
||||
|
||||
return newId;
|
||||
}
|
||||
|
||||
|
||||
private boolean entryExists(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root != null;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
if (id >= entry.children.size() || 0 > id) {
|
||||
return false;
|
||||
}
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public Entry getParentEntryById(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(getParentId(treeId));
|
||||
}
|
||||
|
||||
|
||||
public boolean hasParentById(List<Integer> treeId) {
|
||||
|
||||
return !treeId.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodes(List<Integer> treeId) {
|
||||
|
||||
return getEntryById(treeId).children.stream().map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {
|
||||
|
||||
return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getParentId(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
throw new UnsupportedOperationException("Root has no parent!");
|
||||
}
|
||||
if (treeId.size() < 2) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return treeId.subList(0, treeId.size() - 1);
|
||||
}
|
||||
|
||||
|
||||
public Entry getEntryById(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
Entry entry = root.children.get(treeId.get(0));
|
||||
for (int id : treeId.subList(1, treeId.size())) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> mainEntries() {
|
||||
|
||||
return root.children.stream();
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> allEntriesInOrder() {
|
||||
|
||||
return Stream.of(root).flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {
|
||||
|
||||
return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Entry> flatten(Entry entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
|
||||
}
|
||||
|
||||
|
||||
public SemanticNode getHighestParentById(List<Integer> treeId) {
|
||||
|
||||
if (treeId.isEmpty()) {
|
||||
return root.node;
|
||||
}
|
||||
return root.children.get(treeId.get(0)).node;
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public static class Entry {
|
||||
|
||||
List<Integer> treeId;
|
||||
SemanticNode node;
|
||||
@Builder.Default
|
||||
List<Entry> children = new LinkedList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return node.toString();
|
||||
}
|
||||
|
||||
|
||||
public NodeType getType() {
|
||||
|
||||
return node.getType();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,8 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
|
||||
|
||||
public enum EntityType {
|
||||
ENTITY,
|
||||
RECOMMENDATION,
|
||||
FALSE_POSITIVE,
|
||||
FALSE_RECOMMENDATION
|
||||
}
|
||||
@ -1,228 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.Deque;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class RedactionEntity {
|
||||
|
||||
// initial values
|
||||
@EqualsAndHashCode.Include
|
||||
final Boundary boundary;
|
||||
@EqualsAndHashCode.Include
|
||||
final String type;
|
||||
@EqualsAndHashCode.Include
|
||||
final EntityType entityType;
|
||||
|
||||
// empty defaults
|
||||
boolean redaction;
|
||||
boolean removed;
|
||||
boolean ignored;
|
||||
boolean resized;
|
||||
boolean skipRemoveEntitiesContainedInLarger;
|
||||
boolean dictionaryEntry;
|
||||
boolean dossierDictionaryEntry;
|
||||
Set<Engine> engines;
|
||||
Set<RedactionEntity> references;
|
||||
@Builder.Default
|
||||
Deque<Integer> matchedRules = new LinkedList<>();
|
||||
String redactionReason;
|
||||
String legalBasis;
|
||||
|
||||
// inferred on graph insertion
|
||||
@EqualsAndHashCode.Include
|
||||
String value;
|
||||
String textBefore;
|
||||
String textAfter;
|
||||
@Builder.Default
|
||||
Set<Page> pages = new HashSet<>();
|
||||
List<RedactionPosition> redactionPositionsPerPage;
|
||||
@Builder.Default
|
||||
List<SemanticNode> intersectingNodes = new LinkedList<>();
|
||||
SemanticNode deepestFullyContainingNode;
|
||||
|
||||
|
||||
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
|
||||
|
||||
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {
|
||||
|
||||
return intersectingNodes.stream().anyMatch(clazz::isInstance);
|
||||
}
|
||||
|
||||
|
||||
public boolean occursInNode(SemanticNode semanticNode) {
|
||||
|
||||
return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
|
||||
}
|
||||
|
||||
|
||||
public boolean isType(String type) {
|
||||
|
||||
return this.type.equals(type);
|
||||
}
|
||||
|
||||
|
||||
public boolean isAnyType(List<String> types) {
|
||||
|
||||
return types.contains(type);
|
||||
}
|
||||
|
||||
|
||||
public void addIntersectingNode(SemanticNode containingNode) {
|
||||
|
||||
intersectingNodes.add(containingNode);
|
||||
}
|
||||
|
||||
|
||||
public void removeFromGraph() {
|
||||
|
||||
intersectingNodes.forEach(node -> node.getEntities().remove(this));
|
||||
pages.forEach(page -> page.getEntities().remove(this));
|
||||
intersectingNodes = new LinkedList<>();
|
||||
deepestFullyContainingNode = null;
|
||||
pages = new HashSet<>();
|
||||
removed = true;
|
||||
ignored = true;
|
||||
}
|
||||
|
||||
|
||||
public void addMatchedRule(int ruleNumber) {
|
||||
|
||||
matchedRules.add(ruleNumber);
|
||||
}
|
||||
|
||||
|
||||
public int getMatchedRule() {
|
||||
|
||||
if (matchedRules.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
return matchedRules.getLast();
|
||||
}
|
||||
|
||||
|
||||
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
||||
|
||||
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
|
||||
|
||||
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new RuntimeException("No Positions found on any page!"));
|
||||
String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
|
||||
redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
|
||||
}
|
||||
return redactionPositionsPerPage;
|
||||
}
|
||||
|
||||
|
||||
private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {
|
||||
|
||||
if (entry.getKey().equals(firstPage)) {
|
||||
return new RedactionPosition(id, entry.getKey(), entry.getValue());
|
||||
} else {
|
||||
return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.containedBy(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.contains(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(RedactionEntity redactionEntity) {
|
||||
|
||||
return this.boundary.intersects(redactionEntity.getBoundary());
|
||||
}
|
||||
|
||||
|
||||
public void addEngine(Engine engine) {
|
||||
|
||||
engines.add(engine);
|
||||
}
|
||||
|
||||
|
||||
public void addEngines(Set<Engine> engines) {
|
||||
|
||||
this.engines.addAll(engines);
|
||||
}
|
||||
|
||||
|
||||
public void addReference(RedactionEntity reference) {
|
||||
|
||||
references.add(reference);
|
||||
}
|
||||
|
||||
|
||||
public void addReferences(List<RedactionEntity> references) {
|
||||
|
||||
this.references.addAll(references);
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesAnnotationId(String manualRedactionId) {
|
||||
|
||||
return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("Entity[\"");
|
||||
sb.append(value);
|
||||
sb.append("\", ");
|
||||
sb.append(boundary);
|
||||
sb.append(", pages[");
|
||||
pages.forEach(page -> {
|
||||
sb.append(page.getNumber());
|
||||
sb.append(", ");
|
||||
});
|
||||
sb.delete(sb.length() - 2, sb.length());
|
||||
sb.append("], type = \"");
|
||||
sb.append(type);
|
||||
sb.append("\", EntityType.");
|
||||
sb.append(entityType);
|
||||
sb.append("]");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,24 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class RedactionPosition {
|
||||
|
||||
final String id;
|
||||
Page page;
|
||||
// Each entry in this list corresponds to an entry in the redaction log, this means:
|
||||
// An entity might be represented by multiple redaction log entries
|
||||
List<Rectangle2D> rectanglePerLine;
|
||||
|
||||
}
|
||||
@ -1,120 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.amazonaws.services.kms.model.NotFoundException;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Document implements GenericSemanticNode {
|
||||
|
||||
Set<Page> pages;
|
||||
DocumentTree documentTree;
|
||||
Integer numberOfPages;
|
||||
TextBlock textBlock;
|
||||
@Builder.Default
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.DOCUMENT;
|
||||
}
|
||||
|
||||
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
public List<Section> getMainSections() {
|
||||
|
||||
return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Integer> getTreeId() {
|
||||
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setTreeId(List<Integer> tocId) {
|
||||
|
||||
throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!"));
|
||||
}
|
||||
|
||||
|
||||
private Stream<SemanticNode> streamAllNodes() {
|
||||
|
||||
return documentTree.allEntriesInOrder().map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Image> streamAllImages() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBox = new HashMap<>();
|
||||
for (Page page : pages) {
|
||||
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
|
||||
}
|
||||
return bBox;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,65 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Footer implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.FOOTER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,5 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
/**
 * Marker interface grouping the concrete node implementations of the document graph
 * (Document, Footer, Header, Headline, Image, Paragraph, Section, ...).
 * It adds no members beyond {@link SemanticNode}.
 */
public interface GenericSemanticNode extends SemanticNode {

}
|
||||
@ -1,65 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Header implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.HEADER;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,72 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Headline implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.HEADLINE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Headline getHeadline() {
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,95 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Image implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
String id;
|
||||
|
||||
ImageType imageType;
|
||||
boolean transparent;
|
||||
Rectangle2D position;
|
||||
|
||||
boolean redaction;
|
||||
boolean ignored;
|
||||
@Builder.Default
|
||||
String redactionReason = "";
|
||||
@Builder.Default
|
||||
String legalBasis = "";
|
||||
@Builder.Default
|
||||
int matchedRule = -1;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Page page;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.IMAGE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Page> getPages() {
|
||||
|
||||
return Collections.singleton(page);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
bBoxPerPage.put(page, position);
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,23 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * The kind of content an {@link Image} node represents.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OCR;


    /**
     * Parses a case-insensitive type name into an ImageType.
     * Unrecognised names (including "other") map to {@link #OTHER}.
     *
     * @param imageType the raw type name, must not be null
     * @return the matching ImageType, or {@link #OTHER} when nothing matches
     */
    public static ImageType fromString(String imageType) {

        String normalized = imageType.toLowerCase(Locale.ROOT);
        if (normalized.equals("logo")) {
            return LOGO;
        }
        if (normalized.equals("formula")) {
            return FORMULA;
        }
        if (normalized.equals("signature")) {
            return SIGNATURE;
        }
        if (normalized.equals("ocr")) {
            return OCR;
        }
        return OTHER;
    }
}
|
||||
@ -1,87 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Page {
|
||||
|
||||
Integer number;
|
||||
Integer height;
|
||||
Integer width;
|
||||
Integer rotation;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
List<SemanticNode> mainBody;
|
||||
@EqualsAndHashCode.Exclude
|
||||
Header header;
|
||||
@EqualsAndHashCode.Exclude
|
||||
Footer footer;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<Image> images = new HashSet<>();
|
||||
|
||||
|
||||
public static Page fromClassificationPage(ClassificationPage classificationPage) {
|
||||
|
||||
return Page.builder()
|
||||
.height((int) classificationPage.getPageHeight())
|
||||
.width((int) classificationPage.getPageWidth())
|
||||
.number(classificationPage.getPageNumber())
|
||||
.rotation(classificationPage.getRotation())
|
||||
.mainBody(new LinkedList<>())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public TextBlock getMainBodyTextBlock() {
|
||||
|
||||
return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.valueOf(number);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return number;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof Page && o.hashCode() == this.hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,63 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Paragraph implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.PARAGRAPH;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,77 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Section implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
|
||||
TextBlock textBlock;
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.SECTION;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasTables() {
|
||||
|
||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public Headline getHeadline() {
|
||||
|
||||
return streamChildrenOfType(NodeType.HEADLINE)//
|
||||
.map(node -> (Headline) node)//
|
||||
.findFirst()//
|
||||
.orElseGet(() -> getParent().getHeadline());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,453 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
public interface SemanticNode {
|
||||
|
||||
/**
|
||||
* Returns the type of this node, such as Section, Paragraph, etc.
|
||||
*
|
||||
* @return NodeType of this node
|
||||
*/
|
||||
NodeType getType();
|
||||
|
||||
|
||||
/**
|
||||
* Searches all Nodes located underneath this Node in the DocumentTree and concatenates their AtomicTextBlocks into a single TextBlock.
|
||||
* So, for a Section all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlock
|
||||
* If the Node is a Leaf, the LeafTextBlock will be returned instead.
|
||||
*
|
||||
* @return TextBlock containing all AtomicTextBlocks that are located under this Node.
|
||||
*/
|
||||
TextBlock getTextBlock();
|
||||
|
||||
|
||||
/**
|
||||
* Any Node maintains its own Set of Entities.
|
||||
* This Set contains all Entities whose boundary intersects the boundary of this node.
|
||||
*
|
||||
* @return Set of all Entities associated with this Node
|
||||
*/
|
||||
Set<RedactionEntity> getEntities();
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Set<Page> getPages() {
|
||||
|
||||
return getTextBlock().getPages();
|
||||
}
|
||||
|
||||
|
||||
default Page getFirstPage() {
|
||||
|
||||
return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Set<Page> getPages(Boundary boundary) {
|
||||
|
||||
if (!getBoundary().contains(boundary)) {
|
||||
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
|
||||
}
|
||||
return getTextBlock().getPages(boundary);
|
||||
}
|
||||
|
||||
|
||||
default boolean isOnPage(int pageNumber) {
|
||||
|
||||
return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the DocumentTree Object.
|
||||
*
|
||||
* @return the DocumentTree of the Document this node belongs to
|
||||
*/
|
||||
DocumentTree getDocumentTree();
|
||||
|
||||
|
||||
/**
|
||||
* The id is a List of Integers uniquely identifying this node in the DocumentTree.
|
||||
*
|
||||
* @return the DocumentTree ID
|
||||
*/
|
||||
List<Integer> getTreeId();
|
||||
|
||||
|
||||
/**
|
||||
* This should only be used during graph construction.
|
||||
*
|
||||
* @param tocId List of Integers
|
||||
*/
|
||||
void setTreeId(List<Integer> tocId);
|
||||
|
||||
|
||||
/**
|
||||
* Traverses the Tree up, until it hits a Headline or hits a Section which will then return the first Headline from its children.
|
||||
* Throws NotFoundException if no Headline is found this way
|
||||
*
|
||||
* @return First Headline found
|
||||
*/
|
||||
default Headline getHeadline() {
|
||||
|
||||
return getParent().getHeadline();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if its TocId has a length greater than zero.
|
||||
*
|
||||
* @return boolean indicating whether this Node has a Parent in the DocumentTree
|
||||
*/
|
||||
default boolean hasParent() {
|
||||
|
||||
return getDocumentTree().hasParentById(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return The SemanticNode representing the Parent in the DocumentTree
|
||||
* throws NotFoundException, when no parent is present
|
||||
*/
|
||||
default SemanticNode getParent() {
|
||||
|
||||
return getDocumentTree().getParentEntryById(getTreeId()).getNode();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return The SemanticNode which is directly underneath the document and also under which this node is.
|
||||
* if this is the highest child node or the document itself, it returns itself.
|
||||
*/
|
||||
default SemanticNode getHighestParent() {
|
||||
|
||||
return getDocumentTree().getHighestParentById(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden.
|
||||
* Currently only Sections, Images, and Tables are not leaves.
|
||||
* A TableCell might be a leaf depending on its area compared to the page.
|
||||
*
|
||||
* @return boolean, indicating if a Node has direct access to a TextBlock
|
||||
*/
|
||||
default boolean isLeaf() {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden.
|
||||
* Currently only Sections and Tables are no leaves.
|
||||
*
|
||||
* @return AtomicTextBlock
|
||||
*/
|
||||
default TextBlock getLeafTextBlock() {
|
||||
|
||||
throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Should only be used during construction of the Graph. Sets the LeafTextBlock of this SemanticNode.
|
||||
*
|
||||
* @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode
|
||||
*/
|
||||
default void setLeafTextBlock(TextBlock textBlock) {
|
||||
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode has any Entity with EntityType.ENTITY of the provided type.
|
||||
*
|
||||
* @param type string representing the type of entity to check for
|
||||
* @return true, if this SemanticNode has at least one Entity of the provided type
|
||||
*/
|
||||
default boolean hasEntitiesOfType(String type) {
|
||||
|
||||
return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author".
|
||||
*
|
||||
* @param type string representing the type of entities to return
|
||||
* @return List of RedactionEntities of any the type
|
||||
*/
|
||||
default List<RedactionEntity> getEntitiesOfType(String type) {
|
||||
|
||||
return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author".
|
||||
*
|
||||
* @param types A list of strings representing the types of entities to return
|
||||
* @return List of RedactionEntities of any provided type
|
||||
*/
|
||||
default List<RedactionEntity> getEntitiesOfType(List<String> types) {
|
||||
|
||||
return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
|
||||
* If this node does not have any AtomicTexBlocks underneath it, e.g. an empty TableCell. It returns -1.
|
||||
*
|
||||
* @return Integer representing the number on the page
|
||||
*/
|
||||
default Integer getNumberOnPage() {
|
||||
|
||||
TextBlock textBlock = getTextBlock();
|
||||
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
|
||||
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if the SemanticNode contains any text.
|
||||
*
|
||||
* @return true, if this node's TextBlock is not empty
|
||||
*/
|
||||
default boolean hasText() {
|
||||
|
||||
return !getTextBlock().isEmpty();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains the provided String.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string
|
||||
*/
|
||||
default boolean containsString(String string) {
|
||||
|
||||
return getTextBlock().getSearchText().contains(string);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains all strings
|
||||
*/
|
||||
default boolean containsStrings(List<String> strings) {
|
||||
|
||||
return strings.stream().allMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains all the provided Strings ignoring case.
|
||||
*
|
||||
* @param string A String which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains the string ignoring case
|
||||
*/
|
||||
default boolean containsStringIgnoreCase(String string) {
|
||||
|
||||
return getTextBlock().getSearchText().toLowerCase(Locale.ROOT).contains(string.toLowerCase(Locale.ROOT));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the strings
|
||||
*/
|
||||
default boolean containsAnyString(List<String> strings) {
|
||||
|
||||
return strings.stream().anyMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether this SemanticNode contains any of the provided Strings ignoring case.
|
||||
*
|
||||
* @param strings A List of Strings which the TextBlock might contain
|
||||
* @return true, if this node's TextBlock contains any of the strings
|
||||
*/
|
||||
default boolean containsAnyStringIgnoreCase(List<String> strings) {
|
||||
|
||||
return strings.stream().anyMatch(this::containsStringIgnoreCase);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity.
|
||||
* It sets the fields accordingly and recursively calls this function on all its children.
|
||||
*
|
||||
* @param redactionEntity RedactionEntity, which is being inserted into the graph
|
||||
*/
|
||||
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
|
||||
|
||||
TextBlock textBlock = getTextBlock();
|
||||
if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
|
||||
if (textBlock.containsBoundary(redactionEntity.getBoundary())) {
|
||||
redactionEntity.setDeepestFullyContainingNode(this);
|
||||
}
|
||||
|
||||
redactionEntity.addIntersectingNode(this);
|
||||
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary()))
|
||||
.forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
default Stream<SemanticNode> streamChildren() {
|
||||
|
||||
return getDocumentTree().childNodes(getTreeId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the DocumentTree of the provided type.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
default Stream<SemanticNode> streamChildrenOfType(NodeType nodeType) {
|
||||
|
||||
return getDocumentTree().childNodesOfType(getTreeId(), nodeType);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodes() {
|
||||
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
|
||||
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The Boundary is the start and end string offsets in the reading order of the document.
|
||||
*
|
||||
* @return Boundary of this Node's TextBlock
|
||||
*/
|
||||
default Boundary getBoundary() {
|
||||
|
||||
return getTextBlock().getBoundary();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* If this Node is a Leaf it will calculate the boundingBox of its LeafTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
|
||||
* If called on the Document, it will return the cropbox of each page
|
||||
*
|
||||
* @return Rectangle2D fully encapsulating this Node for each page.
|
||||
*/
|
||||
default Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
if (isLeaf()) {
|
||||
return getBBoxFromLeafTextBlock(bBoxPerPage);
|
||||
}
|
||||
|
||||
return getBBoxFromChildren(bBoxPerPage);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page.
|
||||
*
|
||||
* @param rectangle2D The rectangle to check if it is contained
|
||||
* @param pageNumber The Page number on which the rectangle should be checked
|
||||
* @return boolean
|
||||
*/
|
||||
default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) {
|
||||
|
||||
Page helperPage = Page.builder().number(pageNumber).build();
|
||||
if (!getPages().contains(helperPage)) {
|
||||
return false;
|
||||
}
|
||||
return getBBox().get(helperPage).contains(rectangle2D);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* TODO: this produces unwanted results for sections spanning multiple columns.
|
||||
*
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of the BoundingBoxes of all children
|
||||
*/
|
||||
private Map<Page, Rectangle2D> getBBoxFromChildren(Map<Page, Rectangle2D> bBoxPerPage) {
|
||||
|
||||
return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
|
||||
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
||||
return map2;
|
||||
}).orElse(bBoxPerPage);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of all BoundingBoxes of the TextBlock of this node
|
||||
*/
|
||||
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock(Map<Page, Rectangle2D> bBoxPerPage) {
|
||||
|
||||
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,317 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Set;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Table implements SemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
DocumentTree documentTree;
|
||||
|
||||
int numberOfRows;
|
||||
int numberOfCols;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains any of the provided strings.
|
||||
*
|
||||
* @param strings Strings to check whether a row contains them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
|
||||
|
||||
return IntStream.range(0, numberOfRows)
|
||||
.boxed()
|
||||
.filter(row -> rowContainsStringsIgnoreCase(row, strings))
|
||||
.flatMap(this::streamRow)
|
||||
.map(TableCell::getEntities)
|
||||
.flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks whether the specified row contains all the provided strings.
|
||||
*
|
||||
* @param row the row to check as an Integer, must be smaller than numberOfRows
|
||||
* @param strings a list of strings to check for
|
||||
* @return true, if all strings appear in the provided row
|
||||
*/
|
||||
public boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings) {
|
||||
|
||||
String rowText = streamRow(row).map(TableCell::getTextBlock).collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT);
|
||||
return strings.stream().map(String::toLowerCase).allMatch(rowText::contains);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param value the string which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
|
||||
|
||||
List<Integer> vertebrateStudyCols = streamHeaders().filter(headerNode -> headerNode.containsString(header)).map(TableCell::getCol).toList();
|
||||
return streamTableCells().filter(tableCellNode -> vertebrateStudyCols.stream()
|
||||
.anyMatch(vertebrateStudyCol -> getCell(tableCellNode.getRow(), vertebrateStudyCol).containsString(value))).map(TableCell::getEntities).flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
|
||||
*
|
||||
* @param header the header value to search for
|
||||
* @param values the strings which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
|
||||
|
||||
List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header)).map(TableCell::getCol).toList();
|
||||
return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
|
||||
.anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values))).map(TableCell::getEntities).flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
|
||||
*
|
||||
* @param types type strings to check whether a row contains an entity like them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {
|
||||
|
||||
List<Integer> rowsWithEntityOfType = IntStream.range(0, numberOfRows)
|
||||
.boxed()
|
||||
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream().anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
|
||||
.toList();
|
||||
|
||||
return rowsWithEntityOfType.stream().flatMap(this::streamRow).map(TableCell::getEntities).flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
|
||||
*
|
||||
* @param types type strings to check whether a row doesn't contain an entity like it
|
||||
* @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {
|
||||
|
||||
List<Integer> rowsWithNoEntityOfType = IntStream.range(0, numberOfRows)
|
||||
.boxed()
|
||||
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream().anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
|
||||
.toList();
|
||||
|
||||
return rowsWithNoEntityOfType.stream().flatMap(this::streamRow).map(TableCell::getEntities).flatMap(Collection::stream);
|
||||
}
|
||||
|
||||
|
||||
private Stream<String> streamEntityTypesInRow(Integer rowNumber) {
|
||||
|
||||
return streamRow(rowNumber).map(TableCell::getEntities).flatMap(Collection::stream).map(RedactionEntity::getType).distinct();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns a TableCell at the provided row and column location.
|
||||
*
|
||||
* @param row int representing the row, must be smaller than numberOfRows
|
||||
* @param col int representing the col, must be smaller than numberOfCols
|
||||
* @return TableCell at the provided location in the table
|
||||
*/
|
||||
public TableCell getCell(int row, int col) {
|
||||
|
||||
if (numberOfRows - row < 0 || numberOfCols - col < 0) {
|
||||
throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols));
|
||||
}
|
||||
int idx = row * numberOfCols + col;
|
||||
return (TableCell) documentTree.getEntryById(treeId).getChildren().get(idx).getNode();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells in this Table row-wise.
|
||||
*
|
||||
* @return Stream of all TableCells
|
||||
*/
|
||||
public Stream<TableCell> streamTableCells() {
|
||||
|
||||
return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells in this Table which have the provided header row-wise.
|
||||
*
|
||||
* @return Stream of all TableCells which have the provided header
|
||||
*/
|
||||
public Stream<TableCell> streamTableCellsWithHeader(String header) {
|
||||
|
||||
return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header))
|
||||
.map(TableCell::getCol)
|
||||
.flatMap(this::streamCol)
|
||||
.filter(tableCellNode -> !tableCellNode.isHeader());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells belonging to the provided column from top down.
|
||||
*
|
||||
* @param col int representing the column
|
||||
* @return Stream of all TableCell in the provided column
|
||||
*/
|
||||
public Stream<TableCell> streamCol(int col) {
|
||||
|
||||
return IntStream.range(0, numberOfRows).boxed().map(row -> getCell(row, col));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells belonging to the provided row from left to right.
|
||||
*
|
||||
* @param row int representing the row
|
||||
* @return Stream of all TableCell in the provided row
|
||||
*/
|
||||
public Stream<TableCell> streamRow(int row) {
|
||||
|
||||
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells row-wise and filters them with header == true.
|
||||
*
|
||||
* @return Stream of all TableCells with header == true
|
||||
*/
|
||||
public Stream<TableCell> streamHeaders() {
|
||||
|
||||
return streamTableCells().filter(TableCell::isHeader);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells of the provided row and column and filters them with header == true.
|
||||
*
|
||||
* @param row int representing the row
|
||||
* @param col int representing the column
|
||||
* @return Stream of all TableCells with header == true in the provided row or col
|
||||
*/
|
||||
public Stream<TableCell> streamHeadersForCell(int row, int col) {
|
||||
|
||||
return Stream.concat(streamRow(row), streamCol(col)).filter(TableCell::isHeader);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all Headers and checks if any equal the provided string.
|
||||
*
|
||||
* @param header string to check the headers for
|
||||
* @return true, if at least one header equals the provided string
|
||||
*/
|
||||
public boolean hasHeader(String header) {
|
||||
|
||||
return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
|
||||
*
|
||||
* @param header string to find header cells
|
||||
* @param value string to check cells with provided header
|
||||
* @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
|
||||
*/
|
||||
public boolean hasRowWithHeaderAndValue(String header, String value) {
|
||||
|
||||
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
|
||||
*
|
||||
* @param header string to find header cells
|
||||
* @param values List of strings to check cells with provided header
|
||||
* @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
|
||||
*/
|
||||
public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {
|
||||
|
||||
return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
*
|
||||
* @param type the type of entities to search for
|
||||
* @param redactionEntity the entity, which appears in the row to search
|
||||
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
*/
|
||||
public List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity) {
|
||||
|
||||
return redactionEntity.getIntersectingNodes()
|
||||
.stream()
|
||||
.filter(node -> node instanceof TableCell)
|
||||
.map(node -> (TableCell) node)
|
||||
.flatMap(tableCellNode -> streamRow(tableCellNode.getRow()))
|
||||
.map(cell -> cell.getEntitiesOfType(type))
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.TABLE;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,91 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableCell implements GenericSemanticNode {
|
||||
|
||||
List<Integer> treeId;
|
||||
int row;
|
||||
int col;
|
||||
boolean header;
|
||||
|
||||
Rectangle2D bBox;
|
||||
|
||||
TextBlock leafTextBlock;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
DocumentTree documentTree;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, Rectangle2D> getBBox() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
getPages().forEach(page -> bBoxPerPage.put(page, bBox));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public NodeType getType() {
|
||||
|
||||
return NodeType.TABLE_CELL;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isLeaf() {
|
||||
|
||||
return getDocumentTree().getEntryById(getTreeId()).getChildren().isEmpty();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock getTextBlock() {
|
||||
|
||||
if (isLeaf()) {
|
||||
return leafTextBlock;
|
||||
}
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,232 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
Long id;
|
||||
Integer numberOnPage;
|
||||
Page page;
|
||||
|
||||
//string coordinates
|
||||
Boundary boundary;
|
||||
String searchText;
|
||||
@Builder.Default
|
||||
List<Integer> lineBreaks = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Boundary> boldTextBoundaries = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Boundary> italicTextBoundaries = new ArrayList<>();
|
||||
String orientation;
|
||||
int textDirection;
|
||||
|
||||
//position coordinates
|
||||
@Builder.Default
|
||||
List<Integer> stringIdxToPositionIdx = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Rectangle2D> positions = new ArrayList<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
SemanticNode parent;
|
||||
|
||||
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return lineBreaks.size() + 1;
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
||||
List<Integer> lineBreaks,
|
||||
List<Boundary> boldTextBoundaries,
|
||||
List<Boundary> italicTextBoundaries,
|
||||
List<Rectangle2D> positions,
|
||||
List<Integer> stringIdxToPositionIdx,
|
||||
long idx,
|
||||
SemanticNode parent,
|
||||
int numberOnPage,
|
||||
Page page,
|
||||
int offset,
|
||||
String orientation,
|
||||
int textDirection) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(idx)
|
||||
.parent(parent)
|
||||
.searchText(searchText)
|
||||
.numberOnPage(numberOnPage)
|
||||
.page(page)
|
||||
.lineBreaks(lineBreaks)
|
||||
.boldTextBoundaries(boldTextBoundaries)
|
||||
.italicTextBoundaries(italicTextBoundaries)
|
||||
.positions(positions)
|
||||
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
||||
.boundary(new Boundary(offset, offset + searchText.length()))
|
||||
.textDirection(textDirection)
|
||||
.orientation(orientation)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(textBlockIdx)
|
||||
.boundary(new Boundary(stringOffset, stringOffset))
|
||||
.searchText("")
|
||||
.page(page)
|
||||
.numberOnPage(numberOnPage)
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
|
||||
DocumentPositionData documentPositionData,
|
||||
SemanticNode parent,
|
||||
Page page) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(documentTextData.getId())
|
||||
.numberOnPage(documentTextData.getNumberOnPage())
|
||||
.page(page)
|
||||
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
|
||||
.searchText(documentTextData.getSearchText())
|
||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
|
||||
.positions(toRectangle2DList(documentPositionData.getPositions()))
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
||||
|
||||
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
|
||||
}
|
||||
|
||||
|
||||
public CharSequence getLine(int lineNumber) {
|
||||
|
||||
if (lineNumber >= numberOfLines() || lineNumber < 0) {
|
||||
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||
}
|
||||
if (lineNumber == 0) {
|
||||
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
|
||||
} else if (lineNumber == numberOfLines() - 1) {
|
||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
||||
}
|
||||
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<AtomicTextBlock> getAtomicTextBlocks() {
|
||||
|
||||
return List.of(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
|
||||
.findFirst() //
|
||||
.orElse(searchText.length()) + boundary.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
|
||||
.reduce((a, b) -> b)//
|
||||
.orElse(0) + boundary.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
||||
|
||||
if (!containsBoundary(stringBoundary)) {
|
||||
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
|
||||
}
|
||||
if (stringBoundary.length() == 0) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
|
||||
|
||||
if (stringBoundary.end() == this.boundary.end()) {
|
||||
return positions.subList(startPositionIdx, positions.size());
|
||||
}
|
||||
|
||||
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
|
||||
|
||||
}
|
||||
|
||||
|
||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
|
||||
|
||||
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
|
||||
.stream()
|
||||
.map(this::getPositions)
|
||||
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
|
||||
rectanglePerLinePerPage.put(page, rectanglesPerLine);
|
||||
return rectanglePerLinePerPage;
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
|
||||
|
||||
return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return searchText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,220 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    // Underlying atomic blocks in reading order; kept consecutive in string offsets by concat().
    List<AtomicTextBlock> atomicTextBlocks;
    // Lazily built concatenation of all atomic search texts; invalidated (null) by concat().
    String searchText;
    // Overall string-offset range covered by this block; (-1, -1) while no blocks are present.
    Boundary boundary;


    /**
     * Creates a ConcatenatedTextBlock with no atomic blocks and boundary (-1, -1).
     */
    public static ConcatenatedTextBlock empty() {

        return new ConcatenatedTextBlock(Collections.emptyList());
    }


    /**
     * Builds a concatenated block from the given atomic blocks, which must be consecutive
     * in string offsets (each block's start equals the previous block's end); otherwise
     * concat() throws UnsupportedOperationException.
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            boundary = new Boundary(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());

        // Remaining blocks are appended one by one so the consecutiveness check in concat() applies.
        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
    }


    /**
     * Appends the given TextBlock's atomic blocks to this one and extends the boundary.
     *
     * @param textBlock block to append; must start exactly where this block currently ends
     *                  (unless this block is still empty, in which case it is adopted as-is)
     * @return this, for chaining
     * @throws UnsupportedOperationException if the blocks are not consecutive
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {

        if (this.atomicTextBlocks.isEmpty()) {
            // First content: adopt the incoming block's boundary, replacing the (-1, -1) sentinel.
            boundary.setStart(textBlock.getBoundary().start());
            boundary.setEnd(textBlock.getBoundary().end());
        } else if (boundary.end() != textBlock.getBoundary().start()) {
            throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
        boundary.setEnd(textBlock.getBoundary().end());
        // The cached concatenated text is stale now; getSearchText() rebuilds it on demand.
        this.searchText = null;
        return this;
    }


    // Atomic block whose boundary contains the given document offset.
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {

        return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
    }


    // All atomic blocks whose boundary overlaps the given range, in reading order.
    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {

        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
    }


    /**
     * Concatenated text of all atomic blocks; built lazily and cached until the next concat().
     */
    @Override
    public String getSearchText() {

        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }


    /**
     * Total number of recorded line breaks across all atomic blocks.
     * NOTE(review): AtomicTextBlock.numberOfLines() returns lineBreaks.size() + 1, while this
     * sums only the break counts — the two conventions differ by one per block; confirm which
     * is intended before relying on this value.
     */
    @Override
    public int numberOfLines() {

        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
    }


    /**
     * Delegates to the atomic block containing fromIndex.
     *
     * @throws IndexOutOfBoundsException if no atomic block contains fromIndex
     */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }


    /**
     * Delegates to the atomic block containing fromIndex.
     *
     * @throws IndexOutOfBoundsException if no atomic block contains fromIndex
     */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }


    /**
     * All line breaks of all atomic blocks, in reading order.
     * Note these are the blocks' own (block-relative) values, not document offsets.
     */
    @Override
    public List<Integer> getLineBreaks() {

        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
    }


    /**
     * Glyph rectangle for the character at the given document offset.
     */
    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }


    /**
     * Glyph rectangles for the given range, stitched across atomic blocks:
     * a partial slice of the first block, all positions of every middle block,
     * and a partial slice of the last block.
     * NOTE(review): assumes stringBoundary overlaps at least one atomic block —
     * textBlocks.get(0) would throw otherwise; confirm callers guarantee this.
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        // Middle blocks (exclusive of first and last) contribute all their positions.
        // getPositions() here is the Lombok-generated getter for the positions field.
        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }

        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return positions;
    }


    /**
     * Like getPositions(Boundary), but keyed by page: first and last blocks contribute
     * partial ranges, middle blocks their full range, merged page-wise.
     */
    @Override
    public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositionsPerPage(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
        }

        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
                lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return rectanglesPerLinePerPage;
    }


    // Merges two page→rectangles maps, concatenating the rectangle lists of shared pages.
    private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {

        Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
        map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
        return mergedMap;
    }


    @Override
    public String toString() {

        return getSearchText();
    }


    /**
     * Bold ranges of all atomic blocks, in reading order.
     */
    @Override
    public List<Boundary> getBoldTextBoundaries() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
    }


    /**
     * Italic ranges of all atomic blocks, in reading order.
     */
    @Override
    public List<Boundary> getItalicTextBoundaries() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
    }


    /**
     * Orientation of the first atomic block, or "" when empty.
     * NOTE(review): presumably all atomic blocks share one orientation — only the first is consulted.
     */
    @Override
    public String getOrientation() {

        if (atomicTextBlocks.isEmpty()) {
            return "";
        }
        return atomicTextBlocks.get(0).getOrientation();
    }


    /**
     * Text direction of the first atomic block, or 0 when empty.
     */
    @Override
    public int getTextDirection() {

        if (atomicTextBlocks.isEmpty()) {
            return 0;
        }
        return atomicTextBlocks.get(0).getTextDirection();
    }

}
|
||||
@ -1,148 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
|
||||
/**
 * A block of document text addressed by a global string {@link Boundary}: all index-based
 * operations ({@code indexOf}, {@code charAt}, {@code subSequence}, …) take document-wide
 * string indices and internally shift them by {@code getBoundary().start()} before applying
 * them to {@link #getSearchText()}. Implementations are composed of {@link AtomicTextBlock}s.
 */
public interface TextBlock extends CharSequence {

    // The raw text of this block, indexed locally from 0; all default methods offset
    // global indices by getBoundary().start() before reading from it.
    String getSearchText();


    // The atomic (single-page) blocks this block is composed of.
    List<AtomicTextBlock> getAtomicTextBlocks();


    // Boundaries of bold runs within this block.
    List<Boundary> getBoldTextBoundaries();


    // Boundaries of italic runs within this block.
    List<Boundary> getItalicTextBoundaries();


    // Orientation label of the block's text — semantics defined by the implementation.
    String getOrientation();


    // Text direction of the block — semantics defined by the implementation.
    int getTextDirection();


    // The global [start, end) string range this block covers.
    Boundary getBoundary();


    // Index of the next linebreak at or after fromIndex (global index).
    int getNextLinebreak(int fromIndex);


    // Index of the previous linebreak at or before fromIndex (global index).
    int getPreviousLinebreak(int fromIndex);


    // All linebreak indices within this block (global indices).
    List<Integer> getLineBreaks();


    // Rectangle of the glyph at a single global string index.
    Rectangle2D getPosition(int stringIdx);


    // Rectangles covering the given global string range.
    List<Rectangle2D> getPositions(Boundary stringBoundary);


    // Rectangles covering the given global string range, grouped by page.
    Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);


    // Number of text lines in this block.
    int numberOfLines();


    /**
     * Finds the first occurrence of {@code searchTerm}, searching from the start of this
     * block. Returns a global string index, or -1 when absent.
     */
    default int indexOf(String searchTerm) {

        return indexOf(searchTerm, getBoundary().start());
    }


    /** All pages any atomic block of this block lies on. */
    default Set<Page> getPages() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
    }


    /** Pages of the atomic blocks whose boundary intersects {@code boundary}. */
    default Set<Page> getPages(Boundary boundary) {

        return getAtomicTextBlocks().stream()
                                    .filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary))
                                    .map(AtomicTextBlock::getPage)
                                    .collect(Collectors.toUnmodifiableSet());
    }


    /**
     * Finds the first occurrence of {@code searchTerm} at or after the global index
     * {@code startOffset}. Returns a global string index, or -1 when absent.
     */
    default int indexOf(String searchTerm, int startOffset) {

        // Shift into local coordinates for the String search, then shift the hit back.
        int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
        if (start == -1) {
            return -1;
        }
        return start + getBoundary().start();
    }


    /** The text from the block's start up to (but not including) the first linebreak. */
    default CharSequence getFirstLine() {

        return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
    }


    /**
     * Whether this block's boundary fully contains {@code boundary}.
     *
     * @throws IllegalArgumentException if the boundary's end precedes its start
     */
    default boolean containsBoundary(Boundary boundary) {

        if (boundary.end() < boundary.start()) {
            throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
        }
        return getBoundary().contains(boundary);
    }


    /** Whether the global index {@code stringIndex} falls inside this block's boundary. */
    default boolean containsIndex(int stringIndex) {

        return getBoundary().contains(stringIndex);
    }


    /** Convenience overload: subsequence for a Boundary's [start, end) range. */
    default CharSequence subSequence(Boundary boundary) {

        return subSequence(boundary.start(), boundary.end());
    }


    /** A short summary: the first (up to) four space-separated words of the text. */
    default String buildSummary() {

        String[] words = getSearchText().split(" ");
        int bound = Math.min(words.length, 4);
        List<String> list = new ArrayList<>(Arrays.asList(words).subList(0, bound));

        return String.join(" ", list);
    }


    /**
     * {@inheritDoc}
     *
     * <p>{@code start} and {@code end} are global indices; they are shifted by the
     * block's boundary start before slicing the search text.
     */
    @Override
    default CharSequence subSequence(int start, int end) {

        return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
    }


    // Length of the block, i.e. the length of its boundary.
    @Override
    default int length() {

        return getBoundary().length();
    }


    /**
     * {@inheritDoc}
     *
     * <p>{@code index} is a global index, shifted by the boundary start before lookup.
     */
    @Override
    default char charAt(int index) {

        return getSearchText().charAt(index - getBoundary().start());
    }

}
|
||||
@ -1,49 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@NoArgsConstructor
|
||||
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {
|
||||
|
||||
@Override
|
||||
public Supplier<ConcatenatedTextBlock> supplier() {
|
||||
|
||||
return ConcatenatedTextBlock::empty;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<ConcatenatedTextBlock> combiner() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<ConcatenatedTextBlock, TextBlock> finisher() {
|
||||
|
||||
return a -> a;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT);
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,13 +2,15 @@ package com.knecon.fforesight.service.layoutparser.processor.model.image;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
public class ClassifiedImage {
|
||||
|
||||
@ -16,10 +18,20 @@ public class ClassifiedImage {
|
||||
private Rectangle2D position;
|
||||
@NonNull
|
||||
private ImageType imageType;
|
||||
private boolean sourceByAi;
|
||||
private boolean isAppendedToSection;
|
||||
@NonNull
|
||||
private boolean hasTransparency;
|
||||
@NonNull
|
||||
private int page;
|
||||
private String representation;
|
||||
|
||||
|
||||
public ClassifiedImage(@NonNull Rectangle2D position, @NonNull ImageType imageType, boolean hasTransparency, int page, String representation) {
|
||||
|
||||
this.position = position;
|
||||
this.imageType = imageType;
|
||||
this.hasTransparency = hasTransparency;
|
||||
this.page = page;
|
||||
this.representation = representation;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,229 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.cos.COSArray;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class OutlineExtractorService {
|
||||
|
||||
private static final String PDDESTINATION_TYPE_FIT = "Fit";
|
||||
private static final String PDDESTINATION_TYPE_FIT_B = "FitB";
|
||||
private static final String PDDESTINATION_TYPE_FIT_H = "FitH";
|
||||
private static final String PDDESTINATION_TYPE_FIT_V = "FitV";
|
||||
private static final String PDDESTINATION_TYPE_FIT_R = "FitR";
|
||||
private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH";
|
||||
private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV";
|
||||
private static final String PDDESTINATION_TYPE_XYZ = "XYZ";
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
|
||||
|
||||
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
|
||||
|
||||
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
if (documentOutline != null) {
|
||||
for (PDOutlineItem child : documentOutline.children()) {
|
||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
|
||||
outlineObjectWithChildren.ifPresent(rootNodes::add);
|
||||
}
|
||||
}
|
||||
|
||||
return new OutlineObjectTree(rootNodes);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
||||
|
||||
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
|
||||
if (outlineObject.isPresent()) {
|
||||
for (var child : item.children()) {
|
||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
|
||||
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
|
||||
}
|
||||
}
|
||||
|
||||
return outlineObject;
|
||||
}
|
||||
|
||||
|
||||
// if the structure elements are processed beforehand, another case can be handled here as well:
|
||||
// outline objects can reference structure elements (see pdf documentation)
|
||||
@SneakyThrows
|
||||
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||
|
||||
String title = item.getTitle();
|
||||
|
||||
PDPage page;
|
||||
try {
|
||||
// Can throw: "Error: can't convert to Destination COSArray" for some OCR'd PDFs
|
||||
page = item.findDestinationPage(document);
|
||||
if (page == null) {
|
||||
return Optional.empty();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.info(String.format("Error occurred during position resolution for outline item with title %s: " + e, title));
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
int pageNumber = document.getPages().indexOf(page) + 1;
|
||||
AffineTransform userSpaceToPageCoords = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(PageInformation.fromPDPage(pageNumber, page));
|
||||
|
||||
Optional<Point2D> outlinePosition = Optional.empty();
|
||||
|
||||
try {
|
||||
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
|
||||
PDDestinationNameTreeNode destinations = null;
|
||||
if (names != null) {
|
||||
destinations = names.getDests();
|
||||
}
|
||||
|
||||
PDDestination destination = item.getDestination();
|
||||
if (destination != null) {
|
||||
outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject());
|
||||
}
|
||||
|
||||
if (outlinePosition.isEmpty()) {
|
||||
|
||||
PDAction action = item.getAction();
|
||||
if (action != null) {
|
||||
outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
|
||||
}
|
||||
|
||||
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title,
|
||||
pageNumber,
|
||||
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
|
||||
}
|
||||
|
||||
|
||||
private static Point2D transformPointToPageCoords(Optional<Point2D> outlinePosition, AffineTransform userSpaceToPageCoords) {
|
||||
|
||||
return outlinePosition.map(point -> userSpaceToPageCoords.transform(point, null)).orElse(null);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static Optional<Point2D> extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) {
|
||||
|
||||
if (isGoToAction(cosDictionary)) {
|
||||
COSBase cosBase = cosDictionary.getItem(COSName.D);
|
||||
return getLocationFromCOSBase(destinations, cosBase);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Point2D> getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
|
||||
|
||||
if (cosBase != null) {
|
||||
if (cosBase instanceof COSArray cosArray) {
|
||||
return getLocationFromCosArray(cosArray);
|
||||
}
|
||||
|
||||
if (cosBase instanceof COSString cosString) {
|
||||
String destinationName = cosString.getString();
|
||||
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
|
||||
return getLocationFromCosArray(cosArray);
|
||||
}
|
||||
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Point2D> getLocationFromCosArray(COSArray cosArray) {
|
||||
|
||||
boolean located = false;
|
||||
float x = 0;
|
||||
float y = 0;
|
||||
|
||||
try {
|
||||
|
||||
PDDestination destination = PDDestination.create(cosArray);
|
||||
COSName type = (COSName) cosArray.getObject(1);
|
||||
String typeString = type.getName();
|
||||
|
||||
switch (typeString) {
|
||||
case PDDESTINATION_TYPE_FIT_V:
|
||||
case PDDESTINATION_TYPE_FIT_BV:
|
||||
PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination;
|
||||
x = fitHeightDestination.getLeft();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_FIT_R:
|
||||
PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination;
|
||||
x = fitRectangleDestination.getLeft();
|
||||
y = fitRectangleDestination.getTop();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_FIT_H:
|
||||
case PDDESTINATION_TYPE_FIT_BH:
|
||||
PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination;
|
||||
y = fitWidthDestination.getTop();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_XYZ:
|
||||
PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination;
|
||||
x = xyzDestination.getLeft();
|
||||
y = xyzDestination.getTop();
|
||||
located = true;
|
||||
break;
|
||||
case PDDESTINATION_TYPE_FIT:
|
||||
case PDDESTINATION_TYPE_FIT_B:
|
||||
default:
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static boolean isGoToAction(COSDictionary cosDictionary) {
|
||||
|
||||
return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto");
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,77 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
public class OutlineObject {
|
||||
|
||||
@Getter
|
||||
private final String title;
|
||||
@Getter
|
||||
private final int pageNumber;
|
||||
@Getter
|
||||
private final int treeDepth;
|
||||
|
||||
private Point2D point; // java coordinates, (0, 0) is always top left
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
private boolean found;
|
||||
|
||||
|
||||
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||
|
||||
this.title = title;
|
||||
this.pageNumber = pageNumber;
|
||||
this.treeDepth = depth;
|
||||
this.point = point2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObject{" + "title='" + title + '\'' + '}';
|
||||
}
|
||||
|
||||
|
||||
public Optional<Point2D> getPoint() {
|
||||
|
||||
return Optional.ofNullable(point);
|
||||
}
|
||||
|
||||
|
||||
public boolean isAbove(BoundingBox boundingBox) {
|
||||
|
||||
if (point == null) {
|
||||
return true;
|
||||
}
|
||||
return point.getY() <= boundingBox.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double distance(BoundingBox boundingBox) {
|
||||
|
||||
if (point == null) {
|
||||
return 0;
|
||||
}
|
||||
if (boundingBox.getBBox().contains(point)) {
|
||||
return 0;
|
||||
}
|
||||
double deltaX = Math.min(Math.abs(boundingBox.getMinX() - point.getX()), Math.abs(boundingBox.getMaxX() - point.getX()));
|
||||
double deltaY = Math.min(Math.abs(boundingBox.getMinY() - point.getY()), Math.abs(boundingBox.getMaxY() - point.getY()));
|
||||
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public void resetPoint() {
|
||||
|
||||
this.point = null;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,66 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class OutlineObjectTree {
|
||||
|
||||
private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
|
||||
private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();
|
||||
|
||||
|
||||
public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {
|
||||
|
||||
this.rootNodes = rootNodes;
|
||||
flattenNodesAndGroupByPage(rootNodes);
|
||||
}
|
||||
|
||||
|
||||
private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {
|
||||
|
||||
for (OutlineObjectTreeNode node : outlineObjectTreeNodes) {
|
||||
int pageNumber = node.getOutlineObject().getPageNumber();
|
||||
if (!this.outlineObjectsPerPage.containsKey(pageNumber)) {
|
||||
outlineObjectsPerPage.put(pageNumber, new ArrayList<>());
|
||||
}
|
||||
outlineObjectsPerPage.get(pageNumber).add(node.getOutlineObject());
|
||||
|
||||
if (!node.getChildren().isEmpty()) {
|
||||
flattenNodesAndGroupByPage(node.getChildren());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("OutlineObjectTree(\n");
|
||||
for (OutlineObjectTreeNode node : rootNodes) {
|
||||
buildString(node, sb, 1);
|
||||
}
|
||||
sb.append(")");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private void buildString(OutlineObjectTreeNode node, StringBuilder sb, int depth) {
|
||||
|
||||
for (int i = 0; i < depth; i++) {
|
||||
sb.append(" ");
|
||||
}
|
||||
sb.append(node.getOutlineObject().getTitle()).append("\n");
|
||||
|
||||
for (OutlineObjectTreeNode child : node.getChildren()) {
|
||||
buildString(child, sb, depth + 1);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public class OutlineObjectTreeNode {
|
||||
|
||||
private OutlineObject outlineObject;
|
||||
|
||||
private List<OutlineObjectTreeNode> children = new ArrayList<>();
|
||||
|
||||
|
||||
public OutlineObjectTreeNode(OutlineObject outlineObject) {
|
||||
|
||||
this.outlineObject = outlineObject;
|
||||
}
|
||||
|
||||
|
||||
public void addChild(OutlineObjectTreeNode outlineObject) {
|
||||
|
||||
children.add(outlineObject);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObjectTreeNode{" + "outlineObject=" + outlineObject + '}';
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,136 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Stack;
|
||||
|
||||
import org.springframework.lang.NonNull;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class SectionTree implements Iterable<SectionTreeEntry> {
|
||||
|
||||
private List<SectionTreeEntry> mainSections = new ArrayList<>();
|
||||
|
||||
|
||||
public SectionTree(List<SectionTreeEntry> mainSections) {
|
||||
|
||||
this.mainSections = mainSections;
|
||||
}
|
||||
|
||||
|
||||
public List<TextPageBlock> getAllTextPageBlocks() {
|
||||
|
||||
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
|
||||
for (SectionTreeEntry item : mainSections) {
|
||||
collectTextPageBlocks(item, allTextPageBlocks);
|
||||
}
|
||||
return allTextPageBlocks;
|
||||
}
|
||||
|
||||
|
||||
private void collectTextPageBlocks(SectionTreeEntry item, List<TextPageBlock> textPageBlocks) {
|
||||
|
||||
textPageBlocks.add(item.getHeadline());
|
||||
for (SectionTreeEntry child : item.getChildren()) {
|
||||
collectTextPageBlocks(child, textPageBlocks);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<SectionTreeEntry> getAllTableOfContentItems() {
|
||||
|
||||
List<SectionTreeEntry> allItems = new ArrayList<>();
|
||||
for (SectionTreeEntry item : mainSections) {
|
||||
collectTableOfContentItems(item, allItems);
|
||||
}
|
||||
return allItems;
|
||||
}
|
||||
|
||||
|
||||
private void collectTableOfContentItems(SectionTreeEntry item, List<SectionTreeEntry> allItems) {
|
||||
|
||||
allItems.add(item);
|
||||
for (SectionTreeEntry child : item.getChildren()) {
|
||||
collectTableOfContentItems(child, allItems);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean containsBlock(TextPageBlock block) {
|
||||
|
||||
for (SectionTreeEntry existingItem : this.getMainSections()) {
|
||||
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean containsItem(SectionTreeEntry tocItem) {
|
||||
|
||||
for (SectionTreeEntry existingItem : this.getMainSections()) {
|
||||
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public @NonNull Iterator<SectionTreeEntry> iterator() {
|
||||
|
||||
return new SectionTreeEntryIterator(mainSections);
|
||||
}
|
||||
|
||||
|
||||
private static class SectionTreeEntryIterator implements Iterator<SectionTreeEntry> {
|
||||
|
||||
private final Stack<Iterator<SectionTreeEntry>> stack = new Stack<>();
|
||||
|
||||
|
||||
SectionTreeEntryIterator(List<SectionTreeEntry> mainSections) {
|
||||
|
||||
stack.push(mainSections.iterator());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
|
||||
ensureStackTopIsCurrent();
|
||||
return !stack.isEmpty() && stack.peek().hasNext();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SectionTreeEntry next() {
|
||||
|
||||
ensureStackTopIsCurrent();
|
||||
SectionTreeEntry currentItem = stack.peek().next();
|
||||
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
|
||||
stack.push(currentItem.getChildren()
|
||||
.iterator());
|
||||
}
|
||||
return currentItem;
|
||||
}
|
||||
|
||||
|
||||
private void ensureStackTopIsCurrent() {
|
||||
|
||||
while (!stack.isEmpty() && !stack.peek().hasNext()) {
|
||||
stack.pop();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,82 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
 * Builds a {@link SectionTree} from the classified headline blocks of a document,
 * nesting headlines by their headline level (depth).
 */
@Service
@Slf4j
public class SectionTreeBuilderService {

    /**
     * Walks all headline blocks in document order and nests each one under the most
     * recent headline of a shallower depth; headlines with no eligible parent start a
     * new main section (and reset the depth bookkeeping).
     */
    @Observed(name = "OutlineValidationService", contextualName = "create-toc")
    public SectionTree createSectionTree(ClassificationDocument classificationDocument) {

        List<TextPageBlock> headlines = extractHeadlines(classificationDocument);

        List<SectionTreeEntry> mainSections = new ArrayList<>();
        // Most recent entry seen at each depth — candidate parents for later headlines.
        Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
        SectionTreeEntry last = null;
        // Depths seen since the last main-section reset; floor() finds the nearest shallower depth.
        TreeSet<Integer> depths = new TreeSet<>();

        for (TextPageBlock current : headlines) {
            int currentDepth = getHeadlineNumber(current.getClassification());
            // Deepest previously-seen depth that is strictly shallower than the current one.
            Integer parentDepth = depths.floor(currentDepth - 1);

            var tocItem = new SectionTreeEntry(current);

            if (parentDepth == null) {
                // No shallower headline seen yet: start a new main section and reset state.
                mainSections.add(tocItem);
                lastItemsPerDepth = new HashMap<>();
                depths = new TreeSet<>();

            } else {
                // parentDepth != null implies at least one earlier headline, so last is set.
                assert last != null;
                int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
                if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
                    // headline after toc should always start a main section
                    parentDepth = 1;
                } else if (lastDepth < parentDepth) {
                    // Don't attach deeper than the immediately preceding headline.
                    parentDepth = lastDepth;
                } else if (lastDepth == currentDepth && last.getParent() != null) {
                    // Sibling of the previous headline: share its parent.
                    parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
                }

                // NOTE(review): lastItemsPerDepth.get(parentDepth) may be null when
                // parentDepth was overridden above (e.g. forced to 1 after a TOC) but no
                // entry was recorded at that depth — would NPE on addChild. Confirm
                // against real documents.
                SectionTreeEntry parent = lastItemsPerDepth.get(parentDepth);
                parent.addChild(tocItem);
            }

            last = tocItem;
            lastItemsPerDepth.put(currentDepth, tocItem);
            depths.add(currentDepth);
        }

        return new SectionTree(mainSections);

    }


    /** All text blocks classified as headlines, in page order. */
    private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {

        return classificationDocument.getPages()
                                     .stream()
                                     .flatMap(classificationPage -> classificationPage.getTextBlocks()
                                                                                      .stream()
                                                                                      .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
                                                                                      .map(tb -> (TextPageBlock) tb))
                                     .toList();
    }

}
|
||||
@ -0,0 +1,252 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
@Service
public class SectionTreeEnhancementService {

    /**
     * Walks the document page by page and distributes every classified text block and
     * image onto the entries of the document's {@link SectionTree}. Also collects
     * page headers/footers onto the document and propagates table-header metadata to
     * continuation tables. Content seen before the first matched headline is wrapped
     * in a synthetic entry (with a {@code null} headline) prepended to the tree.
     */
    public void assignSectionBlocksAndImages(ClassificationDocument document) {

        SectionTree toc = document.getSectionTree();
        // The iterator yields TOC entries in document order; each text block is only
        // ever compared against the single "next expected" entry.
        Iterator<SectionTreeEntry> iterator = toc.iterator();
        SectionTreeEntry currentTOCItem = null;
        if (iterator.hasNext()) {
            currentTOCItem = iterator.next();
        }
        // Blocks/images encountered before the first headline match — no owner yet.
        List<AbstractPageBlock> startBlocks = new ArrayList<>();
        List<ClassifiedImage> startImages = new ArrayList<>();
        SectionTreeEntry currentSection = null;
        boolean foundFirstHeadline = false;

        List<ClassificationHeader> headers = new ArrayList<>();
        List<ClassificationFooter> footers = new ArrayList<>();
        // Remembered across pages so a table split over a page break can inherit
        // header metadata from its predecessor.
        TablePageBlock previousTable = null;
        // TOC entries matched on the most recent page that had any match; candidate
        // owners for images on following pages.
        List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();

        for (ClassificationPage page : document.getPages()) {
            List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
            List<TextPageBlock> header = new ArrayList<>();
            List<TextPageBlock> footer = new ArrayList<>();
            for (AbstractPageBlock current : page.getTextBlocks()) {

                // Unclassified blocks are ignored entirely.
                if (current.getClassification() == null) {
                    continue;
                }

                current.setPage(page.getPageNumber());

                // Headers/footers are collected per page and never assigned to sections.
                if (current.getClassification().equals(PageBlockType.HEADER)) {
                    header.add((TextPageBlock) current);
                    continue;
                }

                if (current.getClassification().equals(PageBlockType.FOOTER)) {
                    footer.add((TextPageBlock) current);
                    continue;
                }

                // Propagate header metadata from the previously seen table, then
                // remember this one for the next table block.
                if (current instanceof TablePageBlock table) {
                    if (previousTable != null) {
                        mergeTableMetadata(table, previousTable);
                    }
                    previousTable = table;
                }

                // A text block opens a new section when its text exactly equals the
                // next expected TOC headline's text (matching is by exact equality).
                if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
                    if (!foundFirstHeadline) {
                        foundFirstHeadline = true;
                    }
                    currentSection = currentTOCItem;
                    currentTOCItem.getSectionBlocks().add(current);
                    currentPageTOCItems.add(currentTOCItem);

                    // Advance to the next expected TOC entry (stays on the last one
                    // once the iterator is exhausted).
                    if (iterator.hasNext()) {
                        currentTOCItem = iterator.next();
                    }
                } else if (!foundFirstHeadline) {
                    // Content before the very first headline has no owning section yet.
                    startBlocks.add(current);
                } else {
                    // Safe: foundFirstHeadline implies currentSection was assigned above.
                    currentSection.getSectionBlocks().add(current);
                }
            }

            if (!currentPageTOCItems.isEmpty()) {
                lastFoundTOCItems = currentPageTOCItems;
            }

            // Assign every image of the page: prefer a section whose headline bounding
            // box on this page contains the image position, else fall back below.
            for (ClassifiedImage image : page.getImages()) {

                Double xMin = null;
                Double yMin = null;
                Double xMax = null;
                Double yMax = null;

                for (SectionTreeEntry tocItem : lastFoundTOCItems) {
                    var headline = tocItem.getHeadline();

                    // Only headlines on the current page contribute coordinates.
                    if (headline.getPage() != page.getPageNumber()) {
                        continue;
                    }

                    // Normalise the headline's X extent: min/max may be swapped
                    // (presumably for rotated/RTL text — TODO confirm), so take the
                    // smaller value as xMin either way.
                    if (headline.getMinX() < headline.getMaxX()) {
                        if (xMin == null || headline.getMinX() < xMin) {
                            xMin = headline.getMinX();
                        }
                        if (xMax == null || headline.getMaxX() > xMax) {
                            xMax = headline.getMaxX();
                        }
                    } else {
                        if (xMin == null || headline.getMaxX() < xMin) {
                            xMin = headline.getMaxX();
                        }
                        if (xMax == null || headline.getMinX() > xMax) {
                            xMax = headline.getMinX();
                        }
                    }

                    // Same normalisation for the Y extent.
                    if (headline.getMinY() < headline.getMaxY()) {
                        if (yMin == null || headline.getMinY() < yMin) {
                            yMin = headline.getMinY();
                        }
                        if (yMax == null || headline.getMaxY() > yMax) {
                            yMax = headline.getMaxY();
                        }
                    } else {
                        if (yMin == null || headline.getMaxY() < yMin) {
                            yMin = headline.getMaxY();
                        }
                        if (yMax == null || headline.getMinY() > yMax) {
                            yMax = headline.getMinY();
                        }
                    }

                    log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
                    log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);

                    // All four bounds are non-null here: this headline matched the
                    // current page, so each branch above assigned its pair.
                    if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
                        tocItem.getImages().add(image);
                        image.setAppendedToSection(true);
                        break;
                    }
                }
                if (!image.isAppendedToSection()) {
                    log.debug("Image uses last found section");
                    if (!lastFoundTOCItems.isEmpty()) {
                        lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
                    } else {
                        // No section matched anywhere yet: keep with pre-headline content.
                        startImages.add(image);
                    }
                    image.setAppendedToSection(true);
                }
            }

            if (!header.isEmpty()) {
                headers.add(new ClassificationHeader(header));
            }
            if (!footer.isEmpty()) {
                footers.add(new ClassificationFooter(footer));
            }
        }

        // Wrap everything that appeared before the first headline in a synthetic
        // entry (headline == null) placed at the front of the tree.
        if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
            SectionTreeEntry unassigned = new SectionTreeEntry(null);
            unassigned.setSectionBlocks(startBlocks);
            unassigned.setImages(startImages);
            document.getSectionTree().getMainSections().add(0, unassigned);
        }
        document.setHeaders(headers);
        document.setFooters(footers);
    }


    /**
     * Copies header-cell metadata from {@code previousTable} onto the matching
     * non-header rows of {@code currentTable}, so tables continued across pages keep
     * their column headers. Only applies when the current table has no header info
     * and the previous one does.
     */
    private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {

        // Distribute header information for subsequent tables
        if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
            List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
            List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
            // Allow merging of tables if header row is separated from first logical non-header row
            if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
                // Fabricate a stand-in row whose cells each point at the header-only
                // row's cell as their header.
                previousTableNonHeaderRow = previousTable.getRows().get(0)
                        .stream()
                        .map(cell -> {
                            Cell fakeCell = Cell.copy(cell);
                            fakeCell.setHeaderCells(Collections.singletonList(cell));
                            return fakeCell;
                        })
                        .toList();
            }
            if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
                for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
                    List<Cell> row = currentTable.getRows().get(i);
                    // Only rows with matching width and no header info of their own
                    // inherit the previous table's header cells, column by column.
                    if (row.size() == tableNonHeaderRow.size() && row.stream()
                            .allMatch(cell -> cell.getHeaderCells().isEmpty())) {
                        for (int j = 0; j < row.size(); j++) {
                            row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
                        }
                    }
                }
            }
        }
    }


    /** @return true if at least one cell of the table references a header cell. */
    private boolean hasValidHeaderInformation(TablePageBlock table) {

        return !hasInvalidHeaderInformation(table);
    }


    /** @return true if no cell in the table references any header cell. */
    private boolean hasInvalidHeaderInformation(TablePageBlock table) {

        return table.getRows()
                .stream()
                .flatMap(Collection::stream)
                .allMatch(cell -> cell.getHeaderCells().isEmpty());
    }


    /**
     * Finds, scanning bottom-up, the first row with more than one cell in which no
     * cell is itself a header cell; single-cell rows are skipped (likely captions or
     * spanning rows). Returns an empty list when no such row exists.
     */
    private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {

        for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
            List<Cell> row = table.getRows().get(i);
            if (row.size() == 1) {
                continue;
            }
            boolean allNonHeader = true;
            for (Cell cell : row) {
                if (cell.isHeaderCell()) {
                    allNonHeader = false;
                    break;
                }
            }
            if (allNonHeader) {
                return row;
            }
        }

        return Collections.emptyList();

    }

}
|
||||
@ -0,0 +1,129 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class SectionTreeEntry {
|
||||
|
||||
public enum Type {
|
||||
SECTION,
|
||||
SUPER_SECTION,
|
||||
TOC_SECTION
|
||||
}
|
||||
|
||||
@EqualsAndHashCode.Include
|
||||
private TextPageBlock headline;
|
||||
private List<SectionTreeEntry> children = new ArrayList<>();
|
||||
private SectionTreeEntry parent;
|
||||
|
||||
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private GenericSemanticNode section;
|
||||
|
||||
|
||||
public SectionTreeEntry(TextPageBlock headline) {
|
||||
|
||||
this.headline = headline;
|
||||
}
|
||||
|
||||
|
||||
public Type getType() {
|
||||
|
||||
if (!Objects.isNull(headline) && headline.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_HEADLINE)) {
|
||||
return Type.TOC_SECTION;
|
||||
}
|
||||
if (children.isEmpty()) {
|
||||
return Type.SECTION;
|
||||
}
|
||||
return Type.SUPER_SECTION;
|
||||
}
|
||||
|
||||
|
||||
public void addChild(SectionTreeEntry sectionTreeEntry) {
|
||||
|
||||
children.add(sectionTreeEntry);
|
||||
sectionTreeEntry.setParent(this);
|
||||
}
|
||||
|
||||
|
||||
public SectionTreeEntry getSiblingBefore() {
|
||||
|
||||
if (parent != null) {
|
||||
int index = parent.getChildren().indexOf(this);
|
||||
if (index > 0) {
|
||||
return parent.getChildren().get(index - 1);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public SectionTreeEntry getSiblingAfter() {
|
||||
|
||||
if (parent != null) {
|
||||
int index = parent.getChildren().indexOf(this);
|
||||
if (index >= 0 && index < parent.getChildren().size() - 1) {
|
||||
return parent.getChildren().get(index + 1);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(TextPageBlock block) {
|
||||
|
||||
if (headline.equals(block)) {
|
||||
return true;
|
||||
}
|
||||
for (SectionTreeEntry child : children) {
|
||||
if (child.contains(block)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(SectionTreeEntry tocItem) {
|
||||
|
||||
if (this.equals(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
for (SectionTreeEntry child : children) {
|
||||
if (child.contains(tocItem)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public List<AbstractPageBlock> getNonEmptySectionBlocks() {
|
||||
|
||||
return sectionBlocks.stream()
|
||||
.filter(pageBlock -> !pageBlock.isEmpty())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,12 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
@ -17,7 +20,7 @@ import lombok.NoArgsConstructor;
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@NoArgsConstructor
|
||||
public class Cell extends Rectangle {
|
||||
public class Cell extends BoundingBox {
|
||||
|
||||
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
@ -32,7 +35,24 @@ public class Cell extends Rectangle {
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBox = bBoxPdf;
|
||||
}
|
||||
|
||||
|
||||
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
|
||||
|
||||
this.bBoxPdf = bBoxInitialUserSpace;
|
||||
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Creates a shallow copy of the given cell carrying only its geometry: the copy
 * shares the same {@code bBoxPdf} and {@code bBox} Rectangle2D instances. Text
 * blocks, header cells and any other state of the original are NOT copied.
 */
public static Cell copy(Cell cell) {

    Cell copy = new Cell();
    copy.bBoxPdf = cell.bBoxPdf;
    copy.bBox = cell.bBox;
    return copy;
}
|
||||
|
||||
|
||||
@ -48,12 +68,12 @@ public class Cell extends Rectangle {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
Iterator<TextPageBlock> itty = textBlocks.iterator();
|
||||
TextPositionSequence previous = null;
|
||||
Word previous = null;
|
||||
while (itty.hasNext()) {
|
||||
|
||||
TextPageBlock textBlock = itty.next();
|
||||
|
||||
for (TextPositionSequence word : textBlock.getSequences()) {
|
||||
for (Word word : textBlock.getWords()) {
|
||||
if (previous != null) {
|
||||
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
|
||||
sb.append('\n');
|
||||
@ -67,7 +87,7 @@ public class Cell extends Rectangle {
|
||||
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
|
||||
return TextNormalizationUtilities.cleanString(sb.toString());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,15 +1,206 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
public class CleanRulings {
|
||||
|
||||
List<Ruling> horizontal;
|
||||
List<Ruling> vertical;
|
||||
List<Ruling> horizontals; // unmodifiable sorted by Y list
|
||||
List<Ruling> verticals; // unmodifiable sorted by X list
|
||||
|
||||
|
||||
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
this.horizontals = horizontals.stream()
|
||||
.peek(Ruling::assertHorizontal)
|
||||
.sorted(Comparator.comparing(Line2D.Float::getY1))
|
||||
.toList();
|
||||
this.verticals = verticals.stream()
|
||||
.peek(Ruling::assertVertical)
|
||||
.sorted(Comparator.comparing(Line2D.Float::getX1))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
public CleanRulings getTableLines() {
|
||||
|
||||
return new CleanRulings(horizontals.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||
.toList(),
|
||||
verticals.stream()
|
||||
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public CleanRulings withoutTextRulings() {
|
||||
|
||||
return new CleanRulings(horizontals.stream()
|
||||
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||
.toList(),
|
||||
verticals.stream()
|
||||
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> buildAll() {
|
||||
|
||||
ArrayList<Ruling> rulings = new ArrayList<>(horizontals.size() + verticals.size());
|
||||
rulings.addAll(horizontals);
|
||||
rulings.addAll(verticals);
|
||||
return rulings;
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(BoundingBox a, BoundingBox b) {
|
||||
|
||||
return lineBetween(a.getBBoxPdf(), b.getBBoxPdf());
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {
|
||||
|
||||
return lineBetween(new Point2D.Double(a.getCenterX(), a.getCenterY()), new Point2D.Double(b.getCenterX(), b.getCenterY()));
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Point2D p1, Point2D p2) {
|
||||
|
||||
Ruling ruling = new Ruling(p1, p2);
|
||||
|
||||
if (ruling.isHorizontal()) {
|
||||
return getVerticalsInXInterval(ruling.x1, ruling.x2).stream()
|
||||
.anyMatch(vertical -> vertical.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
if (ruling.isVertical()) {
|
||||
return getHorizontalsInYInterval(ruling.y1, ruling.y2).stream()
|
||||
.anyMatch(horizontal -> horizontal.intersectsLine(ruling));
|
||||
|
||||
}
|
||||
|
||||
return Stream.of(getVerticalsInXInterval(ruling.x1, ruling.x2), getHorizontalsInYInterval(ruling.y1, ruling.y2))
|
||||
.flatMap(Collection::stream)
|
||||
.anyMatch(other -> other.intersectsLine(ruling));
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> getHorizontalsInYInterval(float y1, float y2) {
|
||||
|
||||
float startY = Math.min(y1, y2);
|
||||
float endY = Math.max(y1, y2);
|
||||
|
||||
if (horizontals.isEmpty() || Float.isNaN(startY) || Float.isNaN(endY)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int firstGreaterThanIdx = findFirstHorizontalRulingIdxAbove(startY);
|
||||
|
||||
if (firstGreaterThanIdx == -1) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new LinkedList<>();
|
||||
for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) {
|
||||
Ruling horizontal = horizontals.get(i);
|
||||
if (horizontal.y1 > endY) {
|
||||
break;
|
||||
}
|
||||
result.add(horizontal);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private int findFirstHorizontalRulingIdxAbove(float y) {
|
||||
|
||||
int low = 0;
|
||||
int high = horizontals.size() - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
Line2D.Float midLine = horizontals.get(mid);
|
||||
float midY = midLine.y1;
|
||||
|
||||
if (midY == y) {
|
||||
return mid;
|
||||
} else if (midY > y) {
|
||||
high = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the index of the first element greater than y or -1 if not found
|
||||
return horizontals.size() > low && horizontals.get(low).y1 > y ? low : -1;
|
||||
}
|
||||
|
||||
|
||||
public List<Ruling> getVerticalsInXInterval(float x1, float x2) {
|
||||
|
||||
float startX = Math.min(x1, x2);
|
||||
float endX = Math.max(x1, x2);
|
||||
|
||||
if (verticals.isEmpty() || Float.isNaN(startX) || Float.isNaN(endX)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int firstGreaterThanIdx = findFirstVerticalRulingIdxRightOf(startX);
|
||||
|
||||
if (firstGreaterThanIdx == -1) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new LinkedList<>();
|
||||
for (int i = firstGreaterThanIdx; i < verticals.size(); i++) {
|
||||
Ruling horizontal = verticals.get(i);
|
||||
if (horizontal.x1 > endX) {
|
||||
break;
|
||||
}
|
||||
result.add(horizontal);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private int findFirstVerticalRulingIdxRightOf(float x) {
|
||||
|
||||
int low = 0;
|
||||
int high = verticals.size() - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
Line2D.Float midLine = verticals.get(mid);
|
||||
float midX = midLine.x1;
|
||||
|
||||
if (midX == x) {
|
||||
return mid;
|
||||
} else if (midX > x) {
|
||||
high = mid - 1;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the index of the first element greater than y or -1 if not found
|
||||
return verticals.size() > low && verticals.get(low).x1 > x ? low : -1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,218 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
public class Rectangle extends Rectangle2D.Float {
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
/**
|
||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
||||
* <p>
|
||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
||||
*
|
||||
* @deprecated with no replacement
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
||||
@Override
|
||||
public int compare(Rectangle o1, Rectangle o2) {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
|
||||
} else {
|
||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public Rectangle() {
|
||||
|
||||
super();
|
||||
}
|
||||
|
||||
|
||||
public Rectangle(float top, float left, float width, float height) {
|
||||
|
||||
super();
|
||||
this.setRect(left, top, width, height);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param rectangles
|
||||
* @return minimum bounding box that contains all the rectangles
|
||||
*/
|
||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
||||
|
||||
float minx = java.lang.Float.MAX_VALUE;
|
||||
float miny = java.lang.Float.MAX_VALUE;
|
||||
float maxx = java.lang.Float.MIN_VALUE;
|
||||
float maxy = java.lang.Float.MIN_VALUE;
|
||||
|
||||
for (Rectangle r : rectangles) {
|
||||
minx = (float) Math.min(r.getMinX(), minx);
|
||||
miny = (float) Math.min(r.getMinY(), miny);
|
||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
||||
}
|
||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
||||
}
|
||||
|
||||
|
||||
public int compareTo(Rectangle other) {
|
||||
|
||||
return ILL_DEFINED_ORDER.compare(this, other);
|
||||
}
|
||||
|
||||
|
||||
// I'm bad at Java and need this for fancy sorting in
|
||||
// technology.tabula.TextChunk.
|
||||
public int isLtrDominant() {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
public float getArea() {
|
||||
|
||||
return this.width * this.height;
|
||||
}
|
||||
|
||||
|
||||
public float verticalOverlap(Rectangle other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
}
|
||||
|
||||
|
||||
public boolean verticallyOverlaps(Rectangle other) {
|
||||
|
||||
return verticalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
|
||||
public float horizontalOverlap(Rectangle other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
}
|
||||
|
||||
|
||||
public boolean horizontallyOverlaps(Rectangle other) {
|
||||
|
||||
return horizontalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
|
||||
public float verticalOverlapRatio(Rectangle other) {
|
||||
|
||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
||||
|
||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - this.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - other.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - other.getTop()) / delta;
|
||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - this.getTop()) / delta;
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public float overlapRatio(Rectangle other) {
|
||||
|
||||
double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
||||
|
||||
return (float) (intersectionArea / unionArea);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle merge(Rectangle other) {
|
||||
|
||||
this.setRect(this.createUnion(other));
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public float getTop() {
|
||||
|
||||
return (float) this.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public void setTop(float top) {
|
||||
|
||||
float deltaHeight = top - this.y;
|
||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
||||
}
|
||||
|
||||
|
||||
public float getRight() {
|
||||
|
||||
return (float) this.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public void setRight(float right) {
|
||||
|
||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
||||
}
|
||||
|
||||
|
||||
public float getLeft() {
|
||||
|
||||
return (float) this.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public void setLeft(float left) {
|
||||
|
||||
float deltaWidth = left - this.x;
|
||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
||||
}
|
||||
|
||||
|
||||
public float getBottom() {
|
||||
|
||||
return (float) this.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public void setBottom(float bottom) {
|
||||
|
||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
||||
}
|
||||
|
||||
|
||||
public Point2D[] getPoints() {
|
||||
|
||||
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
|
||||
this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = super.toString();
|
||||
sb.append(s.substring(0, s.length() - 1));
|
||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,28 +4,41 @@ import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Formatter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@SuppressWarnings("all")
|
||||
public class Ruling extends Line2D.Float {
|
||||
|
||||
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
|
||||
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
|
||||
|
||||
public enum Classification {
|
||||
TABLE_LINE,
|
||||
UNDERLINE,
|
||||
STRIKETROUGH,
|
||||
HEADER_SEPARATOR,
|
||||
FOOTER_SEPARATOR,
|
||||
OTHER
|
||||
}
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
private Classification classification;
|
||||
|
||||
|
||||
public Ruling(Point2D p1, Point2D p2) {
|
||||
|
||||
super(p1, p2);
|
||||
this.classification = Classification.OTHER;
|
||||
}
|
||||
|
||||
|
||||
@ -59,126 +72,32 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
public void assertHorizontal() {
|
||||
|
||||
if (isHorizontal()) {
|
||||
return;
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
|
||||
int rv;
|
||||
if (DoubleComparisons.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
throw new IllegalArgumentException("Ruling " + this + " is not horizontal");
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean vertical() {
|
||||
public void assertVertical() {
|
||||
|
||||
if (isVertical()) {
|
||||
return;
|
||||
}
|
||||
throw new IllegalArgumentException("Ruling " + this + " is not vertical");
|
||||
}
|
||||
|
||||
|
||||
public boolean isVertical() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
public boolean horizontal() {
|
||||
public boolean isHorizontal() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
@ -187,36 +106,36 @@ public class Ruling extends Line2D.Float {
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
|
||||
public boolean oblique() {
|
||||
public boolean isOblique() {
|
||||
|
||||
return !(this.vertical() || this.horizontal());
|
||||
return !(this.isVertical() || this.isHorizontal());
|
||||
}
|
||||
|
||||
|
||||
public float getPosition() {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getLeft() : this.getTop();
|
||||
return this.isVertical() ? this.getLeft() : this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public float getStart() {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getTop() : this.getLeft();
|
||||
return this.isVertical() ? this.getTop() : this.getLeft();
|
||||
}
|
||||
|
||||
|
||||
public void setStart(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
if (this.isVertical()) {
|
||||
this.setTop(v);
|
||||
} else {
|
||||
this.setLeft(v);
|
||||
@ -226,19 +145,19 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
public float getEnd() {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getBottom() : this.getRight();
|
||||
return this.isVertical() ? this.getBottom() : this.getRight();
|
||||
}
|
||||
|
||||
|
||||
public void setEnd(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
if (this.isVertical()) {
|
||||
this.setBottom(v);
|
||||
} else {
|
||||
this.setRight(v);
|
||||
@ -248,10 +167,10 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
public void setStartEnd(float start, float end) {
|
||||
|
||||
if (this.oblique()) {
|
||||
if (this.isOblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
if (this.isVertical()) {
|
||||
this.setTop(start);
|
||||
this.setBottom(end);
|
||||
} else {
|
||||
@ -263,11 +182,11 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
public boolean perpendicularTo(Ruling other) {
|
||||
|
||||
return this.vertical() == other.horizontal();
|
||||
return this.isVertical() == other.isHorizontal();
|
||||
}
|
||||
|
||||
|
||||
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
|
||||
public boolean nearlyIntersects(Ruling another) {
|
||||
|
||||
if (this.intersectsLine(another)) {
|
||||
return true;
|
||||
@ -276,9 +195,9 @@ public class Ruling extends Line2D.Float {
|
||||
boolean rv = false;
|
||||
|
||||
if (this.perpendicularTo(another)) {
|
||||
rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
|
||||
rv = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT).intersectsLine(another);
|
||||
} else {
|
||||
rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
|
||||
rv = this.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT).intersectsLine(another.expand(COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT));
|
||||
}
|
||||
|
||||
return rv;
|
||||
@ -317,30 +236,6 @@ public class Ruling extends Line2D.Float {
|
||||
}
|
||||
|
||||
|
||||
public Point2D intersectionPoint(Ruling other) {
|
||||
|
||||
Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling horizontal, vertical;
|
||||
|
||||
if (!this_l.intersectsLine(other_l)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (this_l.horizontal() && other_l.vertical()) {
|
||||
horizontal = this_l;
|
||||
vertical = other_l;
|
||||
} else if (this_l.vertical() && other_l.horizontal()) {
|
||||
vertical = this_l;
|
||||
horizontal = other_l;
|
||||
} else {
|
||||
log.warn("lines must be orthogonal, vertical and horizontal");
|
||||
return null;
|
||||
}
|
||||
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
@ -450,16 +345,9 @@ public class Ruling extends Line2D.Float {
|
||||
|
||||
final float TOLERANCE = 1;
|
||||
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
||||
}
|
||||
|
||||
|
||||
private enum SOType {
|
||||
VERTICAL,
|
||||
HRIGHT,
|
||||
HLEFT
|
||||
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
||||
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
||||
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
@ -11,6 +12,7 @@ import java.util.TreeMap;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
@ -19,7 +21,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
private final TreeMap<CellPosition, Cell> cells = new TreeMap<>();
|
||||
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
|
||||
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@ -28,20 +31,28 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private List<List<Cell>> rows;
|
||||
@Getter
|
||||
@Setter
|
||||
private List<Cell> cells;
|
||||
|
||||
|
||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||
public TablePageBlock(List<Cell> cells, int rotation) {
|
||||
|
||||
setToBBoxOfComponents(cells);
|
||||
this.cells = cells;
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
maxX = area.getRight();
|
||||
maxY = area.getTop();
|
||||
classification = PageBlockType.TABLE;
|
||||
this.rotation = rotation;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
|
||||
return getColCount() == 0 || getRowCount() == 0;
|
||||
}
|
||||
|
||||
|
||||
public List<List<Cell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
@ -72,14 +83,17 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
public int getColCount() {
|
||||
|
||||
return getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
return getRows().stream()
|
||||
.mapToInt(List::size)
|
||||
.max()
|
||||
.orElse(0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Detect header cells (either first row or first column):
|
||||
* Column is marked as header if cell text is bold and row cell text is not bold.
|
||||
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
|
||||
* Defaults to row.
|
||||
*/
|
||||
private void computeHeaders() {
|
||||
@ -87,7 +101,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
}
|
||||
// A bold cell is a header cell as long as every cell to the left/top is bold, too
|
||||
// A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
@ -112,7 +126,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
cellsToTheTop.add(rows.get(i)
|
||||
.get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
@ -127,7 +142,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
|
||||
.get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
@ -143,7 +159,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -154,7 +170,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||
Cell cell = cells.get(new CellPosition(j, i));
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -165,7 +181,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedColCount; j++) {
|
||||
Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
@ -179,17 +195,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
private void add(Cell chunk, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cells.put(cp, chunk);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addCells(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
@ -198,11 +203,12 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
|
||||
|
||||
List<List<Cell>> rowsOfCells = calculateStructure(cells);
|
||||
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
|
||||
|
||||
for (int i = 0; i < rowsOfCells.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
|
||||
add(rowsOfCells.get(i).get(j), i, j);
|
||||
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
|
||||
.get(j), i, j);
|
||||
}
|
||||
}
|
||||
|
||||
@ -213,57 +219,125 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
*
|
||||
* @param cells The found cells
|
||||
* @return TablePageBlock Structure
|
||||
* @return TablePageBlock Structure as a rows of cells matrix
|
||||
*/
|
||||
private List<List<Cell>> calculateStructure(List<Cell> cells) {
|
||||
|
||||
List<List<Cell>> matrix = new ArrayList<>();
|
||||
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return matrix;
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
Set<Float> uniqueX = new HashSet<>();
|
||||
Set<Float> uniqueY = new HashSet<>();
|
||||
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
});
|
||||
Set<Double> uniqueX = new HashSet<>();
|
||||
Set<Double> uniqueY = new HashSet<>();
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueX.add(c.getPdfMinX());
|
||||
uniqueX.add(c.getPdfMaxX());
|
||||
uniqueY.add(c.getPdfMinY());
|
||||
uniqueY.add(c.getPdfMaxY());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream().sorted().toList();
|
||||
var sortedUniqueY = uniqueY.stream().sorted().toList();
|
||||
var sortedUniqueX = uniqueX.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
var sortedUniqueY = uniqueY.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
|
||||
Float prevY = null;
|
||||
for (Float y : sortedUniqueY) {
|
||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||
|
||||
Double prevY = null;
|
||||
|
||||
for (Double y : sortedUniqueY) {
|
||||
|
||||
List<Cell> row = new ArrayList<>();
|
||||
|
||||
Float prevX = null;
|
||||
for (Float x : sortedUniqueX) {
|
||||
Double prevX = null;
|
||||
for (Double x : sortedUniqueX) {
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
|
||||
|
||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||
if (cell.hasMinimumSize()) {
|
||||
row.add(cell);
|
||||
if (cellFromGridStructure.hasMinimumSize()) {
|
||||
|
||||
cells.stream()
|
||||
.map(originalCell -> new CellWithIntersection(originalCell,
|
||||
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
|
||||
originalCell.getBBoxPdf())))
|
||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||
.map(CellWithIntersection::originalCell)
|
||||
.ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));
|
||||
|
||||
row.add(cellFromGridStructure);
|
||||
}
|
||||
}
|
||||
prevX = x;
|
||||
}
|
||||
|
||||
if (prevY != null && prevX != null && !row.isEmpty()) {
|
||||
matrix.add(row);
|
||||
// exclude empty rows and rows where all text blocks are empty
|
||||
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
|
||||
rowsOfCells.add(row);
|
||||
}
|
||||
prevY = y;
|
||||
}
|
||||
|
||||
Collections.reverse(matrix);
|
||||
Collections.reverse(rowsOfCells);
|
||||
|
||||
// now cells are removed which are part of a column without any text blocks
|
||||
// this is done by first computing the inverse matrix which contains call columns of cells
|
||||
// then the column indices that have to be removed are determined
|
||||
List<List<Cell>> columnsOfCells = new ArrayList<>();
|
||||
int maxRowLength = rowsOfCells.stream()
|
||||
.map(List::size)
|
||||
.max(java.util.Comparator.naturalOrder())
|
||||
.orElse(0);
|
||||
for (int i = 0; i < maxRowLength; i++) {
|
||||
columnsOfCells.add(new ArrayList<>());
|
||||
}
|
||||
|
||||
for (List<Cell> row : rowsOfCells) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
columnsOfCells.get(j).add(row.get(j));
|
||||
}
|
||||
}
|
||||
|
||||
List<Integer> columnIndicesToRemove = new ArrayList<>();
|
||||
int columnIndex = 0;
|
||||
for (List<Cell> col : columnsOfCells) {
|
||||
if (col.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
columnIndicesToRemove.add(columnIndex);
|
||||
}
|
||||
columnIndex++;
|
||||
}
|
||||
columnIndicesToRemove.sort(Collections.reverseOrder());
|
||||
|
||||
// update all rows so that the values of the empty columns get removed
|
||||
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
|
||||
rowsOfCells = new ArrayList<>();
|
||||
for (List<Cell> row : rowsOfCellsBefore) {
|
||||
var updatedRow = new ArrayList<>(row);
|
||||
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
|
||||
rowsOfCells.add(updatedRow);
|
||||
}
|
||||
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
|
||||
private void addCellToRowAndCol(Cell cell, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cellTreeMap.put(cp, cell);
|
||||
|
||||
return matrix;
|
||||
}
|
||||
|
||||
|
||||
@ -290,7 +364,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\"")).append('\"');
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
@ -337,4 +411,9 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
record CellWithIntersection(Cell originalCell, double intersectedArea) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
|
||||
public record AbstractBlockOnPage(AbstractPageBlock block, ClassificationPage page) {
|
||||
|
||||
}
|
||||
@ -0,0 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
|
||||
public enum FontStyle {
|
||||
REGULAR,
|
||||
BOLD,
|
||||
ITALIC,
|
||||
BOLD_ITALIC;
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class FrequencyCounters {
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
}
|
||||
@ -0,0 +1,107 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ListIdentifier {
|
||||
|
||||
public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9]{1,4})\\.\\s+");
|
||||
public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9]{1,4})\\)\\s+");
|
||||
|
||||
enum Format {
|
||||
NUMBER_WITH_DOT,
|
||||
NUMBER_IN_PARENTHESES
|
||||
}
|
||||
|
||||
Format format;
|
||||
@Getter
|
||||
Word word;
|
||||
@Getter
|
||||
int page;
|
||||
int representation;
|
||||
|
||||
|
||||
public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) {
|
||||
|
||||
return parse(textPageBlock.getWords().subList(0, Math.min(5, textPageBlock.getWords().size())), page);
|
||||
}
|
||||
|
||||
|
||||
public static Optional<ListIdentifier> parse(List<Word> sequences, int page) {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Word sequence : sequences) {
|
||||
sb.append(sequence.toString());
|
||||
sb.append(" ");
|
||||
}
|
||||
sb.replace(sb.length() - 1, sb.length(), "");
|
||||
String text = sb.toString();
|
||||
|
||||
Matcher numberMatcher = NUMBER_WITH_DOT.matcher(text);
|
||||
|
||||
if (numberMatcher.find()) {
|
||||
Optional<Integer> representation = parseInteger(numberMatcher.group(1));
|
||||
if (representation.isPresent()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_WITH_DOT, sequences.get(0), page, representation.get()));
|
||||
}
|
||||
}
|
||||
|
||||
Matcher parenthesisMatcher = NUMBER_IN_PARENTHESES.matcher(text);
|
||||
if (parenthesisMatcher.find()) {
|
||||
Optional<Integer> representation = parseInteger(parenthesisMatcher.group(1));
|
||||
if (representation.isPresent()) {
|
||||
return Optional.of(new ListIdentifier(Format.NUMBER_IN_PARENTHESES, sequences.get(0), page, representation.get()));
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Integer> parseInteger(String text) {
|
||||
|
||||
try {
|
||||
return Optional.of(Integer.parseInt(text));
|
||||
} catch (NumberFormatException e) {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {
|
||||
|
||||
if (listIdentifiers.size() <= 1) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for (int i = 1; i < listIdentifiers.size(); i++) {
|
||||
ListIdentifier current = listIdentifiers.get(i);
|
||||
ListIdentifier previous = listIdentifiers.get(i - 1);
|
||||
if (current.format != previous.format) {
|
||||
return false;
|
||||
}
|
||||
if (current.representation <= previous.representation) {
|
||||
return false;
|
||||
}
|
||||
if (!current.word.intersectsXDirAdj(previous.word, 2)) {
|
||||
return false;
|
||||
}
|
||||
if (current.page == previous.page && !current.word.isBelowDirAdj(previous.word)) {
|
||||
return false;
|
||||
}
|
||||
if (current.page < previous.page) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user