Compare commits
536 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef23ee0ade | ||
|
|
af31f52b47 | ||
|
|
b5152112ee | ||
|
|
85ea4ef455 | ||
|
|
01f8c01fff | ||
|
|
0b6a292c75 | ||
|
|
e24020589c | ||
|
|
c619b845e8 | ||
|
|
ed0371ca11 | ||
|
|
89b5be8d67 | ||
|
|
077ce60c9d | ||
|
|
ab171be6e2 | ||
|
|
664b47b4c3 | ||
|
|
8005c1f25f | ||
|
|
42185a95a0 | ||
|
|
51b42efaf6 | ||
|
|
6a50d45947 | ||
|
|
073ac12cf7 | ||
|
|
84b054a4cc | ||
|
|
905b65a5fa | ||
|
|
7617c1f308 | ||
|
|
2b3936c09b | ||
|
|
6e5b1f1978 | ||
|
|
cf846d18bc | ||
|
|
25c46f16ac | ||
|
|
96acefed78 | ||
|
|
366241e6c6 | ||
|
|
7f472ccc52 | ||
|
|
6f807c7d94 | ||
|
|
6e04c15f3d | ||
|
|
1384584e2f | ||
|
|
e58011e111 | ||
|
|
a821570065 | ||
|
|
7ee1f9e360 | ||
|
|
f9b25c8157 | ||
|
|
c90874da7a | ||
|
|
4683c696a5 | ||
|
|
95c02ce3cf | ||
|
|
b2d62e32fe | ||
|
|
65c1f03ea3 | ||
|
|
2219519a2b | ||
|
|
af05218e37 | ||
|
|
736f531df3 | ||
|
|
c64445d54b | ||
|
|
af29233b10 | ||
|
|
5f04b45554 | ||
|
|
6c41533f0b | ||
|
|
9d2596e5ef | ||
|
|
e7b01161ac | ||
|
|
7b073eb4f3 | ||
|
|
4b0c041d84 | ||
|
|
6c7442ac6d | ||
|
|
23e23328ee | ||
|
|
9d1ffdd779 | ||
|
|
3109a30ae1 | ||
|
|
fe2ed1807e | ||
|
|
31de229fa5 | ||
|
|
8a80abfff1 | ||
|
|
7c08905eda | ||
|
|
4f40c9dbc9 | ||
|
|
32381b4472 | ||
|
|
469da38952 | ||
|
|
0f8c4674b3 | ||
|
|
8e165a41d7 | ||
|
|
ed7a701ad9 | ||
|
|
393103e074 | ||
|
|
bd02066e2c | ||
|
|
fec19f4afb | ||
|
|
c726a643f0 | ||
|
|
519e95735c | ||
|
|
b52af2637f | ||
|
|
46ea7edc4c | ||
|
|
9650195afd | ||
|
|
ce628a99f7 | ||
|
|
b66afe135c | ||
|
|
dc892d0fec | ||
|
|
af45f2cd8c | ||
|
|
befb6b1df6 | ||
|
|
61efb4cae9 | ||
|
|
4a06059258 | ||
|
|
292e5b215e | ||
|
|
7c2db6c3c5 | ||
|
|
4395074b21 | ||
|
|
8e14b74da2 | ||
|
|
3b91639ea9 | ||
|
|
c5178ea5c2 | ||
|
|
cf39d4dfcc | ||
|
|
bb40345f79 | ||
|
|
e3e9d16145 | ||
|
|
f6ca5a3c17 | ||
|
|
15e3dced35 | ||
|
|
933054b332 | ||
|
|
ab86714cb3 | ||
|
|
8626b106d0 | ||
|
|
52e948e66c | ||
|
|
3b33405cbf | ||
|
|
b2fa14dde2 | ||
|
|
62e07686d7 | ||
|
|
3eb97d614f | ||
|
|
81469413b0 | ||
|
|
2993676a6f | ||
|
|
8e115dcd8a | ||
|
|
173911b840 | ||
|
|
b0ae00aa02 | ||
|
|
00bf9f279e | ||
|
|
d16377a24a | ||
|
|
81179ee744 | ||
|
|
1953b5924f | ||
|
|
6f6e8d5d4e | ||
|
|
69bcd4f68d | ||
|
|
b900cfaf31 | ||
|
|
cdc2081785 | ||
|
|
a9287ec406 | ||
|
|
5b6a706c28 | ||
|
|
28d8ad0a3f | ||
|
|
0c1583c1be | ||
|
|
7633566d9b | ||
|
|
cc4f09711e | ||
|
|
370165dc59 | ||
|
|
8c052c38d7 | ||
|
|
ea18d3d307 | ||
|
|
2726fc3fe1 | ||
|
|
033279e261 | ||
|
|
ec0dd032c9 | ||
|
|
598fa7f1c7 | ||
|
|
65b1f7d179 | ||
|
|
3173610be5 | ||
|
|
e920eb5a78 | ||
|
|
7e4baea7e5 | ||
|
|
66d3433e04 | ||
|
|
a2f559af51 | ||
|
|
39f527a57c | ||
|
|
5c2844fe31 | ||
|
|
b216f02e15 | ||
|
|
2e2f30ba35 | ||
|
|
9f7ed974ec | ||
|
|
570a348a77 | ||
|
|
859dba2ecf | ||
|
|
1c5d755111 | ||
|
|
133e06460f | ||
|
|
da91fcff97 | ||
|
|
79795e408a | ||
|
|
b719db86ab | ||
|
|
797602e373 | ||
|
|
3d2f66cf10 | ||
|
|
e304a9f2d7 | ||
|
|
c05f67cf44 | ||
|
|
9ecf9ca19f | ||
|
|
3a2ee903af | ||
|
|
072a8aa3da | ||
|
|
b5cfa7b63d | ||
|
|
5f5a6258c5 | ||
|
|
ac0e83725a | ||
|
|
5d33ad570e | ||
|
|
fd698a78fc | ||
|
|
c3edeb3c7d | ||
|
|
fc06dba2ce | ||
|
|
b6742c1e89 | ||
|
|
efb1a748af | ||
|
|
9be672c728 | ||
|
|
23985b14be | ||
|
|
48b7a22e2b | ||
|
|
546341ee75 | ||
|
|
0ed1481517 | ||
|
|
b2a47f66ae | ||
|
|
3835d03036 | ||
|
|
a5fcebce30 | ||
|
|
b867deb9f9 | ||
|
|
8648ed0952 | ||
|
|
53f786b539 | ||
|
|
40465e8778 | ||
|
|
a76b2ace3f | ||
|
|
aeaca2f278 | ||
|
|
f1dbcc24a2 | ||
|
|
fda25852d1 | ||
|
|
471fadbcca | ||
|
|
87001090d5 | ||
|
|
ea355429c2 | ||
|
|
6a65d7f9fc | ||
|
|
e935cc7b14 | ||
|
|
07733d0855 | ||
|
|
abb249e966 | ||
|
|
bcd1eb9afa | ||
|
|
60acbac53f | ||
|
|
a3decd292d | ||
|
|
b6f0a21886 | ||
|
|
d61cac8b4f | ||
|
|
ae46c5f1ca | ||
|
|
f0a70a5242 | ||
|
|
15ea385f4d | ||
|
|
08be18db2d | ||
|
|
64209255cb | ||
|
|
4761d2e1a2 | ||
|
|
1916e626df | ||
|
|
e4663ac8db | ||
|
|
6a691183dc | ||
|
|
3dd215288a | ||
|
|
6fb1a0bef3 | ||
|
|
4e7c3f584b | ||
|
|
84bdb4d1ed | ||
|
|
75ab4df592 | ||
|
|
8442e60055 | ||
|
|
0ef67fc07b | ||
|
|
ea02f31a84 | ||
|
|
58acbab85f | ||
|
|
d38d023485 | ||
|
|
c1afe9b11f | ||
|
|
bdcb9aeda4 | ||
|
|
6a86036a78 | ||
|
|
a358d7565e | ||
|
|
069a6c0b49 | ||
|
|
683f7f1fb8 | ||
|
|
7eab3a4088 | ||
|
|
970fc99ed1 | ||
|
|
48c54f63a0 | ||
|
|
20e4e5ddff | ||
|
|
b53930328a | ||
|
|
c947d552d2 | ||
|
|
6b1b5eab84 | ||
|
|
cc9816c8cb | ||
|
|
f256f9b30f | ||
|
|
6167e3fb57 | ||
|
|
a78fb0244a | ||
|
|
8099a00bb6 | ||
|
|
9bb0468b2b | ||
|
|
c4d9c5df02 | ||
|
|
976f408237 | ||
|
|
319268c53d | ||
|
|
014eba9fc3 | ||
|
|
9bd8419770 | ||
|
|
c13ff7fbf6 | ||
|
|
5d3826e9b9 | ||
|
|
0c3194276a | ||
|
|
e302d9784e | ||
|
|
f185b13f2b | ||
|
|
990c376ce6 | ||
|
|
bf6a0d770b | ||
|
|
f18bda1d4e | ||
|
|
0a11992361 | ||
|
|
456b8fe4a1 | ||
|
|
9778ece992 | ||
|
|
8bd0de6263 | ||
|
|
5c1708f97f | ||
|
|
a35d77be2e | ||
|
|
631160eb22 | ||
|
|
8e7e588d26 | ||
|
|
ac850c2626 | ||
|
|
1d765a6baa | ||
|
|
c55984aa67 | ||
|
|
27aa418029 | ||
|
|
c4edff4696 | ||
|
|
92fd1a72de | ||
|
|
0d3d25e7d7 | ||
|
|
956fbff872 | ||
|
|
2488009af1 | ||
|
|
16be2467fd | ||
|
|
f4cae8a7dc | ||
|
|
dfc23955d7 | ||
|
|
d6e3d6fe22 | ||
|
|
bef23e38b5 | ||
|
|
65ab7a1912 | ||
|
|
d80231e4a9 | ||
|
|
56c07a4491 | ||
|
|
0b4ad29dcb | ||
|
|
0ad0cd45d6 | ||
|
|
d659fe7234 | ||
|
|
cb9127b4f3 | ||
|
|
05523585c0 | ||
|
|
4ced572949 | ||
|
|
79239b751d | ||
|
|
f146beeb44 | ||
|
|
f8a4ccfff0 | ||
|
|
a6ba501fa8 | ||
|
|
7dfb3b2b52 | ||
|
|
c324d3815e | ||
|
|
74f55a5cbf | ||
|
|
e7bf607663 | ||
|
|
f4d789311c | ||
|
|
9817eae897 | ||
|
|
477f6af886 | ||
|
|
2c171b6a9e | ||
|
|
71477dabde | ||
|
|
a927cbd9dc | ||
|
|
a1521877d7 | ||
|
|
f4b6386e1c | ||
|
|
1d64028158 | ||
|
|
0979a267d4 | ||
|
|
cc77d19500 | ||
|
|
fa048b2fe0 | ||
|
|
bdf1161c91 | ||
|
|
b4a225144d | ||
|
|
903b1c1fd4 | ||
|
|
c3e7582ee3 | ||
|
|
cfc5db45cd | ||
|
|
fbd0196719 | ||
|
|
3c9049dc8a | ||
|
|
015984891f | ||
|
|
66fcb62833 | ||
|
|
48824f56a8 | ||
|
|
785628537f | ||
|
|
23eb0c40a3 | ||
|
|
1b4aaf4454 | ||
|
|
e4f3557b36 | ||
|
|
9be3c86297 | ||
|
|
88855de2da | ||
|
|
368a75e985 | ||
|
|
12344d57b2 | ||
|
|
9e854379e7 | ||
|
|
b779c72041 | ||
|
|
760a809900 | ||
|
|
ba1c7c07ab | ||
|
|
ca0cbbcb49 | ||
|
|
da2cdc288e | ||
|
|
68da328889 | ||
|
|
711548d1a7 | ||
|
|
2bddcdafee | ||
|
|
750ccf4ce2 | ||
|
|
57b5d3f48e | ||
|
|
d8c9659469 | ||
|
|
30f060e36c | ||
|
|
53a5824e6c | ||
|
|
e2bcf971c9 | ||
|
|
dacc2f7f43 | ||
|
|
144a9591a2 | ||
|
|
207d9dec97 | ||
|
|
09ee90222e | ||
|
|
1316a067fe | ||
|
|
e203210ade | ||
|
|
b25d46291a | ||
|
|
84148d3b6e | ||
|
|
a6ba66b1aa | ||
|
|
c3e69b2cdf | ||
|
|
f69331e7d8 | ||
|
|
01493dc033 | ||
|
|
459e0c8be7 | ||
|
|
1b1f777706 | ||
|
|
0e0a811f9d | ||
|
|
efa3d75479 | ||
|
|
9abdc6d44d | ||
|
|
3bab61c446 | ||
|
|
d17517d3c3 | ||
|
|
567cbc178b | ||
|
|
3c53772765 | ||
|
|
8647cf5a18 | ||
|
|
310c07b200 | ||
|
|
daba0bf8a6 | ||
|
|
3839de215c | ||
|
|
b4d68594f1 | ||
|
|
99ed331a1e | ||
|
|
f2c0991987 | ||
|
|
b8ef55e6e2 | ||
|
|
5792ff4a93 | ||
|
|
621c3f269d | ||
|
|
8dba392904 | ||
|
|
306a53ea79 | ||
|
|
754fd8f933 | ||
|
|
28ec4c9ccb | ||
|
|
aed4a55787 | ||
|
|
f87e2d75b5 | ||
|
|
de6760abc1 | ||
|
|
261ef4c367 | ||
|
|
11ba9c6bb9 | ||
|
|
b7c3d02978 | ||
|
|
bcf0bcbaf4 | ||
|
|
84cde2a3db | ||
|
|
6f2dd4f823 | ||
|
|
a909724217 | ||
|
|
67a981e7a8 | ||
|
|
0e93fdd515 | ||
|
|
d464239f9b | ||
|
|
88a20924b9 | ||
|
|
f89243472c | ||
|
|
ad3612acd4 | ||
|
|
630eee6bd7 | ||
|
|
a951911ec8 | ||
|
|
75e6b88705 | ||
|
|
2e0adbdd9a | ||
|
|
b747742558 | ||
|
|
192c9976c1 | ||
|
|
b251697492 | ||
|
|
22d6b25fe4 | ||
|
|
e6bcd6fb2b | ||
|
|
2847adde22 | ||
|
|
7cf67d7121 | ||
|
|
3a18923ef5 | ||
|
|
2b15fd1d3c | ||
|
|
3722fff476 | ||
|
|
0cb8029f0a | ||
|
|
b270b9c942 | ||
|
|
60615ec5d8 | ||
|
|
880914a167 | ||
|
|
a80a93d2b0 | ||
|
|
0afa7e5b12 | ||
|
|
12516ebf22 | ||
|
|
0dca90c3fe | ||
|
|
2506c9e091 | ||
|
|
83d39ba3a5 | ||
|
|
c09bb06da6 | ||
|
|
1793b1138e | ||
|
|
d30735bc49 | ||
|
|
9356db5373 | ||
|
|
ee766e7150 | ||
|
|
a33bbc9abc | ||
|
|
5758295fac | ||
|
|
8142d0aa09 | ||
|
|
dc80353a5b | ||
|
|
4d856b04b3 | ||
|
|
6ba25ecaa0 | ||
|
|
fcdcaf16e9 | ||
|
|
086e338f4a | ||
|
|
9dbe73f376 | ||
|
|
aaf4015c95 | ||
|
|
2b65ad4b4b | ||
|
|
c0c75f6a0e | ||
|
|
2f4af6e377 | ||
|
|
b9a305bf2d | ||
|
|
db6b6af4d7 | ||
|
|
d73addf7ed | ||
|
|
c7978c93c2 | ||
|
|
457f7d9c66 | ||
|
|
0387cdd143 | ||
|
|
5c6898b975 | ||
|
|
b7b273b47d | ||
|
|
9aa9cb2d54 | ||
|
|
ee6c21638f | ||
|
|
1e4475afdf | ||
|
|
708d274ebc | ||
|
|
a94faad870 | ||
|
|
d854125867 | ||
|
|
63de8ef82d | ||
|
|
ea0af08c31 | ||
|
|
810caa0624 | ||
|
|
c282372dc8 | ||
|
|
055ccd3366 | ||
|
|
4b4c73fb7b | ||
|
|
35b9cfd1c2 | ||
|
|
9a73b952cf | ||
|
|
a1c73094f1 | ||
|
|
b79d9946a9 | ||
|
|
a9735daa04 | ||
|
|
d00491c15e | ||
|
|
ed48b6a4bf | ||
|
|
62eade84b9 | ||
|
|
d6a217fe70 | ||
|
|
f1e4d0d52b | ||
|
|
38a1e8b95f | ||
|
|
a371558f8c | ||
|
|
b716b187eb | ||
|
|
0be6454a7e | ||
|
|
5fde631e04 | ||
|
|
c076c10840 | ||
|
|
24104f8cc1 | ||
|
|
3632dd4667 | ||
|
|
063aa8bfe1 | ||
|
|
d2716a60e9 | ||
|
|
8f08a8c62b | ||
|
|
091cb73622 | ||
|
|
d3b0bc430f | ||
|
|
e3c12bc1bb | ||
|
|
f6f7a0a952 | ||
|
|
96df6e3145 | ||
|
|
574e5ad425 | ||
|
|
0611e56baa | ||
|
|
442c1dafea | ||
|
|
33bc532eac | ||
|
|
4bd6e7e343 | ||
|
|
159ac6348c | ||
|
|
17259ed805 | ||
|
|
67bf5cbaa8 | ||
|
|
f8a3cbc147 | ||
|
|
a3d4fbe3a3 | ||
|
|
5c1dca5933 | ||
|
|
f56ab8fa49 | ||
|
|
cfca5376a0 | ||
|
|
0633fa04fb | ||
|
|
659a9abaa5 | ||
|
|
5877aea3f7 | ||
|
|
f2b92de827 | ||
|
|
4a5464d6aa | ||
|
|
d9a3bbbd30 | ||
|
|
150aea55c0 | ||
|
|
676f0c9d09 | ||
|
|
ded00df11e | ||
|
|
286556cbb6 | ||
|
|
d6a74dc9f9 | ||
|
|
2a55654fcf | ||
|
|
7496914b37 | ||
|
|
c8ace585e1 | ||
|
|
69c5f80c8c | ||
|
|
79d27189fd | ||
|
|
75bac72c05 | ||
|
|
c5e6271dc3 | ||
|
|
5561dd5e95 | ||
|
|
041b633742 | ||
|
|
715426bd3b | ||
|
|
464b8053fe | ||
|
|
2fece83c7c | ||
|
|
ad03ef1922 | ||
|
|
cc44100e4e | ||
|
|
5d1c1ae406 | ||
|
|
f72838b0be | ||
|
|
6388898cc0 | ||
|
|
72d1e6271a | ||
|
|
299b5be385 | ||
|
|
2ea58f5e9f | ||
|
|
510ec7ce45 | ||
|
|
c186927e3d | ||
|
|
1a494b0dea | ||
|
|
19552ddf69 | ||
|
|
41267a0f98 | ||
|
|
270129cd73 | ||
|
|
a41c13fdd6 | ||
|
|
65ab5eca22 | ||
|
|
143ebee25e | ||
|
|
47fd8e05d1 | ||
|
|
653f280fd1 | ||
|
|
daa68f3fa6 | ||
|
|
ed66043856 | ||
|
|
526b1c5ad3 | ||
|
|
241a32cb4f | ||
|
|
9c8501e76a | ||
|
|
3bc88bc9b7 | ||
|
|
15a6d46f5c | ||
|
|
788613c92e | ||
|
|
7f0aa32d1b | ||
|
|
f08c4ced43 | ||
|
|
dfdeef5812 | ||
|
|
69a62c4dbe | ||
|
|
e346c04d67 | ||
|
|
cdff0b0ece | ||
|
|
a9e6c1f0f8 | ||
|
|
4fc7bac818 | ||
|
|
df9cbdc036 | ||
|
|
cc1fedac41 | ||
|
|
54555d4ce0 | ||
|
|
0ac3ee309a |
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
7
.gitignore
vendored
7
.gitignore
vendored
@ -18,6 +18,7 @@ target/
|
||||
.settings
|
||||
.springBeans
|
||||
.sts4-cache
|
||||
.gradle
|
||||
|
||||
### IntelliJ IDEA ###
|
||||
.idea
|
||||
@ -37,3 +38,9 @@ build/
|
||||
|
||||
### VS Code ###
|
||||
.vscode/
|
||||
gradlew.bat
|
||||
gradlew
|
||||
gradle.properties
|
||||
gradle/
|
||||
.DS_Store
|
||||
.DS_Store/
|
||||
|
||||
@ -1,6 +1,26 @@
|
||||
variables:
|
||||
SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
|
||||
# SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
|
||||
GIT_SUBMODULE_STRATEGY: recursive
|
||||
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
||||
include:
|
||||
- project: 'gitlab/gitlab'
|
||||
ref: 'main'
|
||||
file: 'ci-templates/maven_java.yml'
|
||||
file: 'ci-templates/gradle_java.yml'
|
||||
|
||||
deploy:
|
||||
stage: deploy
|
||||
tags:
|
||||
- dind
|
||||
script:
|
||||
- echo "Building with gradle version ${BUILDVERSION}"
|
||||
- gradle -Pversion=${BUILDVERSION} publish
|
||||
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
|
||||
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
|
||||
artifacts:
|
||||
reports:
|
||||
dotenv: version.env
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
|
||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||
- if: $CI_COMMIT_TAG
|
||||
|
||||
8
.gitmodules
vendored
Normal file
8
.gitmodules
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
|
||||
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
|
||||
update = merge
|
||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
|
||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
|
||||
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
|
||||
update = merge
|
||||
BIN
.mvn/wrapper/maven-wrapper.jar
vendored
BIN
.mvn/wrapper/maven-wrapper.jar
vendored
Binary file not shown.
18
.mvn/wrapper/maven-wrapper.properties
vendored
18
.mvn/wrapper/maven-wrapper.properties
vendored
@ -1,18 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
|
||||
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar
|
||||
88
README.md
88
README.md
@ -1 +1,89 @@
|
||||
# PDF Layout Parser Micro-Service: layout-parser
|
||||
|
||||
## Introduction
|
||||
The layout-parser micro-service is a powerful tool designed to efficiently extract structured information from PDF documents. Written in Java and utilizing Spring Boot 3, Apache PDFBox, and RabbitMQ, this micro-service excels at parsing PDFs and organizing their content into a meaningful and coherent layout structure. Notably, its core parsing relies solely on algorithmic techniques rather than machine learning; ML-generated results can optionally be supplied as additional input (see "Optional Enhancements" below).
|
||||
|
||||
### Key Steps in the PDF Layout Parsing Process:
|
||||
|
||||
* **Text Position Extraction:**
|
||||
The micro-service leverages Apache PDFBox to extract precise text positions for each individual character within the PDF document.
|
||||
|
||||
* **Word Segmentation and Text Block Formation:**
|
||||
Using a range of algorithms, the micro-service first identifies and segments individual words and then groups them into distinct text blocks.
|
||||
|
||||
* **Text Block Classification:**
|
||||
The segmented text blocks are then subjected to classification algorithms. These algorithms categorize the text blocks based on their content and visual properties, distinguishing between sections, subsections, headlines, paragraphs, images, tables, table cells, headers, and footers.
|
||||
|
||||
* **Layout Coherence Establishment:**
|
||||
The classified text blocks are subsequently orchestrated into a cohesive layout structure. This process involves arranging sections, subsections, paragraphs, images, and other elements in a logical and structured manner.
|
||||
|
||||
* **Output Generation in Various Formats:**
|
||||
Once the layout structure is established, the micro-service generates output in multiple formats. These formats are designed for seamless integration with downstream micro-services. The supported formats include JSON, XML, and others, ensuring flexibility in downstream data consumption.
|
||||
|
||||
### Optional Enhancements:
|
||||
|
||||
* **ML-Based Table Extraction:**
|
||||
For enhanced results, users have the option to incorporate machine learning-based table extraction. This feature can be activated by providing ML-generated results as a JSON file, which are then integrated seamlessly into the layout structure.
|
||||
|
||||
* **Image Classification using ML:**
|
||||
Additionally, for more accurate image classification, users can optionally feed ML-generated image classification results into the micro-service. Similar to the table extraction option, the micro-service processes the pre-parsed results in JSON format, thus optimizing the accuracy of image content identification.
|
||||
|
||||
In conclusion, the layout-parser micro-service is a versatile PDF layout parsing solution crafted entirely around advanced algorithms, without reliance on machine learning. It proficiently extracts text positions, segments content into meaningful blocks, classifies these blocks, arranges them coherently, and outputs structured data for downstream micro-services. Optional integration with ML-generated table extractions and image classifications further enhances its capabilities.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Before building and using the layout-parser micro-service, please ensure you have the following software and tools installed:
|
||||
|
||||
* Java Development Kit (JDK) 17 or later
|
||||
* Gradle build tool (preinstalled)
|
||||
## Build and Test
|
||||
To build and test the micro-service, follow these steps:
|
||||
|
||||
### Clone the Repository:
|
||||
|
||||
```bash
|
||||
git clone ssh://git@git.knecon.com:22222/fforesight/layout-parser.git
|
||||
cd layout-parser
|
||||
```
|
||||
### Build the Project:
|
||||
Use the following command to build the project using Gradle:
|
||||
|
||||
```
|
||||
gradle clean build
|
||||
```
|
||||
### Run Tests:
|
||||
Run the test suite using the following command:
|
||||
```
|
||||
gradle test
|
||||
```
|
||||
## Building a Custom Docker Image
|
||||
To create a custom Docker image for the layout-parser micro-service, execute the provided script:
|
||||
|
||||
### Ensure Docker is Installed:
|
||||
Ensure that Docker is installed and running on your system.
|
||||
|
||||
### Run the Image Building Script:
|
||||
Execute the publish-custom-image script in the project directory:
|
||||
|
||||
```
|
||||
./publish-custom-image
|
||||
```
|
||||
## Publishing to Internal Maven Repository
|
||||
To publish the layout-parser micro-service to your internal Maven repository, execute the following command:
|
||||
|
||||
```
|
||||
gradle -Pversion=buildVersion publish
|
||||
```
|
||||
Replace buildVersion with the desired version number.
|
||||
|
||||
## Additional Notes
|
||||
Make sure to configure any necessary application properties before deploying the micro-service.
|
||||
For advanced usage and configuration, consult the source code first; if questions remain, ask Kilian or Dom.
|
||||
|
||||
7
buildSrc/build.gradle.kts
Normal file
7
buildSrc/build.gradle.kts
Normal file
@ -0,0 +1,7 @@
|
||||
plugins {
|
||||
`kotlin-dsl`
|
||||
}
|
||||
|
||||
repositories {
|
||||
gradlePluginPortal()
|
||||
}
|
||||
@ -0,0 +1,93 @@
|
||||
plugins {
|
||||
`java-library`
|
||||
`maven-publish`
|
||||
pmd
|
||||
checkstyle
|
||||
jacoco
|
||||
}
|
||||
|
||||
group = "com.knecon.fforesight"
|
||||
|
||||
val documentVersion by rootProject.extra { "4.433.0" }
|
||||
|
||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||
java.targetCompatibility = JavaVersion.VERSION_17
|
||||
|
||||
tasks.pmdMain {
|
||||
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
|
||||
}
|
||||
|
||||
tasks.pmdTest {
|
||||
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
|
||||
}
|
||||
|
||||
tasks.named<Test>("test") {
|
||||
useJUnitPlatform()
|
||||
reports {
|
||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||
}
|
||||
minHeapSize = "512m"
|
||||
maxHeapSize = "2048m"
|
||||
}
|
||||
|
||||
tasks.test {
|
||||
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
|
||||
}
|
||||
|
||||
tasks.jacocoTestReport {
|
||||
dependsOn(tasks.test) // tests are required to run before generating the report
|
||||
reports {
|
||||
xml.required.set(true)
|
||||
csv.required.set(false)
|
||||
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
|
||||
}
|
||||
}
|
||||
|
||||
allprojects {
|
||||
|
||||
tasks.withType<Javadoc> {
|
||||
options {
|
||||
this as StandardJavadocDocletOptions
|
||||
addBooleanOption("Xdoclint:none", true)
|
||||
addStringOption("Xmaxwarns", "1")
|
||||
}
|
||||
}
|
||||
|
||||
pmd {
|
||||
setConsoleOutput(true)
|
||||
}
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
create<MavenPublication>(name) {
|
||||
from(components["java"])
|
||||
}
|
||||
}
|
||||
repositories {
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
java {
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
|
||||
repositories {
|
||||
mavenLocal()
|
||||
mavenCentral()
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/gindev/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||
}
|
||||
}
|
||||
}
|
||||
39
config/checkstyle/checkstyle.xml
Normal file
39
config/checkstyle/checkstyle.xml
Normal file
@ -0,0 +1,39 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
|
||||
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
|
||||
<module name="Checker">
|
||||
<property
|
||||
name="severity"
|
||||
value="error"/>
|
||||
<module name="TreeWalker">
|
||||
<module name="SuppressWarningsHolder"/>
|
||||
<module name="MissingDeprecated"/>
|
||||
<module name="MissingOverride"/>
|
||||
<module name="AnnotationLocation"/>
|
||||
<module name="JavadocStyle"/>
|
||||
<module name="NonEmptyAtclauseDescription"/>
|
||||
<module name="IllegalImport"/>
|
||||
<module name="RedundantImport"/>
|
||||
<module name="RedundantModifier"/>
|
||||
<module name="EmptyBlock"/>
|
||||
<module name="DefaultComesLast"/>
|
||||
<module name="EmptyStatement"/>
|
||||
<module name="EqualsHashCode"/>
|
||||
<module name="ExplicitInitialization"/>
|
||||
<module name="IllegalInstantiation"/>
|
||||
<module name="ModifiedControlVariable"/>
|
||||
<module name="MultipleVariableDeclarations"/>
|
||||
<module name="PackageDeclaration"/>
|
||||
<module name="ParameterAssignment"/>
|
||||
<module name="SimplifyBooleanExpression"/>
|
||||
<module name="SimplifyBooleanReturn"/>
|
||||
<module name="StringLiteralEquality"/>
|
||||
<module name="OneStatementPerLine"/>
|
||||
<module name="FinalClass"/>
|
||||
<module name="ArrayTypeStyle"/>
|
||||
<module name="UpperEll"/>
|
||||
<module name="OuterTypeFilename"/>
|
||||
</module>
|
||||
<module name="FileTabCharacter"/>
|
||||
<module name="SuppressWarningsFilter"/>
|
||||
</module>
|
||||
21
config/pmd/pmd.xml
Normal file
21
config/pmd/pmd.xml
Normal file
@ -0,0 +1,21 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
|
||||
23
config/pmd/test_pmd.xml
Normal file
23
config/pmd/test_pmd.xml
Normal file
@ -0,0 +1,23 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon test ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="TestClassWithoutTestCases"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
1
gradle.properties.kts
Normal file
1
gradle.properties.kts
Normal file
@ -0,0 +1 @@
|
||||
version = 0.1-SNAPSHOT
|
||||
@ -1,99 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
||||
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>platform-docker-dependency</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<relativePath/>
|
||||
</parent>
|
||||
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser-service-image</artifactId>
|
||||
<version>1.0.0</version>
|
||||
<packaging>pom</packaging>
|
||||
|
||||
<properties>
|
||||
<service.server>layoutparser-service-server</service.server>
|
||||
<platform.jar>${service.server}.jar</platform.jar>
|
||||
<docker.skip.push>false</docker.skip.push>
|
||||
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-resources-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>exec-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>docker-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>download-platform-jar</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>copy</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<artifactItems>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>${service.server}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<type>jar</type>
|
||||
<overWrite>true</overWrite>
|
||||
<destFileName>${platform.jar}</destFileName>
|
||||
</dependency>
|
||||
</artifactItems>
|
||||
<outputDirectory>${docker.build.directory}</outputDirectory>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>io.fabric8</groupId>
|
||||
<artifactId>docker-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<images>
|
||||
<image>
|
||||
<name>${docker.image.name}</name>
|
||||
<build>
|
||||
<dockerFileDir>${docker.build.directory}</dockerFileDir>
|
||||
<args>
|
||||
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
|
||||
</args>
|
||||
<tags>
|
||||
<tag>${docker.image.version}</tag>
|
||||
<tag>latest</tag>
|
||||
</tags>
|
||||
</build>
|
||||
</image>
|
||||
</images>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
</project>
|
||||
@ -1,9 +0,0 @@
|
||||
FROM red/base-image:2.0.2
|
||||
|
||||
ARG PLATFORM_JAR
|
||||
|
||||
ENV PLATFORM_JAR ${PLATFORM_JAR}
|
||||
|
||||
ENV USES_ELASTICSEARCH false
|
||||
|
||||
COPY ["${PLATFORM_JAR}", "/"]
|
||||
@ -0,0 +1,10 @@
|
||||
plugins {
|
||||
id("com.knecon.fforesight.java-conventions")
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
|
||||
description = "layoutparser-service-internal-api"
|
||||
|
||||
dependencies {
|
||||
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
|
||||
}
|
||||
@ -1,29 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser-service</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>layoutparser-service-internal-api</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
<version>1.18.26</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
<version>31.1-jre</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
@ -1,19 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class AtomicPositionBlockData {
|
||||
|
||||
Long id;
|
||||
int[] stringIdxToPositionIdx;
|
||||
float[][] positions;
|
||||
|
||||
}
|
||||
@ -1,23 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class AtomicTextBlockData {
|
||||
|
||||
Long id;
|
||||
Long page;
|
||||
String searchText;
|
||||
int numberOnPage;
|
||||
int start;
|
||||
int end;
|
||||
int[] lineBreaks;
|
||||
|
||||
}
|
||||
@ -1,20 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class DocumentData {
|
||||
|
||||
PageData[] pages;
|
||||
AtomicTextBlockData[] atomicTextBlocks;
|
||||
AtomicPositionBlockData[] atomicPositionBlocks;
|
||||
TableOfContentsData tableOfContents;
|
||||
|
||||
}
|
||||
@ -1,20 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PageData {
|
||||
|
||||
int number;
|
||||
int height;
|
||||
int width;
|
||||
int rotation;
|
||||
|
||||
}
|
||||
@ -1,93 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableOfContentsData {
|
||||
|
||||
EntryData root;
|
||||
|
||||
|
||||
public EntryData get(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
EntryData entry = root.subEntries.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.subEntries.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<EntryData> streamAllEntries() {
|
||||
|
||||
return Stream.concat(Stream.of(root), root.subEntries.stream()).flatMap(TableOfContentsData::flatten);
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<EntryData> flatten(EntryData entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.subEntries.stream().flatMap(TableOfContentsData::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class EntryData {
|
||||
|
||||
NodeType type;
|
||||
int[] tocId;
|
||||
Long[] atomicBlocks;
|
||||
Long[] pages;
|
||||
Map<String, String> properties;
|
||||
List<EntryData> subEntries;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("[");
|
||||
for (int i : tocId) {
|
||||
sb.append(i);
|
||||
sb.append(",");
|
||||
}
|
||||
sb.delete(sb.length() - 1, sb.length());
|
||||
sb.append("]: ");
|
||||
|
||||
sb.append(type);
|
||||
sb.append(" atbs = ");
|
||||
sb.append(atomicBlocks.length);
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure Section class.")
|
||||
public class SimplifiedSectionText {
|
||||
|
||||
@Schema(description = "The number of this Section. This is used to map the simplified section text back to the original Section.")
|
||||
private String sectionNumber;
|
||||
@Schema(description = "The text in this Section.")
|
||||
private String text;
|
||||
|
||||
}
|
||||
@ -0,0 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure.")
|
||||
public class SimplifiedText {
|
||||
|
||||
@Schema(description = "Number of pages in the entire document.")
|
||||
private int numberOfPages;
|
||||
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
|
||||
@Builder.Default
|
||||
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
|
||||
@Schema(description = "A list of the main section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> mainSectionNumbers = new ArrayList<>();
|
||||
@Schema(description = "A list of the header section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> headerSectionNumbers = new ArrayList<>();
|
||||
@Schema(description = "A list of the footer section numbers ")
|
||||
@Builder.Default
|
||||
private List<String> footerSectionNumbers = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -0,0 +1,30 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer.")
|
||||
public class ParagraphData {
|
||||
|
||||
@Schema(description = "The text of this Semantic Node, without any linebreaks.", example = "This is some text.")
|
||||
private String text;
|
||||
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is bold.", example = "[0, 15]")
|
||||
List<Range> boldTextBoundaries;
|
||||
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is italic.", example = "[0, 15]")
|
||||
List<Range> italicTextBoundaries;
|
||||
@Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
|
||||
List<Integer> linebreaks;
|
||||
@Schema(description = "The classification of this Paragraph.", allowableValues = "{paragraph, headline, header, footer}")
|
||||
private String classification;
|
||||
|
||||
@Schema(description = "Describes the text orientation of this semantic node. Any semantic node only has a single text orientation.", allowableValues = "{ZERO, QUARTER_CIRCLE, HALF_CIRCLE, THREE_QUARTER_CIRCLE}")
|
||||
private String orientation;
|
||||
@Schema(description = "Describes the text direction in degrees of this semantic node. Any semantic node only has a single text direction.", minimum = "0", maximum = "359")
|
||||
private int textDirection;
|
||||
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
|
||||
@Schema(description = "Object specifying the start and end offsets of a text range in string offsets.")
|
||||
public record Range(int start, int end) {
    // Pure value carrier for a pair of string offsets: no validation is performed and the
    // accessors, equals, hashCode and toString are the ones generated for the record.
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Builder
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing a simplified version of the document structure. This simplified form only knows Paragraphs and Tables. The Paragraph Objects might be a Paragraph, Headline, Header or Footer.")
|
||||
public class ResearchDocumentData {
|
||||
|
||||
@Schema(description = "File name of the original uploaded file.")
|
||||
String originalFile;
|
||||
@Schema(description = "A List of all paragraphs/headline or table objects, that have been parsed in this document.")
|
||||
List<StructureObject> structureObjects;
|
||||
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing information about a Table Row.")
|
||||
public class RowData {
|
||||
|
||||
@Schema(description = "Boolean indicating whether this table row is classified as a header row.")
|
||||
boolean header;
|
||||
@Schema(description = "A list of Objects containing information about the text in each cell of this row.")
|
||||
List<ParagraphData> cellText;
|
||||
@Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.")
|
||||
float[] bBox;
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing information about either a Paragraph/Headline/Header/Footer or a Table.")
|
||||
public class StructureObject {
|
||||
|
||||
@Schema(description = "The ID of this StructureObject.")
|
||||
Integer structureObjectNumber;
|
||||
@Schema(description = "The Tree ID of this StructureObject.")
|
||||
List<Integer> treeId;
|
||||
@Schema(description = "This value indicates the start of the string offsets in this Object, with respect to the reading order.")
|
||||
int page;
|
||||
@Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.")
|
||||
int stringOffset;
|
||||
@Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.", example = "[100, 100, 50, 50]")
|
||||
float[] boundingBox;
|
||||
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer. Either this or table is null.")
|
||||
ParagraphData paragraph;
|
||||
@Schema(description = "Object containing information about a Table. Either this or paragraph is null.")
|
||||
TableData table;
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Schema(description = "Object containing information about a Table.")
|
||||
public class TableData {
|
||||
|
||||
@Schema(description = "A list of Objects containing information about all rows in this table.")
|
||||
List<RowData> rowData;
|
||||
@Schema(description = "Number of columns in this table.")
|
||||
Integer numberOfCols;
|
||||
@Schema(description = "Number of rows in this table.")
|
||||
Integer numberOfRows;
|
||||
|
||||
}
|
||||
@ -1,148 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Setter;
|
||||
|
||||
@Setter
|
||||
public class Boundary implements Comparable<Boundary> {

    // Half-open integer interval [start, end): start is inclusive, end is exclusive.
    // The constructor rejects start > end; the setters that are generated for this class do not re-check it.

    private int start;
    private int end;


    /**
     * @param start inclusive start index
     * @param end   exclusive end index
     * @throws IllegalArgumentException if {@code start > end}
     */
    public Boundary(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
        }
        this.start = start;
        this.end = end;
    }


    /** @return number of indices covered, i.e. {@code end - start} */
    public int length() {

        return end - start;
    }


    /** @return the inclusive start index */
    public int start() {

        return start;
    }


    /** @return the exclusive end index */
    public int end() {

        return end;
    }


    /** @return true if the given boundary lies completely inside this one (shared edges allowed) */
    public boolean contains(Boundary boundary) {

        return start <= boundary.start() && boundary.end() <= end;
    }


    /** @return true if this boundary lies completely inside the given one (shared edges allowed) */
    public boolean containedBy(Boundary boundary) {

        return boundary.contains(this);
    }


    /**
     * @return true if the range {@code [start, end)} lies completely inside this boundary
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean contains(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
        }
        return this.start <= start && end <= this.end;
    }


    /**
     * @return true if this boundary lies completely inside the range {@code [start, end)}
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean containedBy(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
        }
        return start <= this.start && this.end <= end;
    }


    /** @return true if {@code start <= index < end} */
    public boolean contains(int index) {

        return start <= index && index < end;
    }


    /**
     * Checks whether this boundary and the given one share at least one index.
     *
     * @param boundary the boundary to test against
     * @return true if the two boundaries overlap
     */
    public boolean intersects(Boundary boundary) {

        // The first two checks only detect an end point of the other boundary lying inside this one.
        // The third check covers the remaining case where the other boundary encloses this one completely,
        // which was previously reported as "no intersection".
        return contains(boundary.start()) || contains(boundary.end() - 1) || boundary.contains(start);
    }


    /**
     * Splits this boundary at the given indices into consecutive sub-boundaries. Splits which would
     * create an empty boundary (index equal to the previous split point) are skipped.
     *
     * @param splitIndices indices to split at; each must be contained in this boundary and they are expected in ascending order
     * @return the consecutive sub-boundaries, together covering {@code [start, end)}
     * @throws IndexOutOfBoundsException if any index is not contained in this boundary
     * @throws IllegalArgumentException  if the indices are not ascending
     */
    public List<Boundary> split(List<Integer> splitIndices) {

        List<Integer> outOfRange = splitIndices.stream().filter(idx -> !this.contains(idx)).toList();
        if (!outOfRange.isEmpty()) {
            throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", outOfRange, this));
        }
        List<Boundary> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int splitIndex : splitIndices) {

            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new Boundary(previousIndex, end));
        return splitBoundaries;
    }


    /**
     * @param boundaries boundaries to merge
     * @return the smallest boundary spanning all given boundaries
     * @throws IllegalArgumentException if the list is empty
     */
    public static Boundary merge(List<Boundary> boundaries) {

        int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
        int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
        return new Boundary(minStart, maxEnd);
    }


    @Override
    public String toString() {

        return String.format("Boundary [%d|%d)", start, end);
    }


    /**
     * Orders boundaries only when both start and end are strictly smaller (or strictly larger);
     * every other pair, e.g. nested boundaries, compares as 0.
     * NOTE(review): this is therefore not consistent with {@link #equals(Object)}; left unchanged because
     * callers may rely on the current ordering - confirm before using it as a key order in sorted collections.
     */
    @Override
    public int compareTo(Boundary boundary) {

        if (end < boundary.end() && start < boundary.start()) {
            return -1;
        }
        if (start > boundary.start() && end > boundary.end()) {
            return 1;
        }

        return 0;
    }


    /** Hash derived from the string form; kept as is so that existing hash values do not change. */
    @Override
    public int hashCode() {

        return toString().hashCode();
    }


    /**
     * Two boundaries are equal when both start and end match. Comparing hash codes, as done before,
     * threw on {@code null}, matched unrelated objects with the same hash and suffered from collisions.
     */
    @Override
    public boolean equals(Object object) {

        if (this == object) {
            return true;
        }
        if (!(object instanceof Boundary other)) {
            return false;
        }
        return start == other.start && end == other.end;
    }

}
|
||||
@ -1,101 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class DocumentGraph implements SemanticNode {
|
||||
|
||||
Set<PageNode> pages;
|
||||
TableOfContents tableOfContents;
|
||||
Integer numberOfPages;
|
||||
TextBlock textBlock;
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<SectionNode> getMainSections() {
|
||||
|
||||
return streamChildren().filter(node -> node instanceof SectionNode).map(node -> (SectionNode) node).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public Stream<TextBlock> streamTerminalTextBlocksInOrder() {
|
||||
|
||||
return streamAllNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock);
|
||||
}
|
||||
|
||||
|
||||
public Set<EntityNode> getEntities() {
|
||||
|
||||
return streamAllSubNodes().map(SemanticNode::getEntities).flatMap(Set::stream).collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Integer> getTocId() {
|
||||
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setTocId(List<Integer> tocId) {
|
||||
|
||||
throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents");
|
||||
}
|
||||
|
||||
|
||||
private Stream<SemanticNode> streamAllNodes() {
|
||||
|
||||
return tableOfContents.streamAllEntriesInOrder().map(TableOfContents.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return NodeType.DOCUMENT + ": " + buildTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBox = new HashMap<>();
|
||||
for (PageNode page : pages) {
|
||||
bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
|
||||
}
|
||||
return bBox;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,193 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
public class TableOfContents {
|
||||
|
||||
private final Entry root;
|
||||
|
||||
|
||||
public TableOfContents(DocumentGraph documentGraph) {
|
||||
|
||||
root = Entry.builder().tocId(Collections.emptyList()).type(NodeType.DOCUMENT).children(new LinkedList<>()).node(documentGraph).build();
|
||||
}
|
||||
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return streamAllEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewMainEntryAndReturnId(NodeType nodeType, SemanticNode node) {
|
||||
|
||||
return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {
|
||||
|
||||
if (!entryExists(parentId)) {
|
||||
throw new UnsupportedOperationException(format("parentId %s does not exist!", parentId));
|
||||
}
|
||||
|
||||
Entry parent = getEntryById(parentId);
|
||||
List<Integer> newId = new LinkedList<>(parentId);
|
||||
newId.add(parent.children.size());
|
||||
parent.children.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
|
||||
|
||||
return newId;
|
||||
}
|
||||
|
||||
|
||||
private boolean entryExists(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
return root != null;
|
||||
}
|
||||
Entry entry = root.children.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
if (id >= entry.children.size() || 0 > id) {
|
||||
return false;
|
||||
}
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public Entry getParentEntryById(List<Integer> tocId) {
|
||||
|
||||
return getEntryById(getParentId(tocId));
|
||||
}
|
||||
|
||||
|
||||
public boolean hasParentById(List<Integer> tocId) {
|
||||
|
||||
return entryExists(getParentId(tocId));
|
||||
}
|
||||
|
||||
|
||||
public Stream<SemanticNode> streamChildrenNodes(List<Integer> tocId) {
|
||||
|
||||
return getEntryById(tocId).children.stream().map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getParentId(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
throw new UnsupportedOperationException("Root has no parent!");
|
||||
}
|
||||
if (tocId.size() < 2) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return tocId.subList(0, tocId.size() - 1);
|
||||
}
|
||||
|
||||
|
||||
public Entry getEntryById(List<Integer> tocId) {
|
||||
|
||||
if (tocId.isEmpty()) {
|
||||
return root;
|
||||
}
|
||||
Entry entry = root.children.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> streamMainEntries() {
|
||||
|
||||
return root.children.stream();
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> streamAllEntriesInOrder() {
|
||||
|
||||
return Stream.of(root).flatMap(TableOfContents::flatten);
|
||||
}
|
||||
|
||||
|
||||
public Stream<Entry> streamAllSubEntriesInOrder(List<Integer> parentId) {
|
||||
|
||||
return getEntryById(parentId).getChildren().stream().flatMap(TableOfContents::flatten);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", streamAllEntriesInOrder().map(Entry::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
public String toString(List<Integer> id) {
|
||||
|
||||
return String.join("\n", streamAllSubEntriesInOrder(id).map(Entry::toString).toList());
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Entry> flatten(Entry entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(TableOfContents::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public static class Entry {
|
||||
|
||||
List<Integer> tocId;
|
||||
NodeType type;
|
||||
SemanticNode node;
|
||||
List<Entry> children;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return node.toString();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof Entry && o.hashCode() == this.hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,76 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
|
||||
public interface EntityNode {
|
||||
|
||||
/**
|
||||
* This represents the text, which is contained within the boundary of the Entity.
|
||||
*
|
||||
* @return String
|
||||
*/
|
||||
String getValue();
|
||||
|
||||
|
||||
/**
|
||||
* The Boundary primarily defines the Entity, all other values may be inferred from it.
|
||||
*
|
||||
* @return Boundary, uniquely identifying this Entity
|
||||
*/
|
||||
Boundary getBoundary();
|
||||
|
||||
|
||||
/**
|
||||
* The deepest fully containing node represents the node which is the deepest node in the document tree structure,
|
||||
* whose boundary also fully contains the boundary of this entity.
|
||||
*
|
||||
* @return the deepest fully containing node
|
||||
*/
|
||||
SemanticNode getDeepestFullyContainingNode();
|
||||
|
||||
|
||||
/**
|
||||
* The intersecting nodes represent all nodes, whose boundary intersects the boundary of this entity.
|
||||
*
|
||||
* @return all intersecting Nodes
|
||||
*/
|
||||
List<SemanticNode> getIntersectingNodes();
|
||||
|
||||
|
||||
void setDeepestFullyContainingNode(SemanticNode semanticNode);
|
||||
|
||||
|
||||
void addIntersectingNode(SemanticNode semanticNode);
|
||||
|
||||
|
||||
void setIntersectingNodes(List<SemanticNode> semanticNodes);
|
||||
|
||||
|
||||
/**
|
||||
* @return all pages this entity intersects.
|
||||
*/
|
||||
Set<PageNode> getPages();
|
||||
|
||||
|
||||
void setPages(Set<PageNode> pages);
|
||||
|
||||
|
||||
/**
|
||||
* removes all occurrences of this node in the graph and resets all graph specific fields.
|
||||
*/
|
||||
default void removeFromGraph() {
|
||||
|
||||
getIntersectingNodes().forEach(node -> node.getEntities().remove(this));
|
||||
getPages().forEach(page -> page.getEntities().remove(this));
|
||||
setPages(Collections.emptySet());
|
||||
setDeepestFullyContainingNode(null);
|
||||
setIntersectingNodes(Collections.emptyList());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,45 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class EntityPosition {
|
||||
|
||||
PageNode pageNode;
|
||||
List<Rectangle2D> rectanglePerLine;
|
||||
|
||||
|
||||
public String getId() {
|
||||
|
||||
return String.valueOf(hashCode());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(pageNode.getNumber());
|
||||
rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight()));
|
||||
return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof EntityPosition && o.hashCode() == this.hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,53 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class FooterNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.FOOTER + ": " + terminalTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,53 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class HeaderNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.HEADER + ": " + terminalTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,60 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class HeadlineNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.HEADLINE + ": " + terminalTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public SemanticNode getHeadline() {
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,87 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ImageNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
|
||||
ImageType imageType;
|
||||
boolean transparency;
|
||||
Rectangle2D position;
|
||||
|
||||
|
||||
boolean redaction;
|
||||
boolean ignored;
|
||||
|
||||
@Builder.Default
|
||||
String redactionReason = "";
|
||||
@Builder.Default
|
||||
String legalBasis = "";
|
||||
@Builder.Default
|
||||
int matchedRule = -1;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
PageNode page;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<PageNode> getPages() {
|
||||
|
||||
return Collections.singleton(page);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
bBoxPerPage.put(page, position);
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,9 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
/**
 * Categories an image node can be tagged with.
 *
 * <p>The declaration order is part of the type (ordinals, {@code values()}) and must not be rearranged.
 */
public enum ImageType {

    LOGO,
    FORMULA,
    SIGNATURE,
    OTHER,
    OCR;

}
|
||||
@ -1,13 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
/**
 * Labels for the kinds of nodes in the document graph; the node classes print them in their {@code toString()}.
 *
 * <p>The declaration order is part of the type (ordinals, {@code values()}) and must not be rearranged.
 */
public enum NodeType {

    DOCUMENT,
    SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;

}
|
||||
@ -1,71 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class PageNode {
|
||||
|
||||
Integer number;
|
||||
Integer height;
|
||||
Integer width;
|
||||
Integer rotation;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
List<SemanticNode> mainBody;
|
||||
@EqualsAndHashCode.Exclude
|
||||
HeaderNode header;
|
||||
@EqualsAndHashCode.Exclude
|
||||
FooterNode footer;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<ImageNode> images = new HashSet<>();
|
||||
|
||||
|
||||
public TextBlock getMainBodyTextBlock() {
|
||||
|
||||
return mainBody.stream().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return String.valueOf(number);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return number;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof PageNode && o.hashCode() == this.hashCode();
|
||||
}
|
||||
}
|
||||
@ -1,51 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ParagraphNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.PARAGRAPH + ": " + terminalTextBlock.buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,63 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class SectionNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
|
||||
TextBlock textBlock;
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId.toString() + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public HeadlineNode getHeadline() {
|
||||
|
||||
return streamChildren().filter(node -> node instanceof HeadlineNode)
|
||||
.map(node -> (HeadlineNode) node)
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!"));
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,275 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
|
||||
|
||||
public interface SemanticNode {
|
||||
|
||||
/**
|
||||
* Searches all Nodes located underneath this Node in the TableOfContents and concatenates their AtomicTextBlocks into a single TextBlockEntity.
|
||||
* So, for a ClassificationSection all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlockEntity
|
||||
* If the Node is Terminal, the TerminalTextBlock will be returned instead.
|
||||
*
|
||||
* @return ClassificationTextBlock containing all AtomicTextBlocks that are located under this Node.
|
||||
*/
|
||||
TextBlock buildTextBlock();
|
||||
|
||||
|
||||
/**
|
||||
* Any Node maintains its own Set of Entities.
|
||||
* This Set contains all Entities whose boundary intersects the boundary of this node.
|
||||
*
|
||||
* @return Set of all Entities associated with this Node
|
||||
*/
|
||||
Set<EntityNode> getEntities();
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock.
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Set<PageNode> getPages() {
|
||||
|
||||
return buildTextBlock().getPages();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return the TableOfContents of the ClassificationDocument this node belongs to
|
||||
*/
|
||||
TableOfContents getTableOfContents();
|
||||
|
||||
|
||||
/**
|
||||
* The id is a List of Integers uniquely identifying this node in the TableOfContents.
|
||||
*
|
||||
* @return the TableOfContents ID
|
||||
*/
|
||||
List<Integer> getTocId();
|
||||
|
||||
|
||||
/**
|
||||
* This should only be used during graph construction.
|
||||
*
|
||||
* @param tocId List of Integers
|
||||
*/
|
||||
void setTocId(List<Integer> tocId);
|
||||
|
||||
|
||||
/**
|
||||
* Traverses the Tree up, until it hits a HeadlineNode or hits a SectionNode which will then return the first HeadlineNode from its children.
|
||||
* Throws NotFoundException if no Headline is found this way
|
||||
*
|
||||
* @return First HeadlineNode found
|
||||
*/
|
||||
default SemanticNode getHeadline() {
|
||||
|
||||
return getParent().getHeadline();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return boolean indicating wether this Node has a Parent in the TableOfContents
|
||||
*/
|
||||
default boolean hasParent() {
|
||||
|
||||
return getTableOfContents().hasParentById(getTocId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return The SemanticNode representing the Parent in the TableOfContents
|
||||
* throws NotFoundException, when no parent is present
|
||||
*/
|
||||
default SemanticNode getParent() {
|
||||
|
||||
return getTableOfContents().getParentEntryById(getTocId()).getNode();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden.
|
||||
* Currently only Sections, Images, and Tables are not terminal.
|
||||
* A TableCell might be Terminal depending on its area compared to the page.
|
||||
*
|
||||
* @return boolean, indicating if a Node has direct access to a ClassificationTextBlock
|
||||
*/
|
||||
default boolean isTerminal() {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden.
|
||||
* Currently only Sections and Tables are not terminal.
|
||||
*
|
||||
* @return AtomicTextBlock
|
||||
*/
|
||||
default TextBlock getTerminalTextBlock() {
|
||||
|
||||
throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!");
|
||||
}
|
||||
|
||||
|
||||
default void setTerminalTextBlock(TextBlock textBlock) {
|
||||
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
|
||||
* If this node does not have any AtomicTexBlocks underneath it, e.g. an empty TableCell. It returns -1.
|
||||
*
|
||||
* @return Integer representing the number on the page
|
||||
*/
|
||||
default Integer getNumberOnPage() {
|
||||
|
||||
TextBlock textBlock = buildTextBlock();
|
||||
if (textBlock.getAtomicTextBlocks().size() > 0) {
|
||||
return buildTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return true, if this node's ClassificationTextBlock is not empty
|
||||
*/
|
||||
default boolean hasText() {
|
||||
|
||||
return buildTextBlock().length() > 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param string A String which the ClassificationTextBlock might contain
|
||||
* @return true, if this node's ClassificationTextBlock contains the string
|
||||
*/
|
||||
default boolean containsString(String string) {
|
||||
|
||||
return buildTextBlock().getSearchText().contains(string);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param strings A List of Strings which the ClassificationTextBlock might contain
|
||||
* @return true, if this node's ClassificationTextBlock contains any of the strings
|
||||
*/
|
||||
default boolean containsAnyString(List<String> strings) {
|
||||
|
||||
return strings.stream().anyMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the EntityNode intersects or even contains the EntityNode.
|
||||
* It sets the fields accordingly and recursively calls this function on all its children.
|
||||
*
|
||||
* @param entityNode EntityNode, which is being inserted into the graph
|
||||
*/
|
||||
default void addThisToEntityIfIntersects(EntityNode entityNode) {
|
||||
|
||||
TextBlock textBlock = buildTextBlock();
|
||||
if (textBlock.getBoundary().intersects(entityNode.getBoundary())) {
|
||||
|
||||
if (textBlock.containsBoundary(entityNode.getBoundary())) {
|
||||
entityNode.setDeepestFullyContainingNode(this);
|
||||
}
|
||||
|
||||
entityNode.addIntersectingNode(this);
|
||||
streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the TableOfContents.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
default Stream<SemanticNode> streamChildren() {
|
||||
|
||||
return getTableOfContents().streamChildrenNodes(getTocId());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* recursively streams all SemanticNodes located underneath this node in the TableOfContents in order.
|
||||
*
|
||||
* @return Stream of all SubNodes
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodes() {
|
||||
|
||||
return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @return Boundary of this Node's ClassificationTextBlock
|
||||
*/
|
||||
default Boundary getBoundary() {
|
||||
|
||||
return buildTextBlock().getBoundary();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* If this Node is Terminal it will calculate the boundingBox of its TerminalTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
|
||||
* If called on the ClassificationDocument, it will return the cropbox of each page
|
||||
*
|
||||
* @return Rectangle2D fully encapsulating this Node for each page.
|
||||
*/
|
||||
default Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
if (isTerminal()) {
|
||||
return getBBoxFromTerminalTextBlock(bBoxPerPage);
|
||||
}
|
||||
|
||||
return getBBoxFromChildren(bBoxPerPage);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* TODO this does not yet work for sections spanning multiple columns.
|
||||
*
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of the BoundingBoxes of all children
|
||||
*/
|
||||
private Map<PageNode, Rectangle2D> getBBoxFromChildren(Map<PageNode, Rectangle2D> bBoxPerPage) {
|
||||
|
||||
return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
|
||||
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
||||
return map2;
|
||||
}).orElse(bBoxPerPage);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of all BoundingBoxes of the ClassificationTextBlock of this node
|
||||
*/
|
||||
private Map<PageNode, Rectangle2D> getBBoxFromTerminalTextBlock(Map<PageNode, Rectangle2D> bBoxPerPage) {
|
||||
|
||||
Map<PageNode, List<AtomicTextBlock>> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,92 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableCellNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
int row;
|
||||
int col;
|
||||
boolean header;
|
||||
|
||||
Rectangle2D bBox;
|
||||
|
||||
@Builder.Default
|
||||
boolean terminal = true;
|
||||
TextBlock terminalTextBlock;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public Map<PageNode, Rectangle2D> getBBox() {
|
||||
|
||||
Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
getPages().forEach(page -> bBoxPerPage.put(page, bBox));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
if (terminal) {
|
||||
return terminalTextBlock;
|
||||
}
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId + ": " + NodeType.TABLE_CELL + ": " + buildTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
|
||||
public boolean hasHeader(String headerString) {
|
||||
|
||||
return getHeaders().anyMatch(header -> header.buildTextBlock().getSearchText().strip().equals(headerString));
|
||||
}
|
||||
|
||||
|
||||
private Stream<TableCellNode> getHeaders() {
|
||||
|
||||
TableNode tableNode = (TableNode) getParent();
|
||||
return tableNode.streamHeadersForCell(row, col);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,73 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class TableNode implements SemanticNode {
|
||||
|
||||
List<Integer> tocId;
|
||||
TableOfContents tableOfContents;
|
||||
|
||||
Integer numberOfRows;
|
||||
Integer numberOfCols;
|
||||
|
||||
TextBlock textBlock;
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
public Stream<TableCellNode> streamTableCells() {
|
||||
|
||||
return streamChildren().map(node -> (TableCellNode) node);
|
||||
}
|
||||
|
||||
|
||||
public Stream<TableCellNode> streamHeaders() {
|
||||
|
||||
return streamTableCells().filter(TableCellNode::isHeader);
|
||||
}
|
||||
|
||||
|
||||
public Stream<TableCellNode> streamHeadersForCell(int row, int col) {
|
||||
|
||||
return streamHeaders().filter(cell -> cell.getRow() == row || cell.getCol() == col);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return tocId.toString() + ": " + NodeType.TABLE + ": " + buildTextBlock().buildSummary();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,131 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
Long id;
|
||||
Integer numberOnPage;
|
||||
PageNode page;
|
||||
|
||||
//string coordinates
|
||||
Boundary boundary;
|
||||
String searchText;
|
||||
List<Integer> lineBreaks;
|
||||
|
||||
//position coordinates
|
||||
List<Integer> stringIdxToPositionIdx;
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
SemanticNode parent;
|
||||
|
||||
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return lineBreaks.size() + 1;
|
||||
}
|
||||
|
||||
|
||||
public CharSequence getLine(int lineNumber) {
|
||||
|
||||
if (lineNumber >= numberOfLines() || lineNumber < 0) {
|
||||
throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||
}
|
||||
if (lineNumber == 0) {
|
||||
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
|
||||
} else if (lineNumber == numberOfLines() - 1) {
|
||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
||||
}
|
||||
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<AtomicTextBlock> getAtomicTextBlocks() {
|
||||
|
||||
return List.of(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
|
||||
.findFirst() //
|
||||
.orElse(searchText.length()) + boundary.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
|
||||
.reduce((a, b) -> b)//
|
||||
.orElse(0) + boundary.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
||||
|
||||
if (!containsBoundary(stringBoundary)) {
|
||||
throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary));
|
||||
}
|
||||
|
||||
if (stringBoundary.end() == this.boundary.end()) {
|
||||
return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()), positions.size());
|
||||
}
|
||||
|
||||
return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()),
|
||||
stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
|
||||
}
|
||||
|
||||
|
||||
public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {
|
||||
|
||||
List<Rectangle2D> positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList())
|
||||
.stream()
|
||||
.map(this::getPositions)
|
||||
.map(RectangleTransformations::rectangleUnion)
|
||||
.toList();
|
||||
|
||||
return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return searchText;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,179 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
List<AtomicTextBlock> atomicTextBlocks;
|
||||
String searchText;
|
||||
Boundary boundary;
|
||||
|
||||
|
||||
public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
this.atomicTextBlocks = new LinkedList<>();
|
||||
if (atomicTextBlocks.isEmpty()) {
|
||||
boundary = new Boundary(-1, -1);
|
||||
return;
|
||||
}
|
||||
var firstTextBlock = atomicTextBlocks.get(0);
|
||||
this.atomicTextBlocks.add(firstTextBlock);
|
||||
boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());
|
||||
|
||||
atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
|
||||
}
|
||||
|
||||
|
||||
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
||||
|
||||
if (this.atomicTextBlocks.isEmpty()) {
|
||||
boundary.setStart(textBlock.getBoundary().start());
|
||||
boundary.setEnd(textBlock.getBoundary().end());
|
||||
} else if (boundary.end() != textBlock.getBoundary().start()) {
|
||||
throw new UnsupportedOperationException(String.format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
|
||||
}
|
||||
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
||||
boundary.setEnd(textBlock.getBoundary().end());
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
||||
|
||||
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
||||
}
|
||||
|
||||
|
||||
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {
|
||||
|
||||
return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getSearchText() {
|
||||
|
||||
if (searchText == null) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
|
||||
searchText = sb.toString();
|
||||
}
|
||||
return searchText;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int numberOfLines() {
|
||||
|
||||
return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Integer> getLineBreaks() {
|
||||
|
||||
return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getPositions(stringBoundary);
|
||||
}
|
||||
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
positions.addAll(textBlock.getPositions());
|
||||
}
|
||||
|
||||
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
||||
|
||||
return positions;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getEntityPositionsPerPage(stringBoundary);
|
||||
}
|
||||
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
List<EntityPosition> positions = new LinkedList<>(firstTextBlock.getEntityPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
positions.addAll(textBlock.getEntityPositionsPerPage(textBlock.getBoundary()));
|
||||
}
|
||||
|
||||
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
positions.addAll(lastTextBlock.getEntityPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
||||
|
||||
return mergeEntityPositionsWithSamePageNode(positions);
|
||||
}
|
||||
|
||||
|
||||
private List<EntityPosition> mergeEntityPositionsWithSamePageNode(List<EntityPosition> positions) {
|
||||
|
||||
Map<PageNode, List<Rectangle2D>> entityPositionsPerPage = positions.stream().collect(//
|
||||
Collectors.groupingBy(EntityPosition::getPageNode, //
|
||||
Collectors.flatMapping(entityPosition -> entityPosition.getRectanglePerLine().stream(), Collectors.toList())));
|
||||
|
||||
return entityPositionsPerPage.entrySet().stream()//
|
||||
.map(entry -> EntityPosition.builder().pageNode(entry.getKey()).rectanglePerLine(entry.getValue()).build())//
|
||||
.toList();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return getSearchText();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,125 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
|
||||
public interface TextBlock extends CharSequence {
|
||||
|
||||
String getSearchText();
|
||||
|
||||
|
||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||
|
||||
|
||||
Boundary getBoundary();
|
||||
|
||||
|
||||
int getNextLinebreak(int fromIndex);
|
||||
|
||||
|
||||
int getPreviousLinebreak(int fromIndex);
|
||||
|
||||
|
||||
List<Integer> getLineBreaks();
|
||||
|
||||
|
||||
Rectangle2D getPosition(int stringIdx);
|
||||
|
||||
|
||||
List<Rectangle2D> getPositions(Boundary stringBoundary);
|
||||
|
||||
|
||||
List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary);
|
||||
|
||||
|
||||
int numberOfLines();
|
||||
|
||||
|
||||
default int indexOf(String searchTerm) {
|
||||
|
||||
return indexOf(searchTerm, getBoundary().start());
|
||||
}
|
||||
|
||||
|
||||
default Set<PageNode> getPages() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
|
||||
|
||||
default int indexOf(String searchTerm, int startOffset) {
|
||||
|
||||
int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
|
||||
if (start == -1) {
|
||||
return -1;
|
||||
}
|
||||
return start + getBoundary().start();
|
||||
}
|
||||
|
||||
|
||||
default CharSequence getFirstLine() {
|
||||
|
||||
return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
|
||||
}
|
||||
|
||||
|
||||
default boolean containsBoundary(Boundary boundary) {
|
||||
|
||||
if (boundary.end() < boundary.start()) {
|
||||
throw new IllegalArgumentException(String.format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
|
||||
}
|
||||
return getBoundary().contains(boundary);
|
||||
}
|
||||
|
||||
|
||||
default boolean containsIndex(int stringIndex) {
|
||||
|
||||
return getBoundary().contains(stringIndex);
|
||||
}
|
||||
|
||||
|
||||
default CharSequence subSequence(Boundary boundary) {
|
||||
|
||||
return subSequence(boundary.start(), boundary.end());
|
||||
}
|
||||
|
||||
|
||||
default String buildSummary() {
|
||||
|
||||
String[] words = getSearchText().split(" ");
|
||||
int bound = Math.min(words.length, 4);
|
||||
List<String> list = new ArrayList<>(Arrays.asList(words).subList(0, bound));
|
||||
|
||||
return String.join(" ", list);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default CharSequence subSequence(int start, int end) {
|
||||
|
||||
return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default int length() {
|
||||
|
||||
return getBoundary().length();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default char charAt(int index) {
|
||||
|
||||
return getSearchText().charAt(index - getBoundary().start());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,50 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@NoArgsConstructor
|
||||
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {
|
||||
|
||||
@Override
|
||||
public Supplier<ConcatenatedTextBlock> supplier() {
|
||||
|
||||
return () -> new ConcatenatedTextBlock(Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<ConcatenatedTextBlock> combiner() {
|
||||
|
||||
return ConcatenatedTextBlock::concat;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<ConcatenatedTextBlock, TextBlock> finisher() {
|
||||
|
||||
return a -> a;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,146 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DocumentDataMapper {
|
||||
|
||||
public DocumentData toDocumentData(DocumentGraph documentGraph) {
|
||||
|
||||
List<AtomicTextBlockData> atomicTextBlockData = documentGraph.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicTextBlockData)
|
||||
.toList();
|
||||
|
||||
List<AtomicPositionBlockData> atomicPositionBlockData = documentGraph.streamTerminalTextBlocksInOrder()
|
||||
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
|
||||
.distinct()
|
||||
.map(DocumentDataMapper::toAtomicPositionBlockData)
|
||||
.toList();
|
||||
|
||||
List<PageData> pageData = documentGraph.getPages().stream().map(DocumentDataMapper::toPageData).toList();
|
||||
TableOfContentsData tableOfContentsData = toTableOfContentsData(documentGraph.getTableOfContents());
|
||||
return DocumentData.builder()
|
||||
.atomicTextBlocks(atomicTextBlockData.toArray(new AtomicTextBlockData[0]))
|
||||
.atomicPositionBlocks(atomicPositionBlockData.toArray(new AtomicPositionBlockData[0]))
|
||||
.pages(pageData.toArray(new PageData[0]))
|
||||
.tableOfContents(tableOfContentsData)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private TableOfContentsData toTableOfContentsData(TableOfContents tableOfContents) {
|
||||
|
||||
return new TableOfContentsData(toEntryData(tableOfContents.getRoot()));
|
||||
}
|
||||
|
||||
|
||||
private TableOfContentsData.EntryData toEntryData(TableOfContents.Entry entry) {
|
||||
|
||||
Long[] atomicTextBlocks;
|
||||
|
||||
if (entry.getNode().isTerminal()) {
|
||||
atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getTerminalTextBlock());
|
||||
} else {
|
||||
atomicTextBlocks = new Long[]{};
|
||||
}
|
||||
|
||||
Map<String, String> properties = switch (entry.getType()) {
|
||||
case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.getNode());
|
||||
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.getNode());
|
||||
case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.getNode());
|
||||
default -> new HashMap<>();
|
||||
};
|
||||
|
||||
return TableOfContentsData.EntryData.builder()
|
||||
.tocId(toPrimitiveIntArray(entry.getTocId()))
|
||||
.subEntries(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
|
||||
.type(entry.getType())
|
||||
.atomicBlocks(atomicTextBlocks)
|
||||
.pages(entry.getNode().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new))
|
||||
.properties(properties)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
|
||||
|
||||
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
|
||||
}
|
||||
|
||||
|
||||
private PageData toPageData(PageNode p) {
|
||||
|
||||
return PageData.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build();
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlockData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return AtomicTextBlockData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||
.searchText(atomicTextBlock.getSearchText())
|
||||
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||
.start(atomicTextBlock.getBoundary().start())
|
||||
.end(atomicTextBlock.getBoundary().end())
|
||||
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private AtomicPositionBlockData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
|
||||
|
||||
return AtomicPositionBlockData.builder()
|
||||
.id(atomicTextBlock.getId())
|
||||
.positions(toPrimitiveFloatMatrix(atomicTextBlock.getPositions()))
|
||||
.stringIdxToPositionIdx(toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
|
||||
|
||||
float[][] positionMatrix = new float[positions.size()][];
|
||||
for (int i = 0; i < positions.size(); i++) {
|
||||
float[] singlePositions = new float[4];
|
||||
singlePositions[0] = (float) positions.get(i).getMinX();
|
||||
singlePositions[1] = (float) positions.get(i).getMinY();
|
||||
singlePositions[2] = (float) positions.get(i).getWidth();
|
||||
singlePositions[3] = (float) positions.get(i).getHeight();
|
||||
positionMatrix[i] = singlePositions;
|
||||
}
|
||||
return positionMatrix;
|
||||
}
|
||||
|
||||
|
||||
private int[] toPrimitiveIntArray(List<Integer> list) {
|
||||
|
||||
int[] array = new int[list.size()];
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
array[i] = list.get(i);
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,229 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER;
|
||||
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.google.common.primitives.Ints;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DocumentGraphMapper {
|
||||
|
||||
public DocumentGraph toDocumentGraph(DocumentData documentData) {
|
||||
|
||||
|
||||
DocumentGraph documentGraph = new DocumentGraph();
|
||||
Context context = new Context(documentData,
|
||||
new TableOfContents(documentGraph),
|
||||
new LinkedList<>(),
|
||||
new LinkedList<>(),
|
||||
Arrays.stream(documentData.getAtomicTextBlocks()).toList(),
|
||||
Arrays.stream(documentData.getAtomicPositionBlocks()).toList());
|
||||
|
||||
context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());
|
||||
|
||||
context.tableOfContents.getRoot().getChildren().addAll(buildEntries(documentData.getTableOfContents().getRoot().getSubEntries(), context));
|
||||
|
||||
documentGraph.setTableOfContents(context.tableOfContents);
|
||||
documentGraph.setPages(new HashSet<>(context.pages));
|
||||
documentGraph.setNumberOfPages(documentData.getPages().length);
|
||||
|
||||
documentGraph.setTextBlock(documentGraph.buildTextBlock());
|
||||
return documentGraph;
|
||||
}
|
||||
|
||||
|
||||
private List<TableOfContents.Entry> buildEntries(List<TableOfContentsData.EntryData> entries,
|
||||
Context context) {
|
||||
|
||||
List<TableOfContents.Entry> newEntries = new LinkedList<>();
|
||||
for (TableOfContentsData.EntryData entryData : entries) {
|
||||
|
||||
boolean terminal = isTerminal(entryData);
|
||||
List<PageNode> pages = Arrays.stream(entryData.getPages()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context, terminal);
|
||||
case HEADLINE -> buildHeadline(context, terminal);
|
||||
case HEADER -> buildHeader(context, terminal);
|
||||
case FOOTER -> buildFooter(context, terminal);
|
||||
case TABLE -> buildTable(context, entryData.getProperties());
|
||||
case TABLE_CELL -> buildTableCell(context, entryData.getProperties(), terminal);
|
||||
case IMAGE -> buildImage(context, entryData.getProperties());
|
||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
||||
};
|
||||
|
||||
if (node.isTerminal()) {
|
||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlocks(), context, node);
|
||||
node.setTerminalTextBlock(textBlock);
|
||||
}
|
||||
List<Integer> tocId = Arrays.stream(entryData.getTocId()).boxed().toList();
|
||||
node.setTocId(tocId);
|
||||
|
||||
if (entryData.getType() == HEADER) {
|
||||
pages.forEach(page -> page.setHeader((HeaderNode) node));
|
||||
} else if (entryData.getType() == FOOTER) {
|
||||
pages.forEach(page -> page.setFooter((FooterNode) node));
|
||||
} else {
|
||||
pages.forEach(page -> page.getMainBody().add(node));
|
||||
}
|
||||
newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.getType()).children(buildEntries(entryData.getSubEntries(), context)).node(node).build());
|
||||
}
|
||||
return newEntries;
|
||||
}
|
||||
|
||||
|
||||
private HeadlineNode buildHeadline(Context context, boolean terminal) {
|
||||
|
||||
return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isTerminal(TableOfContentsData.EntryData entryData) {
|
||||
|
||||
return entryData.getAtomicBlocks().length > 0;
|
||||
}
|
||||
|
||||
|
||||
private ImageNode buildImage(Context context, Map<String, String> properties) {
|
||||
|
||||
var builder = ImageNode.builder();
|
||||
PropertiesMapper.parseImageProperties(properties, builder);
|
||||
return builder.tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private TableCellNode buildTableCell(Context context, Map<String, String> properties, boolean terminal) {
|
||||
|
||||
TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder();
|
||||
PropertiesMapper.parseTableCellProperties(properties, builder);
|
||||
return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private TableNode buildTable(Context context, Map<String, String> properties) {
|
||||
|
||||
TableNode.TableNodeBuilder builder = TableNode.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return TableNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private FooterNode buildFooter(Context context, boolean terminal) {
|
||||
|
||||
return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private HeaderNode buildHeader(Context context, boolean terminal) {
|
||||
|
||||
return HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private SectionNode buildSection(Context context) {
|
||||
|
||||
return SectionNode.builder().tableOfContents(context.tableOfContents()).build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private ParagraphNode buildParagraph(Context context, boolean terminal) {
|
||||
|
||||
return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
|
||||
}
|
||||
|
||||
|
||||
private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {
|
||||
|
||||
return Arrays.stream(atomicTextBlockIds)
|
||||
.map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
|
||||
parent,
|
||||
context))
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
private PageNode buildPage(PageData p) {
|
||||
|
||||
return PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
|
||||
}
|
||||
|
||||
|
||||
private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData,
|
||||
AtomicPositionBlockData atomicPositionBlockData,
|
||||
SemanticNode parent,
|
||||
Context context) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(atomicTextBlockData.getId())
|
||||
.numberOnPage(atomicTextBlockData.getNumberOnPage())
|
||||
.page(getPage(atomicTextBlockData.getPage(), context))
|
||||
.boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
|
||||
.searchText(atomicTextBlockData.getSearchText())
|
||||
.lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks()))
|
||||
.stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx()))
|
||||
.positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
|
||||
|
||||
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
|
||||
}
|
||||
|
||||
|
||||
private PageNode getPage(Long pageIndex, Context context) {
|
||||
|
||||
return context.pages.stream()
|
||||
.filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
|
||||
record Context(
|
||||
DocumentData layoutParsingModel,
|
||||
TableOfContents tableOfContents,
|
||||
List<PageNode> pages,
|
||||
List<SectionNode> sections,
|
||||
List<AtomicTextBlockData> atomicTextBlockData,
|
||||
List<AtomicPositionBlockData> atomicPositionBlockData) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,101 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
|
||||
|
||||
public class PropertiesMapper {
|
||||
|
||||
public static Map<String, String> buildImageProperties(ImageNode image) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put("imageType", image.getImageType().toString());
|
||||
properties.put("transparency", String.valueOf(image.isTransparency()));
|
||||
properties.put("position", RectangleTransformations.toString(image.getPosition()));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, String> buildTableCellProperties(TableCellNode tableCell) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put("row", String.valueOf(tableCell.getRow()));
|
||||
properties.put("col", String.valueOf(tableCell.getCol()));
|
||||
properties.put("header", String.valueOf(tableCell.isHeader()));
|
||||
|
||||
if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
|
||||
throw new IllegalArgumentException("TableCell can only occur on a single page!");
|
||||
}
|
||||
String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get()));
|
||||
properties.put("bBox", bBoxString);
|
||||
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static Map<String, String> buildTableProperties(TableNode table) {
|
||||
|
||||
Map<String, String> properties = new HashMap<>();
|
||||
properties.put("numberOfRows", String.valueOf(table.getNumberOfRows()));
|
||||
properties.put("numberOfCols", String.valueOf(table.getNumberOfCols()));
|
||||
return properties;
|
||||
}
|
||||
|
||||
|
||||
public static void parseImageProperties(Map<String, String> properties, ImageNode.ImageNodeBuilder builder) {
|
||||
|
||||
builder.imageType(parseImageType(properties.get("imageType")));
|
||||
builder.transparency(Boolean.parseBoolean(properties.get("transparency")));
|
||||
builder.position(parseRectangle2D(properties.get("position")));
|
||||
}
|
||||
|
||||
|
||||
public static void parseTableCellProperties(Map<String, String> properties, TableCellNode.TableCellNodeBuilder builder) {
|
||||
|
||||
builder.row(Integer.parseInt(properties.get("row")));
|
||||
builder.col(Integer.parseInt(properties.get("col")));
|
||||
builder.header(Boolean.parseBoolean(properties.get("header")));
|
||||
builder.bBox(parseRectangle2D(properties.get("bBox")));
|
||||
}
|
||||
|
||||
|
||||
public static void parseTableProperties(Map<String, String> properties, TableNode.TableNodeBuilder builder) {
|
||||
|
||||
builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows")));
|
||||
builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols")));
|
||||
}
|
||||
|
||||
|
||||
private static ImageType parseImageType(String imageType) {
|
||||
|
||||
return switch (imageType) {
|
||||
case "LOGO" -> ImageType.LOGO;
|
||||
case "FORMULA" -> ImageType.FORMULA;
|
||||
case "SIGNATURE" -> ImageType.SIGNATURE;
|
||||
case "OCR" -> ImageType.OCR;
|
||||
default -> ImageType.OTHER;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return String.format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,9 +2,26 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.Builder;
|
||||
|
||||
@Builder
|
||||
public record LayoutParsingFinishedEvent(Map<String, String> identifier, long duration, int numberOfPages, String message) {
|
||||
@Schema(description = "Object containing information about the layout parsing.")
|
||||
public record LayoutParsingFinishedEvent(
|
||||
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
|
||||
Map<String, String> identifier,
|
||||
|
||||
@Schema(description = "The duration of a single layout parsing in ms.") //
|
||||
long duration,
|
||||
|
||||
@Schema(description = "The number of pages of the parsed document.") //
|
||||
int numberOfPages,
|
||||
|
||||
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
|
||||
String message,
|
||||
|
||||
@Schema(description = "The app version of the layout parser.") //
|
||||
String layoutParserVersion
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
public class LayoutParsingQueueNames {
|
||||
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "LAYOUTPARSING_REQUEST_QUEUE";
|
||||
public static final String LAYOUT_PARSING_DLQ = "LAYOUTPARSING_DLQ";
|
||||
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "LAYOUTPARSING_FINISHED_EVENT_QUEUE";
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request";
|
||||
public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange";
|
||||
public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
|
||||
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
|
||||
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
|
||||
}
|
||||
|
||||
@ -3,18 +3,45 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import lombok.Builder;
|
||||
|
||||
import lombok.NonNull;
|
||||
|
||||
@Builder
|
||||
@Schema(description = "Object containing all storage paths the service needs to know.")
|
||||
public record LayoutParsingRequest(
|
||||
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
|
||||
@NonNull LayoutParsingType layoutParsingType,
|
||||
|
||||
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
|
||||
Map<String, String> identifier,
|
||||
String originFileStorageId,
|
||||
Optional<String> tablesFileStorageId,
|
||||
Optional<String> imagesFileStorageId,
|
||||
String structureFileStorageId,
|
||||
String textBlockFileStorageId,
|
||||
String positionBlockFileStorageId,
|
||||
String pageFileStorageId) {
|
||||
|
||||
@Schema(description = "Path to the original PDF file.")//
|
||||
@NonNull String originFileStorageId,//
|
||||
|
||||
@Schema(description = "Optional Path to the table extraction file.")//
|
||||
Optional<String> tablesFileStorageId,//
|
||||
@Schema(description = "Optional Path to the image classification file.")//
|
||||
Optional<String> imagesFileStorageId,//
|
||||
|
||||
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
|
||||
|
||||
@Schema(description = "Path where the Document Structure File will be stored.")//
|
||||
@NonNull String structureFileStorageId,//
|
||||
@Schema(description = "Path where the Research Data File will be stored.")//
|
||||
String researchDocumentStorageId,//
|
||||
@Schema(description = "Path where the Document Text File will be stored.")//
|
||||
@NonNull String textBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Positions File will be stored.")//
|
||||
@NonNull String positionBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Pages File will be stored.")//
|
||||
@NonNull String pageFileStorageId,//
|
||||
@Schema(description = "Path where the Document Markdown File will be stored.")//
|
||||
Optional<String> documentMarkdownFileStorageId,//
|
||||
@Schema(description = "Path where the Simplified Text File will be stored.")//
|
||||
@NonNull String simplifiedTextStorageId,//
|
||||
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
|
||||
@NonNull String viewerDocumentStorageId
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
/**
 * Selects which variant of layout parsing is performed for a request.
 * <p>
 * NOTE(review): the behavioural difference between the individual variants is not visible in
 * this file; consult the processing pipeline before adding or reordering constants, since
 * names and ordinals may be relied upon by serialized requests.
 */
public enum LayoutParsingType {

    // RedactManager family
    REDACT_MANAGER,
    REDACT_MANAGER_OLD,
    REDACT_MANAGER_PARAGRAPH_DEBUG,
    REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,

    // DocuMine family
    DOCUMINE,
    DOCUMINE_OLD,

    // Clarifynd family
    CLARIFYND,
    CLARIFYND_PARAGRAPH_DEBUG
}
|
||||
@ -1,10 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.services;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
public interface EntityEnrichmentService {
|
||||
|
||||
void enrichEntity(EntityNode entity, TextBlock textBlock);
|
||||
|
||||
}
|
||||
@ -1,56 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.services;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class EntityInsertionService {
|
||||
|
||||
private final EntityEnrichmentService entityEnrichmentService;
|
||||
|
||||
|
||||
public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) {
|
||||
|
||||
try {
|
||||
SemanticNode containingNode = tableOfContents.streamChildrenNodes(Collections.emptyList())
|
||||
.filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary()))
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
|
||||
|
||||
containingNode.addThisToEntityIfIntersects(entity);
|
||||
|
||||
TextBlock textBlock = entity.getDeepestFullyContainingNode().buildTextBlock();
|
||||
entityEnrichmentService.enrichEntity(entity, textBlock);
|
||||
|
||||
addToPages(entity);
|
||||
addToNodeEntitySets(entity);
|
||||
|
||||
} catch (NoSuchElementException e) {
|
||||
entity.removeFromGraph();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addToPages(EntityNode entity) {
|
||||
|
||||
Set<PageNode> pages = entity.getDeepestFullyContainingNode().getPages();
|
||||
entity.getPages().addAll(pages);
|
||||
pages.forEach(page -> page.getEntities().add(entity));
|
||||
}
|
||||
|
||||
|
||||
private void addToNodeEntitySets(EntityNode entity) {
|
||||
|
||||
entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity));
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,95 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.services;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream().collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
public Supplier<Area> supplier() {
|
||||
|
||||
return Area::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<Area, Rectangle2D> accumulator() {
|
||||
|
||||
return (area, rectangle2D) -> area.add(new Area(rectangle2D));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Area> combiner() {
|
||||
|
||||
return (area1, area2) -> {
|
||||
area1.add(area2);
|
||||
return area1;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Area, Rectangle2D> finisher() {
|
||||
|
||||
return Area::getBounds2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
<Configuration>

    <!-- Single console appender writing to standard out. -->
    <Appenders>
        <Console name="CONSOLE" target="SYSTEM_OUT">
            <!-- time [thread] level logger(max 36 chars) - message -->
            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
        </Console>
    </Appenders>

    <Loggers>
        <!-- Everything logs at warn unless a more specific logger below says otherwise. -->
        <Root level="warn">
            <AppenderRef ref="CONSOLE"/>
        </Root>
        <!-- NOTE(review): only com.iqser is raised to info; packages under com.knecon stay at the
             root level (warn). Confirm this is intended. -->
        <Logger name="com.iqser" level="info"/>
    </Loggers>

</Configuration>
|
||||
@ -0,0 +1,40 @@
|
||||
plugins {
    id("com.knecon.fforesight.java-conventions")
    id("io.freefair.lombok") version "8.4"
}

description = "layoutparser-service-processor"

// Versions shared by more than one artifact are declared once.
val jacksonVersion = "2.15.2"
val pdfBoxVersion = "3.0.0"
val jaiImageioVersion = "1.4.0"
val commonmarkVersion = "0.22.0"

dependencies {
    // Sibling modules of this build
    implementation(project(":layoutparser-service-internal-api"))
    implementation(project(":viewer-doc-processor"))

    // Company libraries
    implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
    implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
        exclude("org.springframework.boot", "spring-boot-starter-security")
        exclude("org.springframework.boot", "spring-boot-starter-validation")
    }
    implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
        // storage-commons is declared explicitly below instead of being taken transitively
        exclude("com.iqser.red.commons", "storage-commons")
    }
    implementation("com.iqser.red.commons:storage-commons:2.50.0")

    // PDF handling, JSON, web
    implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
    implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
    implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
    implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
    implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
    implementation("org.jgrapht:jgrapht-core:1.5.2")

    // Image format support
    implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
    implementation("com.github.jai-imageio:jai-imageio-core:${jaiImageioVersion}")
    implementation("com.github.jai-imageio:jai-imageio-jpeg2000:${jaiImageioVersion}")

    implementation("org.tinspin:tinspin-indexes:2.1.3")

    // Markdown
    implementation("org.commonmark:commonmark:${commonmarkVersion}")
    implementation("org.commonmark:commonmark-ext-gfm-tables:${commonmarkVersion}")

    implementation("com.pdftron:PDFNet:10.11.0")
    implementation("org.apache.commons:commons-text:1.12.0")
}
|
||||
@ -1,129 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Inherits from the layoutparser-service parent; the ${...} version properties used
         below are not defined in this file. -->
    <parent>
        <groupId>com.knecon.fforesight</groupId>
        <artifactId>layoutparser-service</artifactId>
        <version>1.0.0</version>
    </parent>

    <artifactId>layoutparser-service-processor</artifactId>

    <dependencies>
        <!-- Company libraries -->
        <dependency>
            <groupId>com.iqser.red.service</groupId>
            <artifactId>persistence-service-internal-api-v1</artifactId>
            <version>2.36.0</version>
        </dependency>
        <dependency>
            <groupId>com.knecon.fforesight</groupId>
            <artifactId>layoutparser-service-internal-api</artifactId>
            <version>${project.version}</version>
        </dependency>
        <dependency>
            <groupId>com.iqser.red.commons</groupId>
            <artifactId>storage-commons</artifactId>
            <version>1.13.0</version>
        </dependency>
        <dependency>
            <groupId>com.iqser.red.commons</groupId>
            <artifactId>spring-commons</artifactId>
            <version>6.2.0</version>
        </dependency>

        <!-- JSON, PDF and general-purpose libraries -->
        <dependency>
            <groupId>com.dslplatform</groupId>
            <artifactId>dsl-json-java8</artifactId>
            <version>1.10.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>${pdfbox.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox-tools</artifactId>
            <version>${pdfbox.version}</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>31.1-jre</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.module</groupId>
            <artifactId>jackson-module-afterburner</artifactId>
            <version>${jackson.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.datatype</groupId>
            <artifactId>jackson-datatype-jsr310</artifactId>
            <version>${jackson.version}</version>
        </dependency>

        <!-- Spring -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-security</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.cloud</groupId>
            <artifactId>spring-cloud-starter-openfeign</artifactId>
            <version>4.0.2</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-amqp</artifactId>
            <version>${spring.version}</version>
        </dependency>
    </dependencies>

    <!-- Spring milestone repository (snapshots disabled) and snapshot repository (releases disabled). -->
    <repositories>
        <repository>
            <id>spring-milestones</id>
            <name>Spring Milestones</name>
            <url>https://repo.spring.io/milestone</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
        <repository>
            <id>spring-snapshots</id>
            <name>Spring Snapshots</name>
            <url>https://repo.spring.io/snapshot</url>
            <releases>
                <enabled>false</enabled>
            </releases>
        </repository>
    </repositories>

    <!-- The same two repositories, made available for plugin resolution. -->
    <pluginRepositories>
        <pluginRepository>
            <id>spring-milestones</id>
            <name>Spring Milestones</name>
            <url>https://repo.spring.io/milestone</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
        <pluginRepository>
            <id>spring-snapshots</id>
            <name>Spring Snapshots</name>
            <url>https://repo.spring.io/snapshot</url>
            <releases>
                <enabled>false</enabled>
            </releases>
        </pluginRepository>
    </pluginRepositories>

</project>
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Configuration
|
||||
@ConfigurationProperties("layoutparser")
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutParserSettings {
|
||||
|
||||
boolean debug;
|
||||
LayoutParsingType layoutParsingTypeOverride;
|
||||
}
|
||||
@ -0,0 +1,474 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@SuppressWarnings("PMD.CloseResource")
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutParsingPipeline {
|
||||
|
||||
final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
final LayoutParsingStorageService layoutParsingStorageService;
|
||||
final SectionsBuilderService sectionsBuilderService;
|
||||
final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
final RulingCleaningService rulingCleaningService;
|
||||
final TableExtractionService tableExtractionService;
|
||||
final DocuMineBlockificationService docuMineBlockificationService;
|
||||
final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
final BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
final DocstrumBlockificationService docstrumBlockificationService;
|
||||
final LayoutGridService layoutGridService;
|
||||
final ObservationRegistry observationRegistry;
|
||||
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
final GraphicExtractorService graphicExtractorService;
|
||||
final OutlineExtractorService outlineExtractorService;
|
||||
final SectionTreeBuilderService sectionTreeBuilderService;
|
||||
final SectionTreeEnhancementService sectionTreeEnhancementService;
|
||||
final LayoutParserSettings settings;
|
||||
final ClassificationService classificationService;
|
||||
|
||||
@Value("${LAYOUT_PARSER_VERSION:}")
|
||||
private String layoutParserVersion;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
||||
.orElse(new VisualLayoutParsingResponse());
|
||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||
.map(layoutParsingStorageService::getImagesFile)
|
||||
.orElse(new ImageServiceResponse());
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile)
|
||||
.orElse(new TableServiceResponse());
|
||||
|
||||
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
visualLayoutParsingResponse,
|
||||
layoutParsingRequest.identifier());
|
||||
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
|
||||
if (layoutParsingRequest.researchDocumentStorageId() != null) {
|
||||
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
if (!viewerDocumentFile.equals(originFile)) {
|
||||
assert !viewerDocumentFile.exists() || viewerDocumentFile.delete();
|
||||
}
|
||||
assert !originFile.exists() || originFile.delete();
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(documentWithVisualization.document().getNumberOfPages())
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.layoutParserVersion(layoutParserVersion)
|
||||
.build();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
|
||||
|
||||
AtomicReference<DocumentWithVisualization> documentReference = new AtomicReference<>();
|
||||
|
||||
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||
.contextualName("build-document-graph")
|
||||
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
|
||||
|
||||
return documentReference.get();
|
||||
}
|
||||
|
||||
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout")
|
||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||
File originFile,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse,
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse,
|
||||
Map<String, String> identifier) {
|
||||
|
||||
PDDocument originDocument = openDocument(originFile);
|
||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
|
||||
if (settings.isDebug() || identifier.containsKey("debug")) {
|
||||
classificationDocument.getLayoutDebugLayer().setActive(true);
|
||||
}
|
||||
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % 100 == 0) {
|
||||
// re-open document every once in a while to save on RAM. This has no significant performance impact.
|
||||
// This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
|
||||
originDocument.close();
|
||||
originDocument = openDocument(originFile);
|
||||
}
|
||||
|
||||
if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
|
||||
log.info("Extracting text on Page {} for {}", pageNumber, identifier);
|
||||
}
|
||||
|
||||
classificationDocument.setPages(classificationPages);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(originDocument);
|
||||
List<Word> words = stripper.getWords();
|
||||
|
||||
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
|
||||
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
|
||||
words = TextPositionOperations.sortWords(lines);
|
||||
}
|
||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
List<Ruling> rulings = stripper.getRulings();
|
||||
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
|
||||
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
|
||||
ImageType.GRAPHIC,
|
||||
false,
|
||||
stripper.getPageNumber(),
|
||||
""))
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
};
|
||||
|
||||
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
|
||||
|
||||
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
|
||||
|
||||
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
|
||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||
if (pdfImages.containsKey(pageNumber)) {
|
||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
|
||||
if (signatures.containsKey(pageNumber)) {
|
||||
if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
|
||||
classificationPage.setImages(signatures.get(pageNumber));
|
||||
} else {
|
||||
classificationPage.getImages().addAll(signatures.get(pageNumber));
|
||||
}
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
|
||||
classificationPages.add(classificationPage);
|
||||
}
|
||||
|
||||
originDocument.close();
|
||||
|
||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||
|
||||
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
|
||||
classificationDocument.setSectionTree(sectionTree);
|
||||
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||
default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
|
||||
}
|
||||
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
|
||||
private static void updateClassificationPage(PDPage pdPage,
|
||||
PDRectangle pdr,
|
||||
ClassificationPage classificationPage,
|
||||
CleanRulings cleanRulings,
|
||||
int pageNumber,
|
||||
PageInformation pageInformation) {
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth((float) pageInformation.width());
|
||||
classificationPage.setPageHeight((float) pageInformation.height());
|
||||
}
|
||||
|
||||
|
||||
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
|
||||
|
||||
for (TextDirection dir : TextDirection.values()) {
|
||||
double averageRotation = words.stream()
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Character::getTextPosition)
|
||||
.filter(pos -> pos.getDir().equals(dir))
|
||||
.mapToDouble(RedTextPosition::getExactDir).average()
|
||||
.orElse(0);
|
||||
|
||||
if (averageRotation == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
|
||||
|
||||
for (Word word : words) {
|
||||
if (!dir.equals(word.getDir())) {
|
||||
continue;
|
||||
}
|
||||
word.transform(rotateInstance);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||
|
||||
if (observationRegistry.getCurrentObservation() != null) {
|
||||
observationRegistry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||
observationRegistry.getCurrentObservation().highCardinalityKeyValue("fileSize", String.valueOf(size));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private PDDocument openDocument(File originFile) {
|
||||
|
||||
PDDocument document = Loader.loadPDF(originFile);
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
|
||||
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||
return markedContentBboxes;
|
||||
}
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
}
|
||||
|
||||
|
||||
private void buildPageStatistics(ClassificationPage classificationPage) {
|
||||
|
||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
if (((TextPageBlock) textBlock).getWords() == null) {
|
||||
continue;
|
||||
}
|
||||
for (Word word : ((TextPageBlock) textBlock).getWords()) {
|
||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||
classificationPage.getFontCounter().add(word.getFont());
|
||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,88 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class LayoutParsingService {
|
||||
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final PdfParsingService pdfParsingService;
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final DocumentGraphFactory documentGraphFactory;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.pageFileStorageId());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
|
||||
}
|
||||
|
||||
DocumentGraph documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
originDocument.close();
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(numberOfPages)
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
classificationService.classifyDocument(classificationDocument);
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return documentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
|
||||
@Configuration
|
||||
@ComponentScan
|
||||
public class LayoutParsingServiceProcessorConfiguration {
|
||||
|
||||
@Bean
|
||||
@Autowired
|
||||
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
||||
|
||||
return new PDFTronViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,30 +1,36 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.core.task.TaskExecutor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -35,67 +41,104 @@ public class LayoutParsingStorageService {
|
||||
private final StorageService storageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
private final TaskExecutor taskExecutor;
|
||||
|
||||
public PDDocument getOriginFile(String storageId) throws IOException {
|
||||
|
||||
try (var originDocumentInputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
|
||||
File tempFile = createTempFile("document", ".pdf");
|
||||
try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
|
||||
IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
|
||||
}
|
||||
return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L));
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||
public File getOriginFile(String storageId) throws IOException {
|
||||
|
||||
File tempFile = createTempFile("document", ".pdf");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file")
|
||||
public Optional<File> getViewerDocFile(String storageId) throws IOException {
|
||||
|
||||
if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
|
||||
return Optional.empty();
|
||||
}
|
||||
File tempFile = createTempFile("viewerDocument", ".pdf");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
|
||||
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
|
||||
assert tempFile.delete();
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(tempFile);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ImageServiceResponse getImagesFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
|
||||
ImageServiceResponse imageServiceResponse = objectMapper.readValue(inputStream, ImageServiceResponse.class);
|
||||
inputStream.close();
|
||||
return imageServiceResponse;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
|
||||
@SneakyThrows
|
||||
public TableServiceResponse getTablesFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
|
||||
try (var tableClassificationStream = getObject(storageId)) {
|
||||
|
||||
return objectMapper.readValue(inputStream, ImageServiceResponse.class);
|
||||
TableServiceResponse tableServiceResponse = objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
|
||||
tableClassificationStream.close();
|
||||
return tableServiceResponse;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
||||
|
||||
try (var tableClassificationStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
|
||||
|
||||
return objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
|
||||
@SneakyThrows
|
||||
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException {
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getTableOfContents());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
|
||||
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
documentData.getDocumentStructure());
|
||||
|
||||
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
documentData.getDocumentTextData());
|
||||
|
||||
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
documentData.getDocumentPositionData());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
|
||||
|
||||
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
documentData.getDocumentPages());
|
||||
|
||||
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
|
||||
|
||||
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
|
||||
}
|
||||
|
||||
|
||||
public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||
|
||||
PageData[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), PageData[].class);
|
||||
AtomicTextBlockData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
AtomicTextBlockData[].class);
|
||||
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
AtomicPositionBlockData[].class);
|
||||
TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
TableOfContentsData.class);
|
||||
|
||||
return DocumentData.builder()
|
||||
.tableOfContents(tableOfContentsData)
|
||||
.atomicPositionBlocks(atomicPositionBlockData)
|
||||
.atomicTextBlocks(atomicTextBlockData)
|
||||
.pages(pageData)
|
||||
.build();
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||
}
|
||||
|
||||
|
||||
@ -123,4 +166,43 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text")
|
||||
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private InputStream getObject(String storageId) {
|
||||
|
||||
File tempFile = File.createTempFile("temp", ".data");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
Path path = Paths.get(tempFile.getPath());
|
||||
return Files.newInputStream(path, StandardOpenOption.DELETE_ON_CLOSE);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document")
|
||||
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
|
||||
|
||||
try (var in = new FileInputStream(out)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
|
||||
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {
|
||||
|
||||
try (InputStream inputStream = new ByteArrayInputStream(markdownContent.getBytes(StandardCharsets.UTF_8))) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, inputStream);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,49 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class CvTableParsingAdapter {
|
||||
|
||||
public Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {
|
||||
|
||||
Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> tableCells = new HashMap<>();
|
||||
tableServiceResponse.getData()
|
||||
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>())
|
||||
.addAll(convertTableCells(tableData.getTableCells())));
|
||||
|
||||
return tableCells;
|
||||
}
|
||||
|
||||
|
||||
private Collection<? extends com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> convertTableCells(List<CvParsedTableCell> tableCells) {
|
||||
|
||||
List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> cvParsedTableCells = new ArrayList<>();
|
||||
|
||||
tableCells.forEach(t -> cvParsedTableCells.add(com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder()
|
||||
.y0(t.getY0())
|
||||
.x1(t.getX1())
|
||||
.y1(t.getY1())
|
||||
.x0(t.getX0())
|
||||
.width(t.getWidth())
|
||||
.height(t.getHeight())
|
||||
.build()));
|
||||
|
||||
return cvParsedTableCells;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,17 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class Classification {
|
||||
|
||||
private Map<String, Float> probabilities = new HashMap<>();
|
||||
private String label;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class FilterGeometry {
|
||||
|
||||
private ImageSize imageSize;
|
||||
private Format imageFormat;
|
||||
|
||||
}
|
||||
@ -1,15 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class Filters {
|
||||
|
||||
private FilterGeometry geometry;
|
||||
private Probability probability;
|
||||
private boolean allPassed;
|
||||
|
||||
}
|
||||
@ -1,15 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class Format {
|
||||
|
||||
private float quotient;
|
||||
private boolean tooTall;
|
||||
private boolean tooWide;
|
||||
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class Geometry {
|
||||
|
||||
private float width;
|
||||
private float height;
|
||||
|
||||
}
|
||||
@ -1,33 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonAlias;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class ImageServiceResponse {
|
||||
|
||||
private String dossierId;
|
||||
private String fileId;
|
||||
|
||||
@JsonProperty(value = "imageMetadata")
|
||||
@JsonAlias("data")
|
||||
@JsonAttribute(alternativeNames = {"imageMetadata"})
|
||||
private List<Metadata> data = new ArrayList<>();
|
||||
|
||||
private List<Metadata> dataCV = new ArrayList<>();
|
||||
|
||||
|
||||
@JsonProperty(value = "imageMetadata")
|
||||
@JsonAlias("data")
|
||||
@JsonAttribute(alternativeNames = {"imageMetadata"})
|
||||
public void setData(List<Metadata> data) {this.data = data;}
|
||||
|
||||
}
|
||||
@ -1,15 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class ImageSize {
|
||||
|
||||
private float quotient;
|
||||
private boolean tooLarge;
|
||||
private boolean tooSmall;
|
||||
|
||||
}
|
||||
@ -1,17 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class Metadata {
|
||||
|
||||
private Classification classification;
|
||||
private Position position;
|
||||
private Geometry geometry;
|
||||
private Filters filters;
|
||||
private boolean alpha;
|
||||
|
||||
}
|
||||
@ -1,17 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class Position {
|
||||
|
||||
private float x1;
|
||||
private float x2;
|
||||
private float y1;
|
||||
private float y2;
|
||||
private int pageNumber;
|
||||
|
||||
}
|
||||
@ -1,13 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class Probability {
|
||||
|
||||
private boolean unconfident;
|
||||
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class CvParsedPageInfo {
|
||||
|
||||
private int number;
|
||||
private int rotation;
|
||||
private float width;
|
||||
private float height;
|
||||
|
||||
}
|
||||
@ -1,18 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class CvParsedTableCell {
|
||||
|
||||
private float x0;
|
||||
private float y0;
|
||||
private float x1;
|
||||
private float y1;
|
||||
private float width;
|
||||
private float height;
|
||||
|
||||
}
|
||||
@ -1,17 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class CvParsedTableModel {
|
||||
|
||||
private CvParsedPageInfo pageInfo;
|
||||
private List<CvParsedTableCell> tableCells = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,22 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@CompiledJson
|
||||
public class TableServiceResponse {
|
||||
|
||||
private String dossierId;
|
||||
private String fileId;
|
||||
private String operation;
|
||||
private String targetFileExtension;
|
||||
private String responseFileExtension;
|
||||
|
||||
private List<CvParsedTableModel> data = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,71 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
public abstract class AbstractTextContainer {
|
||||
|
||||
protected float minX;
|
||||
protected float maxX;
|
||||
protected float minY;
|
||||
protected float maxY;
|
||||
protected String classification;
|
||||
protected int page;
|
||||
|
||||
private TextBlockOrientation orientation = TextBlockOrientation.NONE;
|
||||
|
||||
|
||||
public abstract String getText();
|
||||
|
||||
|
||||
public boolean containsBlock(ClassificationTextBlock other) {
|
||||
|
||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(AbstractTextContainer other) {
|
||||
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D other) {
|
||||
|
||||
return other.contains(minX, minY, getWidth(), getHeight());
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getHeight() {
|
||||
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getWidth() {
|
||||
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractTextContainer atc) {
|
||||
|
||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class ClassificationFooter {
|
||||
|
||||
private List<ClassificationTextBlock> textBlocks;
|
||||
|
||||
}
|
||||
@ -1,16 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class ClassificationHeader {
|
||||
|
||||
private List<ClassificationTextBlock> textBlocks;
|
||||
|
||||
}
|
||||
@ -1,38 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ClassificationSection implements Comparable {
|
||||
|
||||
private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
private String headline;
|
||||
|
||||
|
||||
public List<Table> getTables() {
|
||||
|
||||
List<Table> tables = new ArrayList<>();
|
||||
pageBlocks.forEach(block -> {
|
||||
if (block instanceof Table) {
|
||||
tables.add((Table) block);
|
||||
}
|
||||
});
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Object o) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -1,77 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
/**
 * Counts how often each float value has been added and answers simple queries on the
 * resulting histogram. Not thread-safe (backed by a plain {@link HashMap}).
 */
public class FloatFrequencyCounter {

    Map<Float, Integer> countPerValue = new HashMap<>();


    /**
     * @return the live, mutable map from value to its occurrence count
     */
    public Map<Float, Integer> getCountPerValue() {

        return countPerValue;
    }


    /**
     * Registers one occurrence of {@code value}.
     *
     * @param value value to count
     */
    public void add(float value) {

        countPerValue.merge(value, 1, Integer::sum);
    }


    /**
     * Adds all counts of another histogram to this one.
     *
     * @param otherCounter map from value to occurrence count
     * @throws NullPointerException if a count in {@code otherCounter} is {@code null}
     */
    public void addAll(Map<Float, Integer> otherCounter) {

        for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
            countPerValue.merge(entry.getKey(), entry.getValue(), Integer::sum);
        }
    }


    /**
     * Finds the value with the highest count. On ties the last candidate in the map's
     * iteration order wins, so the tie-break is not a defined ordering.
     *
     * @return the most frequent value, or {@code null} if nothing has been counted
     */
    public Float getMostPopular() {

        Map.Entry<Float, Integer> mostPopular = null;
        for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
            if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
                mostPopular = entry;
            }
        }
        return mostPopular != null ? mostPopular.getKey() : null;
    }


    /**
     * Returns all counted values that are strictly greater than the most popular value.
     *
     * @return those values in descending order; empty if nothing has been counted
     */
    public List<Float> getHighterThanMostPopular() {

        Float mostPopular = getMostPopular();
        if (mostPopular == null) {
            // Nothing counted yet: there is no reference value to compare against.
            return new ArrayList<>();
        }

        return countPerValue.keySet()
                .stream()
                .filter(value -> value > mostPopular)
                .sorted(Collections.reverseOrder())
                .collect(Collectors.toList());
    }


    /**
     * @return the largest counted value, or {@code null} if nothing has been counted
     */
    public Float getHighest() {

        Float highest = null;
        for (Float value : countPerValue.keySet()) {
            if (highest == null || value > highest) {
                highest = value;
            }
        }
        return highest;
    }

}
|
||||
@ -1,218 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
public class Rectangle extends Rectangle2D.Float {
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
/**
|
||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
||||
* <p>
|
||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
||||
*
|
||||
* @deprecated with no replacement
|
||||
*/
|
||||
@Deprecated
|
||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
||||
@Override
|
||||
public int compare(Rectangle o1, Rectangle o2) {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
|
||||
} else {
|
||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public Rectangle() {
|
||||
|
||||
super();
|
||||
}
|
||||
|
||||
|
||||
public Rectangle(float top, float left, float width, float height) {
|
||||
|
||||
super();
|
||||
this.setRect(left, top, width, height);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param rectangles
|
||||
* @return minimum bounding box that contains all the rectangles
|
||||
*/
|
||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
||||
|
||||
float minx = java.lang.Float.MAX_VALUE;
|
||||
float miny = java.lang.Float.MAX_VALUE;
|
||||
float maxx = java.lang.Float.MIN_VALUE;
|
||||
float maxy = java.lang.Float.MIN_VALUE;
|
||||
|
||||
for (Rectangle r : rectangles) {
|
||||
minx = (float) Math.min(r.getMinX(), minx);
|
||||
miny = (float) Math.min(r.getMinY(), miny);
|
||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
||||
}
|
||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
||||
}
|
||||
|
||||
|
||||
public int compareTo(Rectangle other) {
|
||||
|
||||
return ILL_DEFINED_ORDER.compare(this, other);
|
||||
}
|
||||
|
||||
|
||||
// I'm bad at Java and need this for fancy sorting in
|
||||
// technology.tabula.TextChunk.
|
||||
public int isLtrDominant() {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
public float getArea() {
|
||||
|
||||
return this.width * this.height;
|
||||
}
|
||||
|
||||
|
||||
public float verticalOverlap(Rectangle other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
}
|
||||
|
||||
|
||||
public boolean verticallyOverlaps(Rectangle other) {
|
||||
|
||||
return verticalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
|
||||
public float horizontalOverlap(Rectangle other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
}
|
||||
|
||||
|
||||
public boolean horizontallyOverlaps(Rectangle other) {
|
||||
|
||||
return horizontalOverlap(other) > 0;
|
||||
}
|
||||
|
||||
|
||||
public float verticalOverlapRatio(Rectangle other) {
|
||||
|
||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
||||
|
||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - this.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - other.getTop()) / delta;
|
||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
||||
rv = (other.getBottom() - other.getTop()) / delta;
|
||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
||||
rv = (this.getBottom() - this.getTop()) / delta;
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public float overlapRatio(Rectangle other) {
|
||||
|
||||
double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
||||
double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
||||
|
||||
return (float) (intersectionArea / unionArea);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle merge(Rectangle other) {
|
||||
|
||||
this.setRect(this.createUnion(other));
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public float getTop() {
|
||||
|
||||
return (float) this.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public void setTop(float top) {
|
||||
|
||||
float deltaHeight = top - this.y;
|
||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
||||
}
|
||||
|
||||
|
||||
public float getRight() {
|
||||
|
||||
return (float) this.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public void setRight(float right) {
|
||||
|
||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
||||
}
|
||||
|
||||
|
||||
public float getLeft() {
|
||||
|
||||
return (float) this.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public void setLeft(float left) {
|
||||
|
||||
float deltaWidth = left - this.x;
|
||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
||||
}
|
||||
|
||||
|
||||
public float getBottom() {
|
||||
|
||||
return (float) this.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public void setBottom(float bottom) {
|
||||
|
||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
||||
}
|
||||
|
||||
|
||||
public Point2D[] getPoints() {
|
||||
|
||||
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
|
||||
this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String s = super.toString();
|
||||
sb.append(s.substring(0, s.length() - 1));
|
||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,25 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
public class ClassifiedImage {
|
||||
|
||||
@NonNull
|
||||
private Rectangle2D position;
|
||||
@NonNull
|
||||
private ImageType imageType;
|
||||
private boolean isAppendedToSection;
|
||||
@NonNull
|
||||
private boolean hasTransparency;
|
||||
@NonNull
|
||||
private int page;
|
||||
|
||||
}
|
||||
@ -1,15 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class CleanRulings {
|
||||
|
||||
List<Ruling> horizontal;
|
||||
List<Ruling> vertical;
|
||||
|
||||
}
|
||||
@ -1,437 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Formatter;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@SuppressWarnings("all")
|
||||
public class Ruling extends Line2D.Float {
|
||||
|
||||
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
|
||||
|
||||
|
||||
public Ruling(Point2D p1, Point2D p2) {
|
||||
|
||||
super(p1, p2);
|
||||
}
|
||||
|
||||
|
||||
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
|
||||
|
||||
ArrayList<Ruling> rv = new ArrayList<>();
|
||||
for (Ruling r : rulings) {
|
||||
if (r.intersects(area)) {
|
||||
rv.add(r.intersect(area));
|
||||
}
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
||||
// log(n) implementation of find_intersections
|
||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
class SortObject {
|
||||
|
||||
protected SOType type;
|
||||
protected float position;
|
||||
protected Ruling ruling;
|
||||
|
||||
|
||||
public SortObject(SOType type, float position, Ruling ruling) {
|
||||
|
||||
this.type = type;
|
||||
this.position = position;
|
||||
this.ruling = ruling;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
List<SortObject> sos = new ArrayList<>();
|
||||
|
||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
||||
@Override
|
||||
public int compare(Ruling o1, Ruling o2) {
|
||||
|
||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
||||
}
|
||||
});
|
||||
|
||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
||||
@Override
|
||||
public int compare(Point2D o1, Point2D o2) {
|
||||
|
||||
if (o1.getY() > o2.getY()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getY() < o2.getY()) {
|
||||
return -1;
|
||||
}
|
||||
if (o1.getX() > o2.getX()) {
|
||||
return 1;
|
||||
}
|
||||
if (o1.getX() < o2.getX()) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
});
|
||||
|
||||
for (Ruling h : horizontals) {
|
||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
|
||||
}
|
||||
|
||||
for (Ruling v : verticals) {
|
||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
||||
}
|
||||
|
||||
Collections.sort(sos, new Comparator<SortObject>() {
|
||||
@Override
|
||||
public int compare(SortObject a, SortObject b) {
|
||||
|
||||
int rv;
|
||||
if (DoubleComparisons.feq(a.position, b.position)) {
|
||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
||||
rv = 1;
|
||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
||||
rv = -1;
|
||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
||||
rv = 1;
|
||||
} else {
|
||||
rv = java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
} else {
|
||||
return java.lang.Double.compare(a.position, b.position);
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
});
|
||||
|
||||
for (SortObject so : sos) {
|
||||
switch (so.type) {
|
||||
case VERTICAL:
|
||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
||||
try {
|
||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
||||
if (i == null) {
|
||||
continue;
|
||||
}
|
||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.info("Some line are oblique, ignoring...");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case HRIGHT:
|
||||
tree.remove(so.ruling);
|
||||
break;
|
||||
case HLEFT:
|
||||
tree.put(so.ruling, true);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return rv;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean vertical() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
public boolean horizontal() {
|
||||
|
||||
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||
}
|
||||
|
||||
// attributes that make sense only for non-oblique lines
|
||||
// these are used to have a single collapse method (in page, currently)
|
||||
|
||||
|
||||
public boolean oblique() {
|
||||
|
||||
return !(this.vertical() || this.horizontal());
|
||||
}
|
||||
|
||||
|
||||
public float getPosition() {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getLeft() : this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public float getStart() {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getTop() : this.getLeft();
|
||||
}
|
||||
|
||||
|
||||
public void setStart(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
this.setTop(v);
|
||||
} else {
|
||||
this.setLeft(v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public float getEnd() {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
return this.vertical() ? this.getBottom() : this.getRight();
|
||||
}
|
||||
|
||||
|
||||
public void setEnd(float v) {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
this.setBottom(v);
|
||||
} else {
|
||||
this.setRight(v);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void setStartEnd(float start, float end) {
|
||||
|
||||
if (this.oblique()) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
if (this.vertical()) {
|
||||
this.setTop(start);
|
||||
this.setBottom(end);
|
||||
} else {
|
||||
this.setLeft(start);
|
||||
this.setRight(end);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public boolean perpendicularTo(Ruling other) {
|
||||
|
||||
return this.vertical() == other.horizontal();
|
||||
}
|
||||
|
||||
|
||||
public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
|
||||
|
||||
if (this.intersectsLine(another)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean rv = false;
|
||||
|
||||
if (this.perpendicularTo(another)) {
|
||||
rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
|
||||
} else {
|
||||
rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
||||
public double length() {
|
||||
|
||||
return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2));
|
||||
}
|
||||
|
||||
|
||||
public Ruling intersect(Rectangle2D clip) {
|
||||
|
||||
Float clipee = (Float) this.clone();
|
||||
boolean clipped = new CohenSutherlandClipping(clip).clip(clipee);
|
||||
|
||||
if (clipped) {
|
||||
return new Ruling(clipee.getP1(), clipee.getP2());
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public Ruling expand(float amount) {
|
||||
|
||||
Ruling r = (Ruling) this.clone();
|
||||
try {
|
||||
r.setStart(this.getStart() - amount);
|
||||
r.setEnd(this.getEnd() + amount);
|
||||
} catch (UnsupportedOperationException e) {
|
||||
log.warn("Could not expand ruling!");
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
public Point2D intersectionPoint(Ruling other) {
|
||||
|
||||
Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
|
||||
Ruling horizontal, vertical;
|
||||
|
||||
if (!this_l.intersectsLine(other_l)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (this_l.horizontal() && other_l.vertical()) {
|
||||
horizontal = this_l;
|
||||
vertical = other_l;
|
||||
} else if (this_l.vertical() && other_l.horizontal()) {
|
||||
vertical = this_l;
|
||||
horizontal = other_l;
|
||||
} else {
|
||||
log.warn("lines must be orthogonal, vertical and horizontal");
|
||||
return null;
|
||||
}
|
||||
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
|
||||
if (this == other) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!(other instanceof Ruling)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Ruling o = (Ruling) other;
|
||||
return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return super.hashCode();
|
||||
}
|
||||
|
||||
|
||||
public float getTop() {
|
||||
|
||||
return this.y1;
|
||||
}
|
||||
|
||||
|
||||
public void setTop(float v) {
|
||||
|
||||
setLine(this.getLeft(), v, this.getRight(), this.getBottom());
|
||||
}
|
||||
|
||||
|
||||
public float getLeft() {
|
||||
|
||||
return this.x1;
|
||||
}
|
||||
|
||||
|
||||
public void setLeft(float v) {
|
||||
|
||||
setLine(v, this.getTop(), this.getRight(), this.getBottom());
|
||||
}
|
||||
|
||||
|
||||
public float getBottom() {
|
||||
|
||||
return this.y2;
|
||||
}
|
||||
|
||||
|
||||
public void setBottom(float v) {
|
||||
|
||||
setLine(this.getLeft(), this.getTop(), this.getRight(), v);
|
||||
}
|
||||
|
||||
|
||||
public float getRight() {
|
||||
|
||||
return this.x2;
|
||||
}
|
||||
|
||||
|
||||
public void setRight(float v) {
|
||||
|
||||
setLine(this.getLeft(), this.getTop(), v, this.getBottom());
|
||||
}
|
||||
|
||||
|
||||
public float getWidth() {
|
||||
|
||||
return this.getRight() - this.getLeft();
|
||||
}
|
||||
|
||||
|
||||
public float getHeight() {
|
||||
|
||||
return this.getBottom() - this.getTop();
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
double angle = Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX()));
|
||||
|
||||
if (angle < 0) {
|
||||
angle += 360;
|
||||
}
|
||||
return angle;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Formatter formatter = new Formatter(sb);
|
||||
String rv = formatter.format("%s[minX=%f minY=%f maxX=%f maxY=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
|
||||
formatter.close();
|
||||
return rv;
|
||||
}
|
||||
|
||||
|
||||
private enum SOType {
|
||||
VERTICAL,
|
||||
HRIGHT,
|
||||
HLEFT
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,350 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class Table extends AbstractTextContainer {
|
||||
|
||||
private final TreeMap<TableCellPosition, TableCell> cells = new TreeMap<>();
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@Setter
|
||||
private String headline;
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private int rowCount = -1;
|
||||
private int colCount = -1;
|
||||
private List<List<TableCell>> rows;
|
||||
|
||||
|
||||
public Table(List<TableCell> cells, Rectangle area, int rotation) {
|
||||
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
maxX = area.getRight();
|
||||
maxY = area.getTop();
|
||||
classification = "Table";
|
||||
this.rotation = rotation;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public List<List<TableCell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
|
||||
// Ignore rows that does not contain any cells and values.
|
||||
List<List<TableCell>> rowsToRemove = new ArrayList<>();
|
||||
for (List<TableCell> row : rows) {
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
|
||||
rowsToRemove.add(row);
|
||||
}
|
||||
}
|
||||
rows.removeAll(rowsToRemove);
|
||||
|
||||
computeHeaders();
|
||||
}
|
||||
|
||||
return rows;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public int getRowCount() {
|
||||
|
||||
if (rowCount == -1) {
|
||||
rowCount = getRows().size();
|
||||
}
|
||||
return rowCount;
|
||||
}
|
||||
|
||||
|
||||
public int getColCount() {
|
||||
|
||||
if (colCount == -1) {
|
||||
colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
|
||||
}
|
||||
return colCount;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Detect header cells (either first row or first column):
|
||||
* Column is marked as header if cell text is bold and row cell text is not bold.
|
||||
* Defaults to row.
|
||||
*/
|
||||
private void computeHeaders() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
}
|
||||
// A bold cell is a header cell as long as every cell to the left/top is bold, too
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<TableCell> rowCells = rows.get(rowIndex);
|
||||
if (rowCells.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
TableCell cell = rowCells.get(colIndex);
|
||||
List<TableCell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
TableCell lastHeaderCell = null;
|
||||
for (TableCell leftCell : cellsToTheLeft) {
|
||||
if (leftCell.isHeaderCell()) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
List<TableCell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
}
|
||||
for (TableCell topCell : cellsToTheTop) {
|
||||
if (topCell.isHeaderCell()) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<List<TableCell>> computeRows() {
|
||||
|
||||
List<List<TableCell>> rows = new ArrayList<>();
|
||||
if (rotation == 90) {
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<TableCell> lastRow = new ArrayList<>();
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
TableCell cell = cells.get(new TableCellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
}
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else if (rotation == 270) {
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<TableCell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||
TableCell cell = cells.get(new TableCellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
}
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||
List<TableCell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedColCount; j++) {
|
||||
TableCell cell = cells.get(new TableCellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
}
|
||||
rows.add(lastRow);
|
||||
}
|
||||
}
|
||||
|
||||
return rows;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void add(TableCell chunk, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
TableCellPosition cp = new TableCellPosition(row, col);
|
||||
cells.put(cp, chunk);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addCells(List<TableCell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
|
||||
|
||||
List<List<TableCell>> rowsOfCells = calculateStructure(cells);
|
||||
|
||||
for (int i = 0; i < rowsOfCells.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCells.get(i).size(); j++) {
|
||||
add(rowsOfCells.get(i).get(j), i, j);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
*
|
||||
* @param cells The found cells
|
||||
* @return Table Structure
|
||||
*/
|
||||
private List<List<TableCell>> calculateStructure(List<TableCell> cells) {
|
||||
|
||||
List<List<TableCell>> matrix = new ArrayList<>();
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return matrix;
|
||||
}
|
||||
|
||||
Set<Float> uniqueX = new HashSet<>();
|
||||
Set<Float> uniqueY = new HashSet<>();
|
||||
cells.stream().filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3).forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList());
|
||||
var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList());
|
||||
|
||||
Float prevY = null;
|
||||
for (Float y : sortedUniqueY) {
|
||||
|
||||
List<TableCell> row = new ArrayList<>();
|
||||
|
||||
Float prevX = null;
|
||||
for (Float x : sortedUniqueX) {
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
|
||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
||||
if (intersectionCell.isPresent()) {
|
||||
cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks());
|
||||
}
|
||||
row.add(cell);
|
||||
}
|
||||
prevX = x;
|
||||
}
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
matrix.add(row);
|
||||
}
|
||||
prevY = y;
|
||||
}
|
||||
|
||||
Collections.reverse(matrix);
|
||||
|
||||
return matrix;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
List<List<TableCell>> rows = getRows();
|
||||
|
||||
int i = 0;
|
||||
for (List<TableCell> row : rows) {
|
||||
if (i != 0) {
|
||||
sb.append("\n");
|
||||
}
|
||||
if (!row.isEmpty()) {
|
||||
boolean firstColumn = true;
|
||||
for (TableCell column : row) {
|
||||
if (!firstColumn) {
|
||||
sb.append(",");
|
||||
}
|
||||
if (column != null && column.getTextBlocks() != null) {
|
||||
boolean first = true;
|
||||
for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append('\"').append(textBlock.getText().replaceAll("\"", "\\\"")).append('\"');
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
firstColumn = false;
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
public String getTextAsHtml() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
List<List<TableCell>> rows = getRows();
|
||||
|
||||
sb.append("<table border=\"1\">");
|
||||
int i = 0;
|
||||
for (List<TableCell> row : rows) {
|
||||
sb.append("\n<tr>");
|
||||
if (!row.isEmpty()) {
|
||||
for (TableCell column : row) {
|
||||
sb.append(i == 0 ? "\n<th>" : "\n<td>");
|
||||
if (column != null && column.getTextBlocks() != null) {
|
||||
boolean first = true;
|
||||
for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
|
||||
if (!first) {
|
||||
sb.append("<br />");
|
||||
}
|
||||
sb.append(textBlock.getText().replaceAll("\\n", "<br />"));
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
sb.append(i == 0 ? "</th>" : "</td>");
|
||||
}
|
||||
}
|
||||
sb.append("</tr>");
|
||||
i++;
|
||||
}
|
||||
sb.append("</table>");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,38 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@NoArgsConstructor
|
||||
public class TableCell extends Rectangle {
|
||||
|
||||
private List<ClassificationTextBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
private List<TableCell> headerCells = new ArrayList<>();
|
||||
|
||||
private boolean isHeaderCell;
|
||||
|
||||
|
||||
public TableCell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlock(ClassificationTextBlock textBlock) {
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,286 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class ClassificationTextBlock extends AbstractTextContainer {
|
||||
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
private int rotation;
|
||||
|
||||
private int indexOnPage;
|
||||
|
||||
private String mostPopularWordFont;
|
||||
|
||||
private String mostPopularWordStyle;
|
||||
|
||||
private float mostPopularWordFontSize;
|
||||
|
||||
private float mostPopularWordHeight;
|
||||
|
||||
private float mostPopularWordSpaceWidth;
|
||||
|
||||
private float highestFontSize;
|
||||
|
||||
private String classification;
|
||||
|
||||
|
||||
public TextDirection getDir() {
|
||||
|
||||
return sequences.get(0).getDir();
|
||||
}
|
||||
|
||||
private float getPageHeight() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
private float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minX value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMinX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - maxX;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
|
||||
return getPageWidth() - maxY;
|
||||
} else {
|
||||
return minX;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the maxX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxX value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMaxX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - minX;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageWidth() - minY;
|
||||
|
||||
} else {
|
||||
return maxX;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minY value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMinY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return maxY;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - maxX;
|
||||
|
||||
} else {
|
||||
return getPageHeight() - maxY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxY value in pdf coordinate system
|
||||
*/
|
||||
public float getPdfMaxY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - minX;
|
||||
} else {
|
||||
return getPageHeight() - minY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation, int indexOnPage) {
|
||||
super();
|
||||
this.indexOnPage = indexOnPage;
|
||||
super.minX = minX;
|
||||
super.maxX = maxX;
|
||||
super.minY = minY;
|
||||
super.maxY = maxY;
|
||||
this.sequences = sequences;
|
||||
this.rotation = rotation;
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock union(TextPositionSequence r) {
|
||||
|
||||
ClassificationTextBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock union(ClassificationTextBlock r) {
|
||||
|
||||
ClassificationTextBlock union = this.copy();
|
||||
union.add(r);
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public void add(ClassificationTextBlock r) {
|
||||
|
||||
if (r.getMinX() < minX) {
|
||||
minX = r.getMinX();
|
||||
}
|
||||
if (r.getMaxX() > maxX) {
|
||||
maxX = r.getMaxX();
|
||||
}
|
||||
if (r.getMinY() < minY) {
|
||||
minY = r.getMinY();
|
||||
}
|
||||
if (r.getMaxY() > maxY) {
|
||||
maxY = r.getMaxY();
|
||||
}
|
||||
sequences.addAll(r.getSequences());
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPositionSequence r) {
|
||||
|
||||
if (r.getMinXDirAdj() < minX) {
|
||||
minX = r.getMinXDirAdj();
|
||||
}
|
||||
if (r.getMaxXDirAdj() > maxX) {
|
||||
maxX = r.getMaxXDirAdj();
|
||||
}
|
||||
if (r.getMinYDirAdj() < minY) {
|
||||
minY = r.getMinYDirAdj();
|
||||
}
|
||||
if (r.getMaxYDirAdj() > maxY) {
|
||||
maxY = r.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public ClassificationTextBlock copy() {
|
||||
|
||||
return new ClassificationTextBlock(minX, maxX, minY, maxY, sequences, rotation, indexOnPage);
|
||||
}
|
||||
|
||||
|
||||
public void resize(float x1, float y1, float width, float height) {
|
||||
|
||||
set(x1, y1, x1 + width, y1 + height);
|
||||
}
|
||||
|
||||
|
||||
public void set(float x1, float y1, float x2, float y2) {
|
||||
|
||||
this.minX = Math.min(x1, x2);
|
||||
this.maxX = Math.max(x1, x2);
|
||||
this.minY = Math.min(y1, y2);
|
||||
this.maxY = Math.max(y1, y2);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
||||
for (int i = 0; i < sequences.size(); i++) {
|
||||
String sequenceAsString = sequences.get(i).toString();
|
||||
// Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
|
||||
if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
|
||||
builder.append(' ');
|
||||
}
|
||||
builder.append(sequenceAsString);
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
TextPositionSequence previous = null;
|
||||
for (TextPositionSequence word : sequences) {
|
||||
if (previous != null) {
|
||||
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
|
||||
sb.append('\n');
|
||||
} else {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
sb.append(word.toString());
|
||||
previous = word;
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
|
||||
|
||||
}
|
||||
}
|
||||
@ -1,106 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
|
||||
import com.dslplatform.json.CompiledJson;
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@CompiledJson
|
||||
public class RedTextPosition {
|
||||
|
||||
private String textMatrix;
|
||||
private float[] position;
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
|
||||
@JsonIgnore
|
||||
private float pageHeight;
|
||||
|
||||
@JsonIgnore
|
||||
private float pageWidth;
|
||||
|
||||
private String unicode;
|
||||
|
||||
@JsonIgnore
|
||||
private float dir;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
private float widthOfSpace;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
private float fontSizeInPt;
|
||||
|
||||
// not used in reanalysis
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
private String fontName;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
|
||||
var pos = new RedTextPosition();
|
||||
BeanUtils.copyProperties(textPosition, pos);
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
|
||||
pos.setTextMatrix(textPosition.getTextMatrix().toString());
|
||||
|
||||
var position = new float[4];
|
||||
|
||||
position[0] = textPosition.getXDirAdj();
|
||||
position[1] = textPosition.getYDirAdj();
|
||||
position[2] = textPosition.getWidthDirAdj();
|
||||
position[3] = textPosition.getHeightDir();
|
||||
|
||||
pos.setPosition(position);
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getXDirAdj() {
|
||||
|
||||
return position[0];
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getYDirAdj() {
|
||||
|
||||
return position[1];
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidthDirAdj() {
|
||||
|
||||
return position[2];
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeightDir() {
|
||||
|
||||
return position[3];
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,8 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
/**
 * Orientation tag for a text block: {@code NONE}, {@code LEFT} or {@code RIGHT}.
 * The declaration order of the constants is part of the contract (ordinals).
 */
public enum TextBlockOrientation {
    NONE, LEFT, RIGHT
}
|
||||
@ -1,298 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
private int page;
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private TextDirection dir;
|
||||
private int rotation;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
this.page = page;
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
|
||||
return textPositions.size();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public char charAt(int index) {
|
||||
|
||||
RedTextPosition textPosition = textPositionAt(index);
|
||||
String text = textPosition.getUnicode();
|
||||
return text.charAt(0);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TextPositionSequence subSequence(int start, int end) {
|
||||
|
||||
var textPositionSequence = new TextPositionSequence();
|
||||
textPositionSequence.textPositions = textPositions.subList(start, end);
|
||||
textPositionSequence.page = page;
|
||||
textPositionSequence.dir = dir;
|
||||
textPositionSequence.rotation = rotation;
|
||||
textPositionSequence.pageHeight = pageHeight;
|
||||
textPositionSequence.pageWidth = pageWidth;
|
||||
|
||||
return textPositionSequence;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
StringBuilder builder = new StringBuilder(length());
|
||||
for (int i = 0; i < length(); i++) {
|
||||
builder.append(charAt(i));
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
|
||||
public RedTextPosition textPositionAt(int index) {
|
||||
|
||||
return textPositions.get(index);
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(textPosition);
|
||||
this.page = textPositionSequence.getPage();
|
||||
this.dir = textPositionSequence.getDir();
|
||||
this.rotation = textPositionSequence.getRotation();
|
||||
this.pageHeight = textPositionSequence.getPageHeight();
|
||||
this.pageWidth = textPositionSequence.getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted minX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMinXDirAdj() {
|
||||
|
||||
return textPositions.get(0).getXDirAdj();
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted maxX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMaxXDirAdj() {
|
||||
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMinYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
|
||||
*
|
||||
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMaxYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getHeight() {
|
||||
|
||||
return getMaxYDirAdj() - getMinYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getWidth() {
|
||||
|
||||
return getMaxXDirAdj() - getMinXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFont() {
|
||||
|
||||
return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||
|
||||
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
|
||||
return "bold, italic";
|
||||
} else if (lowercaseFontName.contains("bold")) {
|
||||
return "bold";
|
||||
} else if (lowercaseFontName.contains("italic")) {
|
||||
return "italic";
|
||||
} else {
|
||||
return "standard";
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return bounding box of the word in Pdf Coordinate System
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
@SneakyThrows
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
|
||||
|
||||
float textHeight = getTextHeight();
|
||||
|
||||
RedTextPosition firstTextPos = textPositions.get(0);
|
||||
RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
|
||||
|
||||
Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
|
||||
Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageHeight + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
} else if (dir == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
|
||||
transform.translate(0f, pageWidth + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
} else {
|
||||
transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageWidth + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
}
|
||||
|
||||
bottomLeft = transform.transform(bottomLeft, null);
|
||||
topRight = transform.transform(topRight, null);
|
||||
|
||||
return new Rectangle( //
|
||||
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
|
||||
(float) (topRight.getX() - bottomLeft.getX()),
|
||||
(float) (topRight.getY() - bottomLeft.getY()),
|
||||
page);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,82 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
|
||||
public class PDFAreaTextStripper extends PDFTextStripperByArea {
|
||||
|
||||
@Getter
|
||||
private List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@Setter
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
public PDFAreaTextStripper() throws IOException {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
|
||||
int startIndex = 0;
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) {
|
||||
startIndex++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||
if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
|
||||
if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
startIndex = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, textPositions.size());
|
||||
if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) {
|
||||
sublist = sublist.subList(0, sublist.size() - 1);
|
||||
}
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
}
|
||||
super.writeString(text);
|
||||
}
|
||||
|
||||
|
||||
public void clearPositions() {
|
||||
|
||||
textPositionSequences = new ArrayList<>();
|
||||
}
|
||||
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user