Compare commits
520 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ef23ee0ade | ||
|
|
af31f52b47 | ||
|
|
b5152112ee | ||
|
|
85ea4ef455 | ||
|
|
01f8c01fff | ||
|
|
0b6a292c75 | ||
|
|
e24020589c | ||
|
|
c619b845e8 | ||
|
|
ed0371ca11 | ||
|
|
89b5be8d67 | ||
|
|
077ce60c9d | ||
|
|
ab171be6e2 | ||
|
|
664b47b4c3 | ||
|
|
8005c1f25f | ||
|
|
42185a95a0 | ||
|
|
51b42efaf6 | ||
|
|
6a50d45947 | ||
|
|
073ac12cf7 | ||
|
|
84b054a4cc | ||
|
|
905b65a5fa | ||
|
|
7617c1f308 | ||
|
|
2b3936c09b | ||
|
|
6e5b1f1978 | ||
|
|
cf846d18bc | ||
|
|
25c46f16ac | ||
|
|
96acefed78 | ||
|
|
366241e6c6 | ||
|
|
7f472ccc52 | ||
|
|
6f807c7d94 | ||
|
|
6e04c15f3d | ||
|
|
1384584e2f | ||
|
|
e58011e111 | ||
|
|
a821570065 | ||
|
|
7ee1f9e360 | ||
|
|
f9b25c8157 | ||
|
|
c90874da7a | ||
|
|
4683c696a5 | ||
|
|
95c02ce3cf | ||
|
|
b2d62e32fe | ||
|
|
65c1f03ea3 | ||
|
|
2219519a2b | ||
|
|
af05218e37 | ||
|
|
736f531df3 | ||
|
|
c64445d54b | ||
|
|
af29233b10 | ||
|
|
5f04b45554 | ||
|
|
6c41533f0b | ||
|
|
9d2596e5ef | ||
|
|
e7b01161ac | ||
|
|
7b073eb4f3 | ||
|
|
4b0c041d84 | ||
|
|
6c7442ac6d | ||
|
|
23e23328ee | ||
|
|
9d1ffdd779 | ||
|
|
3109a30ae1 | ||
|
|
fe2ed1807e | ||
|
|
31de229fa5 | ||
|
|
8a80abfff1 | ||
|
|
7c08905eda | ||
|
|
4f40c9dbc9 | ||
|
|
32381b4472 | ||
|
|
469da38952 | ||
|
|
0f8c4674b3 | ||
|
|
8e165a41d7 | ||
|
|
ed7a701ad9 | ||
|
|
393103e074 | ||
|
|
bd02066e2c | ||
|
|
fec19f4afb | ||
|
|
c726a643f0 | ||
|
|
519e95735c | ||
|
|
b52af2637f | ||
|
|
46ea7edc4c | ||
|
|
9650195afd | ||
|
|
ce628a99f7 | ||
|
|
b66afe135c | ||
|
|
dc892d0fec | ||
|
|
af45f2cd8c | ||
|
|
befb6b1df6 | ||
|
|
61efb4cae9 | ||
|
|
4a06059258 | ||
|
|
292e5b215e | ||
|
|
7c2db6c3c5 | ||
|
|
4395074b21 | ||
|
|
8e14b74da2 | ||
|
|
3b91639ea9 | ||
|
|
c5178ea5c2 | ||
|
|
cf39d4dfcc | ||
|
|
bb40345f79 | ||
|
|
e3e9d16145 | ||
|
|
f6ca5a3c17 | ||
|
|
15e3dced35 | ||
|
|
933054b332 | ||
|
|
ab86714cb3 | ||
|
|
8626b106d0 | ||
|
|
52e948e66c | ||
|
|
3b33405cbf | ||
|
|
b2fa14dde2 | ||
|
|
62e07686d7 | ||
|
|
3eb97d614f | ||
|
|
81469413b0 | ||
|
|
2993676a6f | ||
|
|
8e115dcd8a | ||
|
|
173911b840 | ||
|
|
b0ae00aa02 | ||
|
|
00bf9f279e | ||
|
|
d16377a24a | ||
|
|
81179ee744 | ||
|
|
1953b5924f | ||
|
|
6f6e8d5d4e | ||
|
|
69bcd4f68d | ||
|
|
b900cfaf31 | ||
|
|
cdc2081785 | ||
|
|
a9287ec406 | ||
|
|
5b6a706c28 | ||
|
|
28d8ad0a3f | ||
|
|
0c1583c1be | ||
|
|
7633566d9b | ||
|
|
cc4f09711e | ||
|
|
370165dc59 | ||
|
|
8c052c38d7 | ||
|
|
ea18d3d307 | ||
|
|
2726fc3fe1 | ||
|
|
033279e261 | ||
|
|
ec0dd032c9 | ||
|
|
598fa7f1c7 | ||
|
|
65b1f7d179 | ||
|
|
3173610be5 | ||
|
|
e920eb5a78 | ||
|
|
7e4baea7e5 | ||
|
|
66d3433e04 | ||
|
|
a2f559af51 | ||
|
|
39f527a57c | ||
|
|
5c2844fe31 | ||
|
|
b216f02e15 | ||
|
|
2e2f30ba35 | ||
|
|
9f7ed974ec | ||
|
|
570a348a77 | ||
|
|
859dba2ecf | ||
|
|
1c5d755111 | ||
|
|
133e06460f | ||
|
|
da91fcff97 | ||
|
|
79795e408a | ||
|
|
b719db86ab | ||
|
|
797602e373 | ||
|
|
3d2f66cf10 | ||
|
|
e304a9f2d7 | ||
|
|
c05f67cf44 | ||
|
|
9ecf9ca19f | ||
|
|
3a2ee903af | ||
|
|
072a8aa3da | ||
|
|
b5cfa7b63d | ||
|
|
5f5a6258c5 | ||
|
|
ac0e83725a | ||
|
|
5d33ad570e | ||
|
|
fd698a78fc | ||
|
|
c3edeb3c7d | ||
|
|
fc06dba2ce | ||
|
|
b6742c1e89 | ||
|
|
efb1a748af | ||
|
|
9be672c728 | ||
|
|
23985b14be | ||
|
|
48b7a22e2b | ||
|
|
546341ee75 | ||
|
|
0ed1481517 | ||
|
|
b2a47f66ae | ||
|
|
3835d03036 | ||
|
|
a5fcebce30 | ||
|
|
b867deb9f9 | ||
|
|
8648ed0952 | ||
|
|
53f786b539 | ||
|
|
40465e8778 | ||
|
|
a76b2ace3f | ||
|
|
aeaca2f278 | ||
|
|
f1dbcc24a2 | ||
|
|
fda25852d1 | ||
|
|
471fadbcca | ||
|
|
87001090d5 | ||
|
|
ea355429c2 | ||
|
|
6a65d7f9fc | ||
|
|
e935cc7b14 | ||
|
|
07733d0855 | ||
|
|
abb249e966 | ||
|
|
bcd1eb9afa | ||
|
|
60acbac53f | ||
|
|
a3decd292d | ||
|
|
b6f0a21886 | ||
|
|
d61cac8b4f | ||
|
|
ae46c5f1ca | ||
|
|
f0a70a5242 | ||
|
|
15ea385f4d | ||
|
|
08be18db2d | ||
|
|
64209255cb | ||
|
|
4761d2e1a2 | ||
|
|
1916e626df | ||
|
|
e4663ac8db | ||
|
|
6a691183dc | ||
|
|
3dd215288a | ||
|
|
6fb1a0bef3 | ||
|
|
4e7c3f584b | ||
|
|
84bdb4d1ed | ||
|
|
75ab4df592 | ||
|
|
8442e60055 | ||
|
|
0ef67fc07b | ||
|
|
ea02f31a84 | ||
|
|
58acbab85f | ||
|
|
d38d023485 | ||
|
|
c1afe9b11f | ||
|
|
bdcb9aeda4 | ||
|
|
6a86036a78 | ||
|
|
a358d7565e | ||
|
|
069a6c0b49 | ||
|
|
683f7f1fb8 | ||
|
|
7eab3a4088 | ||
|
|
970fc99ed1 | ||
|
|
48c54f63a0 | ||
|
|
20e4e5ddff | ||
|
|
b53930328a | ||
|
|
c947d552d2 | ||
|
|
6b1b5eab84 | ||
|
|
cc9816c8cb | ||
|
|
f256f9b30f | ||
|
|
6167e3fb57 | ||
|
|
a78fb0244a | ||
|
|
8099a00bb6 | ||
|
|
9bb0468b2b | ||
|
|
c4d9c5df02 | ||
|
|
976f408237 | ||
|
|
319268c53d | ||
|
|
014eba9fc3 | ||
|
|
9bd8419770 | ||
|
|
c13ff7fbf6 | ||
|
|
5d3826e9b9 | ||
|
|
0c3194276a | ||
|
|
e302d9784e | ||
|
|
f185b13f2b | ||
|
|
990c376ce6 | ||
|
|
bf6a0d770b | ||
|
|
f18bda1d4e | ||
|
|
0a11992361 | ||
|
|
456b8fe4a1 | ||
|
|
9778ece992 | ||
|
|
8bd0de6263 | ||
|
|
5c1708f97f | ||
|
|
a35d77be2e | ||
|
|
631160eb22 | ||
|
|
8e7e588d26 | ||
|
|
ac850c2626 | ||
|
|
1d765a6baa | ||
|
|
c55984aa67 | ||
|
|
27aa418029 | ||
|
|
c4edff4696 | ||
|
|
92fd1a72de | ||
|
|
0d3d25e7d7 | ||
|
|
956fbff872 | ||
|
|
2488009af1 | ||
|
|
16be2467fd | ||
|
|
f4cae8a7dc | ||
|
|
dfc23955d7 | ||
|
|
d6e3d6fe22 | ||
|
|
bef23e38b5 | ||
|
|
65ab7a1912 | ||
|
|
d80231e4a9 | ||
|
|
56c07a4491 | ||
|
|
0b4ad29dcb | ||
|
|
0ad0cd45d6 | ||
|
|
d659fe7234 | ||
|
|
cb9127b4f3 | ||
|
|
05523585c0 | ||
|
|
4ced572949 | ||
|
|
79239b751d | ||
|
|
f146beeb44 | ||
|
|
f8a4ccfff0 | ||
|
|
a6ba501fa8 | ||
|
|
7dfb3b2b52 | ||
|
|
c324d3815e | ||
|
|
74f55a5cbf | ||
|
|
e7bf607663 | ||
|
|
f4d789311c | ||
|
|
9817eae897 | ||
|
|
477f6af886 | ||
|
|
2c171b6a9e | ||
|
|
71477dabde | ||
|
|
a927cbd9dc | ||
|
|
a1521877d7 | ||
|
|
f4b6386e1c | ||
|
|
1d64028158 | ||
|
|
0979a267d4 | ||
|
|
cc77d19500 | ||
|
|
fa048b2fe0 | ||
|
|
bdf1161c91 | ||
|
|
b4a225144d | ||
|
|
903b1c1fd4 | ||
|
|
c3e7582ee3 | ||
|
|
cfc5db45cd | ||
|
|
fbd0196719 | ||
|
|
3c9049dc8a | ||
|
|
015984891f | ||
|
|
66fcb62833 | ||
|
|
48824f56a8 | ||
|
|
785628537f | ||
|
|
23eb0c40a3 | ||
|
|
1b4aaf4454 | ||
|
|
e4f3557b36 | ||
|
|
9be3c86297 | ||
|
|
88855de2da | ||
|
|
368a75e985 | ||
|
|
12344d57b2 | ||
|
|
9e854379e7 | ||
|
|
b779c72041 | ||
|
|
760a809900 | ||
|
|
ba1c7c07ab | ||
|
|
ca0cbbcb49 | ||
|
|
da2cdc288e | ||
|
|
68da328889 | ||
|
|
711548d1a7 | ||
|
|
2bddcdafee | ||
|
|
750ccf4ce2 | ||
|
|
57b5d3f48e | ||
|
|
d8c9659469 | ||
|
|
30f060e36c | ||
|
|
53a5824e6c | ||
|
|
e2bcf971c9 | ||
|
|
dacc2f7f43 | ||
|
|
144a9591a2 | ||
|
|
207d9dec97 | ||
|
|
09ee90222e | ||
|
|
1316a067fe | ||
|
|
e203210ade | ||
|
|
b25d46291a | ||
|
|
84148d3b6e | ||
|
|
a6ba66b1aa | ||
|
|
c3e69b2cdf | ||
|
|
f69331e7d8 | ||
|
|
01493dc033 | ||
|
|
459e0c8be7 | ||
|
|
1b1f777706 | ||
|
|
0e0a811f9d | ||
|
|
efa3d75479 | ||
|
|
9abdc6d44d | ||
|
|
3bab61c446 | ||
|
|
d17517d3c3 | ||
|
|
567cbc178b | ||
|
|
3c53772765 | ||
|
|
8647cf5a18 | ||
|
|
310c07b200 | ||
|
|
daba0bf8a6 | ||
|
|
3839de215c | ||
|
|
b4d68594f1 | ||
|
|
99ed331a1e | ||
|
|
f2c0991987 | ||
|
|
b8ef55e6e2 | ||
|
|
5792ff4a93 | ||
|
|
621c3f269d | ||
|
|
8dba392904 | ||
|
|
306a53ea79 | ||
|
|
754fd8f933 | ||
|
|
28ec4c9ccb | ||
|
|
aed4a55787 | ||
|
|
f87e2d75b5 | ||
|
|
de6760abc1 | ||
|
|
261ef4c367 | ||
|
|
11ba9c6bb9 | ||
|
|
b7c3d02978 | ||
|
|
bcf0bcbaf4 | ||
|
|
84cde2a3db | ||
|
|
6f2dd4f823 | ||
|
|
a909724217 | ||
|
|
67a981e7a8 | ||
|
|
0e93fdd515 | ||
|
|
d464239f9b | ||
|
|
88a20924b9 | ||
|
|
f89243472c | ||
|
|
ad3612acd4 | ||
|
|
630eee6bd7 | ||
|
|
a951911ec8 | ||
|
|
75e6b88705 | ||
|
|
2e0adbdd9a | ||
|
|
b747742558 | ||
|
|
192c9976c1 | ||
|
|
b251697492 | ||
|
|
22d6b25fe4 | ||
|
|
e6bcd6fb2b | ||
|
|
2847adde22 | ||
|
|
7cf67d7121 | ||
|
|
3a18923ef5 | ||
|
|
2b15fd1d3c | ||
|
|
3722fff476 | ||
|
|
0cb8029f0a | ||
|
|
b270b9c942 | ||
|
|
60615ec5d8 | ||
|
|
880914a167 | ||
|
|
a80a93d2b0 | ||
|
|
0afa7e5b12 | ||
|
|
12516ebf22 | ||
|
|
0dca90c3fe | ||
|
|
2506c9e091 | ||
|
|
83d39ba3a5 | ||
|
|
c09bb06da6 | ||
|
|
1793b1138e | ||
|
|
d30735bc49 | ||
|
|
9356db5373 | ||
|
|
ee766e7150 | ||
|
|
a33bbc9abc | ||
|
|
5758295fac | ||
|
|
8142d0aa09 | ||
|
|
dc80353a5b | ||
|
|
4d856b04b3 | ||
|
|
6ba25ecaa0 | ||
|
|
fcdcaf16e9 | ||
|
|
086e338f4a | ||
|
|
9dbe73f376 | ||
|
|
aaf4015c95 | ||
|
|
2b65ad4b4b | ||
|
|
c0c75f6a0e | ||
|
|
2f4af6e377 | ||
|
|
b9a305bf2d | ||
|
|
db6b6af4d7 | ||
|
|
d73addf7ed | ||
|
|
c7978c93c2 | ||
|
|
457f7d9c66 | ||
|
|
0387cdd143 | ||
|
|
5c6898b975 | ||
|
|
b7b273b47d | ||
|
|
9aa9cb2d54 | ||
|
|
ee6c21638f | ||
|
|
1e4475afdf | ||
|
|
708d274ebc | ||
|
|
a94faad870 | ||
|
|
d854125867 | ||
|
|
63de8ef82d | ||
|
|
ea0af08c31 | ||
|
|
810caa0624 | ||
|
|
c282372dc8 | ||
|
|
055ccd3366 | ||
|
|
4b4c73fb7b | ||
|
|
35b9cfd1c2 | ||
|
|
9a73b952cf | ||
|
|
a1c73094f1 | ||
|
|
b79d9946a9 | ||
|
|
a9735daa04 | ||
|
|
d00491c15e | ||
|
|
ed48b6a4bf | ||
|
|
62eade84b9 | ||
|
|
d6a217fe70 | ||
|
|
f1e4d0d52b | ||
|
|
38a1e8b95f | ||
|
|
a371558f8c | ||
|
|
b716b187eb | ||
|
|
0be6454a7e | ||
|
|
5fde631e04 | ||
|
|
c076c10840 | ||
|
|
24104f8cc1 | ||
|
|
3632dd4667 | ||
|
|
063aa8bfe1 | ||
|
|
d2716a60e9 | ||
|
|
8f08a8c62b | ||
|
|
091cb73622 | ||
|
|
d3b0bc430f | ||
|
|
e3c12bc1bb | ||
|
|
f6f7a0a952 | ||
|
|
96df6e3145 | ||
|
|
574e5ad425 | ||
|
|
0611e56baa | ||
|
|
442c1dafea | ||
|
|
33bc532eac | ||
|
|
4bd6e7e343 | ||
|
|
159ac6348c | ||
|
|
17259ed805 | ||
|
|
67bf5cbaa8 | ||
|
|
f8a3cbc147 | ||
|
|
a3d4fbe3a3 | ||
|
|
5c1dca5933 | ||
|
|
f56ab8fa49 | ||
|
|
cfca5376a0 | ||
|
|
0633fa04fb | ||
|
|
659a9abaa5 | ||
|
|
5877aea3f7 | ||
|
|
f2b92de827 | ||
|
|
4a5464d6aa | ||
|
|
d9a3bbbd30 | ||
|
|
150aea55c0 | ||
|
|
676f0c9d09 | ||
|
|
ded00df11e | ||
|
|
286556cbb6 | ||
|
|
d6a74dc9f9 | ||
|
|
2a55654fcf | ||
|
|
7496914b37 | ||
|
|
c8ace585e1 | ||
|
|
69c5f80c8c | ||
|
|
79d27189fd | ||
|
|
75bac72c05 | ||
|
|
c5e6271dc3 | ||
|
|
5561dd5e95 | ||
|
|
041b633742 | ||
|
|
715426bd3b | ||
|
|
464b8053fe | ||
|
|
2fece83c7c | ||
|
|
ad03ef1922 | ||
|
|
cc44100e4e | ||
|
|
5d1c1ae406 | ||
|
|
f72838b0be | ||
|
|
6388898cc0 | ||
|
|
72d1e6271a | ||
|
|
299b5be385 | ||
|
|
2ea58f5e9f | ||
|
|
510ec7ce45 | ||
|
|
c186927e3d | ||
|
|
1a494b0dea | ||
|
|
19552ddf69 | ||
|
|
41267a0f98 | ||
|
|
270129cd73 | ||
|
|
a41c13fdd6 | ||
|
|
65ab5eca22 | ||
|
|
143ebee25e | ||
|
|
47fd8e05d1 | ||
|
|
653f280fd1 | ||
|
|
daa68f3fa6 | ||
|
|
ed66043856 | ||
|
|
526b1c5ad3 | ||
|
|
241a32cb4f |
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
7
.gitignore
vendored
7
.gitignore
vendored
@ -18,6 +18,7 @@ target/
|
|||||||
.settings
|
.settings
|
||||||
.springBeans
|
.springBeans
|
||||||
.sts4-cache
|
.sts4-cache
|
||||||
|
.gradle
|
||||||
|
|
||||||
### IntelliJ IDEA ###
|
### IntelliJ IDEA ###
|
||||||
.idea
|
.idea
|
||||||
@ -37,3 +38,9 @@ build/
|
|||||||
|
|
||||||
### VS Code ###
|
### VS Code ###
|
||||||
.vscode/
|
.vscode/
|
||||||
|
gradlew.bat
|
||||||
|
gradlew
|
||||||
|
gradle.properties
|
||||||
|
gradle/
|
||||||
|
.DS_Store
|
||||||
|
.DS_Store/
|
||||||
|
|||||||
@ -1,4 +1,26 @@
|
|||||||
|
variables:
|
||||||
|
# SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
|
||||||
|
GIT_SUBMODULE_STRATEGY: recursive
|
||||||
|
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
||||||
include:
|
include:
|
||||||
- project: 'gitlab/gitlab'
|
- project: 'gitlab/gitlab'
|
||||||
ref: 'main'
|
ref: 'main'
|
||||||
file: 'ci-templates/maven_java.yml'
|
file: 'ci-templates/gradle_java.yml'
|
||||||
|
|
||||||
|
deploy:
|
||||||
|
stage: deploy
|
||||||
|
tags:
|
||||||
|
- dind
|
||||||
|
script:
|
||||||
|
- echo "Building with gradle version ${BUILDVERSION}"
|
||||||
|
- gradle -Pversion=${BUILDVERSION} publish
|
||||||
|
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
|
||||||
|
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
|
||||||
|
artifacts:
|
||||||
|
reports:
|
||||||
|
dotenv: version.env
|
||||||
|
rules:
|
||||||
|
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||||
|
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
|
||||||
|
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||||
|
- if: $CI_COMMIT_TAG
|
||||||
|
|||||||
8
.gitmodules
vendored
Normal file
8
.gitmodules
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
|
||||||
|
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
|
||||||
|
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
|
||||||
|
update = merge
|
||||||
|
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
|
||||||
|
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
|
||||||
|
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
|
||||||
|
update = merge
|
||||||
BIN
.mvn/wrapper/maven-wrapper.jar
vendored
BIN
.mvn/wrapper/maven-wrapper.jar
vendored
Binary file not shown.
18
.mvn/wrapper/maven-wrapper.properties
vendored
18
.mvn/wrapper/maven-wrapper.properties
vendored
@ -1,18 +0,0 @@
|
|||||||
# Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
# or more contributor license agreements. See the NOTICE file
|
|
||||||
# distributed with this work for additional information
|
|
||||||
# regarding copyright ownership. The ASF licenses this file
|
|
||||||
# to you under the Apache License, Version 2.0 (the
|
|
||||||
# "License"); you may not use this file except in compliance
|
|
||||||
# with the License. You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# https://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
|
|
||||||
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar
|
|
||||||
88
README.md
88
README.md
@ -1 +1,89 @@
|
|||||||
|
# PDF Layout Parser Micro-Service: layout-parser
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
The layout-parser micro-service is a powerful tool designed to efficiently extract structured information from PDF documents. Written in Java and utilizing Spring Boot 3, Apache PDFBox, and RabbitMQ, this micro-service excels at parsing PDFs and organizing their content into a meaningful and coherent layout structure. Notably, the layout-parser micro-service distinguishes itself by relying solely on advanced algorithms, rather than machine learning techniques.
|
||||||
|
|
||||||
|
### Key Steps in the PDF Layout Parsing Process:
|
||||||
|
|
||||||
|
* **Text Position Extraction:**
|
||||||
|
The micro-service leverages Apache PDFBox to extract precise text positions for each individual character within the PDF document.
|
||||||
|
|
||||||
|
* **Word Segmentation and Text Block Formation:**
|
||||||
|
Employing an array of diverse algorithms, the micro-service initially identifies and segments words, creating distinct text blocks.
|
||||||
|
|
||||||
|
* **Text Block Classification:**
|
||||||
|
The segmented text blocks are then subjected to classification algorithms. These algorithms categorize the text blocks based on their content and visual properties, distinguishing between sections, subsections, headlines, paragraphs, images, tables, table cells, headers, and footers.
|
||||||
|
|
||||||
|
* **Layout Coherence Establishment:**
|
||||||
|
The classified text blocks are subsequently orchestrated into a cohesive layout structure. This process involves arranging sections, subsections, paragraphs, images, and other elements in a logical and structured manner.
|
||||||
|
|
||||||
|
* **Output Generation in Various Formats:**
|
||||||
|
Once the layout structure is established, the micro-service generates output in multiple formats. These formats are designed for seamless integration with downstream micro-services. The supported formats include JSON, XML, and others, ensuring flexibility in downstream data consumption.
|
||||||
|
|
||||||
|
### Optional Enhancements:
|
||||||
|
|
||||||
|
* **ML-Based Table Extraction:**
|
||||||
|
For enhanced results, users have the option to incorporate machine learning-based table extraction. This feature can be activated by providing ML-generated results as a JSON file, which are then integrated seamlessly into the layout structure.
|
||||||
|
|
||||||
|
* **Image Classification using ML:**
|
||||||
|
Additionally, for more accurate image classification, users can optionally feed ML-generated image classification results into the micro-service. Similar to the table extraction option, the micro-service processes the pre-parsed results in JSON format, thus optimizing the accuracy of image content identification.
|
||||||
|
|
||||||
|
In conclusion, the layout-parser micro-service is a versatile PDF layout parsing solution crafted entirely around advanced algorithms, without reliance on machine learning. It proficiently extracts text positions, segments content into meaningful blocks, classifies these blocks, arranges them coherently, and outputs structured data for downstream micro-services. Optional integration with ML-generated table extractions and image classifications further enhances its capabilities.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
Before building and using the layout-parser micro-service, please ensure you have the following software and tools installed:
|
||||||
|
|
||||||
|
Java Development Kit (JDK) 17 or later
|
||||||
|
Gradle build tool (preinstalled)
|
||||||
|
Build and Test
|
||||||
|
To build and test the micro-service, follow these steps:
|
||||||
|
|
||||||
|
### Clone the Repository:
|
||||||
|
|
||||||
|
bash
|
||||||
|
```
|
||||||
|
git clone ssh://git@git.knecon.com:22222/fforesight/layout-parser.git
|
||||||
|
cd layout-parser
|
||||||
|
```
|
||||||
|
### Build the Project:
|
||||||
|
Use the following command to build the project using Gradle:
|
||||||
|
|
||||||
|
```
|
||||||
|
gradle clean build
|
||||||
|
```
|
||||||
|
### Run Tests:
|
||||||
|
Run the test suite using the following command:
|
||||||
|
```
|
||||||
|
gradle test
|
||||||
|
```
|
||||||
|
## Building a Custom Docker Image
|
||||||
|
To create a custom Docker image for the layout-parser micro-service, execute the provided script:
|
||||||
|
|
||||||
|
### Ensure Docker is Installed:
|
||||||
|
Ensure that Docker is installed and running on your system.
|
||||||
|
|
||||||
|
### Run the Image Building Script:
|
||||||
|
Execute the publish-custom-image script in the project directory:
|
||||||
|
|
||||||
|
```
|
||||||
|
./publish-custom-image
|
||||||
|
```
|
||||||
|
## Publishing to Internal Maven Repository
|
||||||
|
To publish the layout-parser micro-service to your internal Maven repository, execute the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
gradle -Pversion=buildVersion publish
|
||||||
|
```
|
||||||
|
Replace buildVersion with the desired version number.
|
||||||
|
|
||||||
|
## Additional Notes
|
||||||
|
Make sure to configure any necessary application properties before deploying the micro-service.
|
||||||
|
For advanced usage and configurations, refer to Kilian or Dom or preferably the source code.
|
||||||
|
|||||||
7
buildSrc/build.gradle.kts
Normal file
7
buildSrc/build.gradle.kts
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
plugins {
|
||||||
|
`kotlin-dsl`
|
||||||
|
}
|
||||||
|
|
||||||
|
repositories {
|
||||||
|
gradlePluginPortal()
|
||||||
|
}
|
||||||
@ -0,0 +1,93 @@
|
|||||||
|
plugins {
|
||||||
|
`java-library`
|
||||||
|
`maven-publish`
|
||||||
|
pmd
|
||||||
|
checkstyle
|
||||||
|
jacoco
|
||||||
|
}
|
||||||
|
|
||||||
|
group = "com.knecon.fforesight"
|
||||||
|
|
||||||
|
val documentVersion by rootProject.extra { "4.433.0" }
|
||||||
|
|
||||||
|
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||||
|
java.targetCompatibility = JavaVersion.VERSION_17
|
||||||
|
|
||||||
|
tasks.pmdMain {
|
||||||
|
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.pmdTest {
|
||||||
|
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.named<Test>("test") {
|
||||||
|
useJUnitPlatform()
|
||||||
|
reports {
|
||||||
|
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||||
|
}
|
||||||
|
minHeapSize = "512m"
|
||||||
|
maxHeapSize = "2048m"
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.test {
|
||||||
|
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.jacocoTestReport {
|
||||||
|
dependsOn(tasks.test) // tests are required to run before generating the report
|
||||||
|
reports {
|
||||||
|
xml.required.set(true)
|
||||||
|
csv.required.set(false)
|
||||||
|
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
allprojects {
|
||||||
|
|
||||||
|
tasks.withType<Javadoc> {
|
||||||
|
options {
|
||||||
|
this as StandardJavadocDocletOptions
|
||||||
|
addBooleanOption("Xdoclint:none", true)
|
||||||
|
addStringOption("Xmaxwarns", "1")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pmd {
|
||||||
|
setConsoleOutput(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
publishing {
|
||||||
|
publications {
|
||||||
|
create<MavenPublication>(name) {
|
||||||
|
from(components["java"])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
repositories {
|
||||||
|
maven {
|
||||||
|
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
|
||||||
|
credentials {
|
||||||
|
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||||
|
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
withJavadocJar()
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
repositories {
|
||||||
|
mavenLocal()
|
||||||
|
mavenCentral()
|
||||||
|
maven {
|
||||||
|
url = uri("https://nexus.knecon.com/repository/gindev/")
|
||||||
|
credentials {
|
||||||
|
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||||
|
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
39
config/checkstyle/checkstyle.xml
Normal file
39
config/checkstyle/checkstyle.xml
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
|
||||||
|
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
|
||||||
|
<module name="Checker">
|
||||||
|
<property
|
||||||
|
name="severity"
|
||||||
|
value="error"/>
|
||||||
|
<module name="TreeWalker">
|
||||||
|
<module name="SuppressWarningsHolder"/>
|
||||||
|
<module name="MissingDeprecated"/>
|
||||||
|
<module name="MissingOverride"/>
|
||||||
|
<module name="AnnotationLocation"/>
|
||||||
|
<module name="JavadocStyle"/>
|
||||||
|
<module name="NonEmptyAtclauseDescription"/>
|
||||||
|
<module name="IllegalImport"/>
|
||||||
|
<module name="RedundantImport"/>
|
||||||
|
<module name="RedundantModifier"/>
|
||||||
|
<module name="EmptyBlock"/>
|
||||||
|
<module name="DefaultComesLast"/>
|
||||||
|
<module name="EmptyStatement"/>
|
||||||
|
<module name="EqualsHashCode"/>
|
||||||
|
<module name="ExplicitInitialization"/>
|
||||||
|
<module name="IllegalInstantiation"/>
|
||||||
|
<module name="ModifiedControlVariable"/>
|
||||||
|
<module name="MultipleVariableDeclarations"/>
|
||||||
|
<module name="PackageDeclaration"/>
|
||||||
|
<module name="ParameterAssignment"/>
|
||||||
|
<module name="SimplifyBooleanExpression"/>
|
||||||
|
<module name="SimplifyBooleanReturn"/>
|
||||||
|
<module name="StringLiteralEquality"/>
|
||||||
|
<module name="OneStatementPerLine"/>
|
||||||
|
<module name="FinalClass"/>
|
||||||
|
<module name="ArrayTypeStyle"/>
|
||||||
|
<module name="UpperEll"/>
|
||||||
|
<module name="OuterTypeFilename"/>
|
||||||
|
</module>
|
||||||
|
<module name="FileTabCharacter"/>
|
||||||
|
<module name="SuppressWarningsFilter"/>
|
||||||
|
</module>
|
||||||
21
config/pmd/pmd.xml
Normal file
21
config/pmd/pmd.xml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<ruleset name="Custom ruleset"
|
||||||
|
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||||
|
|
||||||
|
<description>
|
||||||
|
Knecon ruleset checks the code for bad stuff
|
||||||
|
</description>
|
||||||
|
|
||||||
|
<rule ref="category/java/errorprone.xml">
|
||||||
|
<exclude name="MissingSerialVersionUID"/>
|
||||||
|
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||||
|
<exclude name="AvoidDuplicateLiterals"/>
|
||||||
|
<exclude name="NullAssignment"/>
|
||||||
|
<exclude name="AssignmentInOperand"/>
|
||||||
|
<exclude name="BeanMembersShouldSerialize"/>
|
||||||
|
</rule>
|
||||||
|
|
||||||
|
</ruleset>
|
||||||
|
|
||||||
23
config/pmd/test_pmd.xml
Normal file
23
config/pmd/test_pmd.xml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<ruleset name="Custom ruleset"
|
||||||
|
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||||
|
|
||||||
|
<description>
|
||||||
|
Knecon test ruleset checks the code for bad stuff
|
||||||
|
</description>
|
||||||
|
|
||||||
|
|
||||||
|
<rule ref="category/java/errorprone.xml">
|
||||||
|
<exclude name="MissingSerialVersionUID"/>
|
||||||
|
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||||
|
<exclude name="AvoidDuplicateLiterals"/>
|
||||||
|
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||||
|
<exclude name="NullAssignment"/>
|
||||||
|
<exclude name="AssignmentInOperand"/>
|
||||||
|
<exclude name="TestClassWithoutTestCases"/>
|
||||||
|
<exclude name="BeanMembersShouldSerialize"/>
|
||||||
|
</rule>
|
||||||
|
|
||||||
|
</ruleset>
|
||||||
1
gradle.properties.kts
Normal file
1
gradle.properties.kts
Normal file
@ -0,0 +1 @@
|
|||||||
|
version = 0.1-SNAPSHOT
|
||||||
@ -1,99 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
|
|
||||||
<parent>
|
|
||||||
<groupId>com.knecon.fforesight</groupId>
|
|
||||||
<artifactId>platform-docker-dependency</artifactId>
|
|
||||||
<version>0.1.0</version>
|
|
||||||
<relativePath/>
|
|
||||||
</parent>
|
|
||||||
|
|
||||||
<groupId>com.knecon.fforesight</groupId>
|
|
||||||
<artifactId>layoutparser-service-image</artifactId>
|
|
||||||
<version>0.1-SNAPSHOT</version>
|
|
||||||
<packaging>pom</packaging>
|
|
||||||
|
|
||||||
<properties>
|
|
||||||
<service.server>layoutparser-service-server</service.server>
|
|
||||||
<platform.jar>${service.server}.jar</platform.jar>
|
|
||||||
<docker.skip.push>false</docker.skip.push>
|
|
||||||
<docker.image.prefix>ff</docker.image.prefix>
|
|
||||||
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
|
|
||||||
</properties>
|
|
||||||
|
|
||||||
<build>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-dependency-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-resources-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.codehaus.mojo</groupId>
|
|
||||||
<artifactId>exec-maven-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>io.fabric8</groupId>
|
|
||||||
<artifactId>docker-maven-plugin</artifactId>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
|
|
||||||
<pluginManagement>
|
|
||||||
<plugins>
|
|
||||||
<plugin>
|
|
||||||
<groupId>org.apache.maven.plugins</groupId>
|
|
||||||
<artifactId>maven-dependency-plugin</artifactId>
|
|
||||||
<executions>
|
|
||||||
<execution>
|
|
||||||
<id>download-platform-jar</id>
|
|
||||||
<phase>prepare-package</phase>
|
|
||||||
<goals>
|
|
||||||
<goal>copy</goal>
|
|
||||||
</goals>
|
|
||||||
<configuration>
|
|
||||||
<artifactItems>
|
|
||||||
<dependency>
|
|
||||||
<groupId>${project.groupId}</groupId>
|
|
||||||
<artifactId>${service.server}</artifactId>
|
|
||||||
<version>${project.version}</version>
|
|
||||||
<type>jar</type>
|
|
||||||
<overWrite>true</overWrite>
|
|
||||||
<destFileName>${platform.jar}</destFileName>
|
|
||||||
</dependency>
|
|
||||||
</artifactItems>
|
|
||||||
<outputDirectory>${docker.build.directory}</outputDirectory>
|
|
||||||
</configuration>
|
|
||||||
</execution>
|
|
||||||
</executions>
|
|
||||||
</plugin>
|
|
||||||
<plugin>
|
|
||||||
<groupId>io.fabric8</groupId>
|
|
||||||
<artifactId>docker-maven-plugin</artifactId>
|
|
||||||
<configuration>
|
|
||||||
<images>
|
|
||||||
<image>
|
|
||||||
<name>${docker.image.name}</name>
|
|
||||||
<build>
|
|
||||||
<dockerFileDir>${docker.build.directory}</dockerFileDir>
|
|
||||||
<args>
|
|
||||||
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
|
|
||||||
</args>
|
|
||||||
<tags>
|
|
||||||
<tag>${docker.image.version}</tag>
|
|
||||||
<tag>latest</tag>
|
|
||||||
</tags>
|
|
||||||
</build>
|
|
||||||
</image>
|
|
||||||
</images>
|
|
||||||
</configuration>
|
|
||||||
</plugin>
|
|
||||||
</plugins>
|
|
||||||
</pluginManagement>
|
|
||||||
</build>
|
|
||||||
</project>
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
FROM red/base-image:2.0.2
|
|
||||||
|
|
||||||
ARG PLATFORM_JAR
|
|
||||||
|
|
||||||
ENV PLATFORM_JAR ${PLATFORM_JAR}
|
|
||||||
|
|
||||||
ENV USES_ELASTICSEARCH false
|
|
||||||
|
|
||||||
COPY ["${PLATFORM_JAR}", "/"]
|
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
plugins {
|
||||||
|
id("com.knecon.fforesight.java-conventions")
|
||||||
|
id("io.freefair.lombok") version "8.4"
|
||||||
|
}
|
||||||
|
|
||||||
|
description = "layoutparser-service-internal-api"
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
|
||||||
|
}
|
||||||
@ -1,24 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xmlns="http://maven.apache.org/POM/4.0.0"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<parent>
|
|
||||||
<groupId>com.knecon.fforesight</groupId>
|
|
||||||
<artifactId>layoutparser-service</artifactId>
|
|
||||||
<version>0.1-SNAPSHOT</version>
|
|
||||||
</parent>
|
|
||||||
|
|
||||||
<artifactId>layoutparser-service-internal-api</artifactId>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.google.guava</groupId>
|
|
||||||
<artifactId>guava</artifactId>
|
|
||||||
<version>${guava.version}</version>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
</project>
|
|
||||||
@ -1,19 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
@AllArgsConstructor
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public class AtomicPositionBlockData {
|
|
||||||
|
|
||||||
Long id;
|
|
||||||
int[] stringIdxToPositionIdx;
|
|
||||||
float[][] positions;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
@AllArgsConstructor
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public class AtomicTextBlockData {
|
|
||||||
|
|
||||||
Long id;
|
|
||||||
Long page;
|
|
||||||
String searchText;
|
|
||||||
int numberOnPage;
|
|
||||||
int start;
|
|
||||||
int end;
|
|
||||||
int[] lineBreaks;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
@AllArgsConstructor
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public class DocumentData {
|
|
||||||
|
|
||||||
PageData[] pages;
|
|
||||||
AtomicTextBlockData[] atomicTextBlocks;
|
|
||||||
AtomicPositionBlockData[] atomicPositionBlocks;
|
|
||||||
DocumentTreeData documentTreeData;
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,90 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
@AllArgsConstructor
|
|
||||||
@NoArgsConstructor
|
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
|
||||||
public class DocumentTreeData {
|
|
||||||
|
|
||||||
EntryData root;
|
|
||||||
|
|
||||||
|
|
||||||
public EntryData get(List<Integer> tocId) {
|
|
||||||
|
|
||||||
if (tocId.isEmpty()) {
|
|
||||||
return root;
|
|
||||||
}
|
|
||||||
EntryData entry = root.children.get(tocId.get(0));
|
|
||||||
for (int id : tocId.subList(1, tocId.size())) {
|
|
||||||
entry = entry.children.get(id);
|
|
||||||
}
|
|
||||||
return entry;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Stream<EntryData> streamAllEntries() {
|
|
||||||
|
|
||||||
return Stream.concat(Stream.of(root), root.children.stream()).flatMap(DocumentTreeData::flatten);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static Stream<EntryData> flatten(EntryData entry) {
|
|
||||||
|
|
||||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Builder
|
|
||||||
@Getter
|
|
||||||
@AllArgsConstructor
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public static class EntryData {
|
|
||||||
|
|
||||||
NodeType type;
|
|
||||||
int[] treeId;
|
|
||||||
Long[] atomicBlockIds;
|
|
||||||
Long[] pageNumbers;
|
|
||||||
Map<String, String> properties;
|
|
||||||
List<EntryData> children;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
sb.append("[");
|
|
||||||
for (int i : treeId) {
|
|
||||||
sb.append(i);
|
|
||||||
sb.append(",");
|
|
||||||
}
|
|
||||||
sb.delete(sb.length() - 1, sb.length());
|
|
||||||
sb.append("]: ");
|
|
||||||
|
|
||||||
sb.append(type);
|
|
||||||
sb.append(" atbs = ");
|
|
||||||
sb.append(atomicBlockIds.length);
|
|
||||||
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
import java.util.Locale;
|
|
||||||
|
|
||||||
public enum NodeType {
|
|
||||||
DOCUMENT,
|
|
||||||
SECTION,
|
|
||||||
HEADLINE,
|
|
||||||
PARAGRAPH,
|
|
||||||
TABLE,
|
|
||||||
TABLE_CELL,
|
|
||||||
IMAGE,
|
|
||||||
HEADER,
|
|
||||||
FOOTER;
|
|
||||||
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
@AllArgsConstructor
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public class PageData {
|
|
||||||
|
|
||||||
int number;
|
|
||||||
int height;
|
|
||||||
int width;
|
|
||||||
int rotation;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -0,0 +1,21 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure Section class.")
|
||||||
|
public class SimplifiedSectionText {
|
||||||
|
|
||||||
|
@Schema(description = "The number of this Section. This is used to map the simplified section text back to the original Section.")
|
||||||
|
private String sectionNumber;
|
||||||
|
@Schema(description = "The text in this Section.")
|
||||||
|
private String text;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure.")
|
||||||
|
public class SimplifiedText {
|
||||||
|
|
||||||
|
@Schema(description = "Number of pages in the entire document.")
|
||||||
|
private int numberOfPages;
|
||||||
|
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
|
||||||
|
@Builder.Default
|
||||||
|
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
|
||||||
|
@Schema(description = "A list of the main section numbers ")
|
||||||
|
@Builder.Default
|
||||||
|
private List<String> mainSectionNumbers = new ArrayList<>();
|
||||||
|
@Schema(description = "A list of the header section numbers ")
|
||||||
|
@Builder.Default
|
||||||
|
private List<String> headerSectionNumbers = new ArrayList<>();
|
||||||
|
@Schema(description = "A list of the footer section numbers ")
|
||||||
|
@Builder.Default
|
||||||
|
private List<String> footerSectionNumbers = new ArrayList<>();
|
||||||
|
|
||||||
|
}
|
||||||
@ -2,20 +2,29 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
|||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@Builder
|
@Builder
|
||||||
|
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer.")
|
||||||
public class ParagraphData {
|
public class ParagraphData {
|
||||||
|
|
||||||
|
@Schema(description = "The text of this Semantic Node, without any linebreaks.", example = "This is some text.")
|
||||||
private String text;
|
private String text;
|
||||||
|
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is bold.", example = "[0, 15]")
|
||||||
List<Range> boldTextBoundaries;
|
List<Range> boldTextBoundaries;
|
||||||
|
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is italic.", example = "[0, 15]")
|
||||||
List<Range> italicTextBoundaries;
|
List<Range> italicTextBoundaries;
|
||||||
|
@Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
|
||||||
List<Integer> linebreaks;
|
List<Integer> linebreaks;
|
||||||
|
@Schema(description = "The classification of this Paragraph.", allowableValues = "{paragraph, headline, header, footer}")
|
||||||
private String classification;
|
private String classification;
|
||||||
|
|
||||||
|
@Schema(description = "Describes the text orientation of this semantic node. Any semantic node only has a single text orientation.", allowableValues = "{ZERO, QUARTER_CIRCLE, HALF_CIRCLE, THREE_QUARTER_CIRCLE}")
|
||||||
private String orientation;
|
private String orientation;
|
||||||
|
@Schema(description = "Describes the text direction in degrees of this semantic node. Any semantic node only has a single text direction.", minimum = "0", maximum = "359")
|
||||||
private int textDirection;
|
private int textDirection;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
|
||||||
|
@Schema(description = "Object specifying the start and end offsets of a text range in string offsets.")
|
||||||
public record Range(int start, int end) {
|
public record Range(int start, int end) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
|||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
@ -9,8 +10,12 @@ import lombok.Data;
|
|||||||
@Builder
|
@Builder
|
||||||
@Data
|
@Data
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
|
@Schema(description = "Object containing a simplified version of the document structure. This simplified form only knows Paragraphs and Tables. The Paragraph Objects might be a Paragraph, Headline, Header or Footer.")
|
||||||
public class ResearchDocumentData {
|
public class ResearchDocumentData {
|
||||||
|
|
||||||
|
@Schema(description = "File name of the original uploaded file.")
|
||||||
String originalFile;
|
String originalFile;
|
||||||
|
@Schema(description = "A List of all paragraphs/headline or table objects, that have been parsed in this document.")
|
||||||
List<StructureObject> structureObjects;
|
List<StructureObject> structureObjects;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,14 +2,19 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
|||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
|
@Schema(description = "Object containing information about a Table Row.")
|
||||||
public class RowData {
|
public class RowData {
|
||||||
|
|
||||||
|
@Schema(description = "Boolean indicating whether this table row is classified as a header row.")
|
||||||
boolean header;
|
boolean header;
|
||||||
|
@Schema(description = "A list of Objects containing information about the text in each cell of this row.")
|
||||||
List<ParagraphData> cellText;
|
List<ParagraphData> cellText;
|
||||||
|
@Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.")
|
||||||
float[] bBox;
|
float[] bBox;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
@ -7,13 +10,22 @@ import lombok.Data;
|
|||||||
@Data
|
@Data
|
||||||
@Builder
|
@Builder
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
|
@Schema(description = "Object containing information about either a Paragraph/Headline/Header/Footer or a Table.")
|
||||||
public class StructureObject {
|
public class StructureObject {
|
||||||
|
|
||||||
|
@Schema(description = "The ID of this StructureObject.")
|
||||||
Integer structureObjectNumber;
|
Integer structureObjectNumber;
|
||||||
|
@Schema(description = "The Tree ID of this StructureObject.")
|
||||||
|
List<Integer> treeId;
|
||||||
|
@Schema(description = "This value indicates the start of the string offsets in this Object, with respect to the reading order.")
|
||||||
int page;
|
int page;
|
||||||
|
@Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.")
|
||||||
int stringOffset;
|
int stringOffset;
|
||||||
|
@Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.", example = "[100, 100, 50, 50]")
|
||||||
float[] boundingBox;
|
float[] boundingBox;
|
||||||
|
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer. Either this or table is null.")
|
||||||
ParagraphData paragraph;
|
ParagraphData paragraph;
|
||||||
|
@Schema(description = "Object containing information about a Table. Either this or paragraph is null.")
|
||||||
TableData table;
|
TableData table;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,14 +2,20 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
|||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
|
@Schema(description = "Object containing information about a Table.")
|
||||||
public class TableData {
|
public class TableData {
|
||||||
|
|
||||||
|
@Schema(description = "A list of Objects containing information about all rows in this table.")
|
||||||
List<RowData> rowData;
|
List<RowData> rowData;
|
||||||
|
@Schema(description = "Number of columns in this table.")
|
||||||
Integer numberOfCols;
|
Integer numberOfCols;
|
||||||
|
@Schema(description = "Number of rows in this table.")
|
||||||
Integer numberOfRows;
|
Integer numberOfRows;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,9 +2,26 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
|||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
|
|
||||||
@Builder
|
@Builder
|
||||||
public record LayoutParsingFinishedEvent(Map<String, String> identifier, long duration, int numberOfPages, String message) {
|
@Schema(description = "Object containing information about the layout parsing.")
|
||||||
|
public record LayoutParsingFinishedEvent(
|
||||||
|
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
|
||||||
|
Map<String, String> identifier,
|
||||||
|
|
||||||
|
@Schema(description = "The duration of a single layout parsing in ms.") //
|
||||||
|
long duration,
|
||||||
|
|
||||||
|
@Schema(description = "The number of pages of the parsed document.") //
|
||||||
|
int numberOfPages,
|
||||||
|
|
||||||
|
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
|
||||||
|
String message,
|
||||||
|
|
||||||
|
@Schema(description = "The app version of the layout parser.") //
|
||||||
|
String layoutParserVersion
|
||||||
|
) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
|||||||
|
|
||||||
public class LayoutParsingQueueNames {
|
public class LayoutParsingQueueNames {
|
||||||
|
|
||||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
|
public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request";
|
||||||
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_dead_letter_queue";
|
public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange";
|
||||||
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
|
public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
|
||||||
|
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
|
||||||
|
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,19 +3,45 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
|
import lombok.NonNull;
|
||||||
|
|
||||||
@Builder
|
@Builder
|
||||||
|
@Schema(description = "Object containing all storage paths the service needs to know.")
|
||||||
public record LayoutParsingRequest(
|
public record LayoutParsingRequest(
|
||||||
|
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
|
||||||
|
@NonNull LayoutParsingType layoutParsingType,
|
||||||
|
|
||||||
|
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
|
||||||
Map<String, String> identifier,
|
Map<String, String> identifier,
|
||||||
String originFileStorageId,
|
|
||||||
Optional<String> tablesFileStorageId,
|
@Schema(description = "Path to the original PDF file.")//
|
||||||
Optional<String> imagesFileStorageId,
|
@NonNull String originFileStorageId,//
|
||||||
String structureFileStorageId,
|
|
||||||
String researchDocumentStorageId,
|
@Schema(description = "Optional Path to the table extraction file.")//
|
||||||
String textBlockFileStorageId,
|
Optional<String> tablesFileStorageId,//
|
||||||
String positionBlockFileStorageId,
|
@Schema(description = "Optional Path to the image classification file.")//
|
||||||
String pageFileStorageId) {
|
Optional<String> imagesFileStorageId,//
|
||||||
|
|
||||||
|
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
|
||||||
|
|
||||||
|
@Schema(description = "Path where the Document Structure File will be stored.")//
|
||||||
|
@NonNull String structureFileStorageId,//
|
||||||
|
@Schema(description = "Path where the Research Data File will be stored.")//
|
||||||
|
String researchDocumentStorageId,//
|
||||||
|
@Schema(description = "Path where the Document Text File will be stored.")//
|
||||||
|
@NonNull String textBlockFileStorageId,//
|
||||||
|
@Schema(description = "Path where the Document Positions File will be stored.")//
|
||||||
|
@NonNull String positionBlockFileStorageId,//
|
||||||
|
@Schema(description = "Path where the Document Pages File will be stored.")//
|
||||||
|
@NonNull String pageFileStorageId,//
|
||||||
|
@Schema(description = "Path where the Document Markdown File will be stored.")//
|
||||||
|
Optional<String> documentMarkdownFileStorageId,//
|
||||||
|
@Schema(description = "Path where the Simplified Text File will be stored.")//
|
||||||
|
@NonNull String simplifiedTextStorageId,//
|
||||||
|
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
|
||||||
|
@NonNull String viewerDocumentStorageId
|
||||||
|
) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,12 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||||
|
|
||||||
|
public enum LayoutParsingType {
|
||||||
|
REDACT_MANAGER,
|
||||||
|
REDACT_MANAGER_OLD,
|
||||||
|
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||||
|
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
|
DOCUMINE,
|
||||||
|
DOCUMINE_OLD,
|
||||||
|
CLARIFYND,
|
||||||
|
CLARIFYND_PARAGRAPH_DEBUG
|
||||||
|
}
|
||||||
@ -0,0 +1,16 @@
|
|||||||
|
<Configuration>
|
||||||
|
|
||||||
|
<Appenders>
|
||||||
|
<Console name="CONSOLE" target="SYSTEM_OUT">
|
||||||
|
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
||||||
|
</Console>
|
||||||
|
</Appenders>
|
||||||
|
|
||||||
|
<Loggers>
|
||||||
|
<Root level="warn">
|
||||||
|
<AppenderRef ref="CONSOLE"/>
|
||||||
|
</Root>
|
||||||
|
<Logger name="com.iqser" level="info"/>
|
||||||
|
</Loggers>
|
||||||
|
|
||||||
|
</Configuration>
|
||||||
@ -0,0 +1,40 @@
|
|||||||
|
plugins {
|
||||||
|
id("com.knecon.fforesight.java-conventions")
|
||||||
|
id("io.freefair.lombok") version "8.4"
|
||||||
|
}
|
||||||
|
|
||||||
|
description = "layoutparser-service-processor"
|
||||||
|
|
||||||
|
val jacksonVersion = "2.15.2"
|
||||||
|
val pdfBoxVersion = "3.0.0"
|
||||||
|
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
implementation(project(":layoutparser-service-internal-api"))
|
||||||
|
implementation(project(":viewer-doc-processor"))
|
||||||
|
|
||||||
|
implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
|
||||||
|
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
|
||||||
|
exclude("org.springframework.boot", "spring-boot-starter-security")
|
||||||
|
exclude("org.springframework.boot", "spring-boot-starter-validation")
|
||||||
|
}
|
||||||
|
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
|
||||||
|
exclude("com.iqser.red.commons", "storage-commons")
|
||||||
|
}
|
||||||
|
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||||
|
|
||||||
|
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||||
|
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||||
|
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
|
||||||
|
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||||
|
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
||||||
|
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
||||||
|
implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
|
||||||
|
implementation("com.github.jai-imageio:jai-imageio-core:1.4.0")
|
||||||
|
implementation("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
|
||||||
|
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
||||||
|
implementation("org.commonmark:commonmark:0.22.0")
|
||||||
|
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
|
||||||
|
implementation("com.pdftron:PDFNet:10.11.0")
|
||||||
|
implementation("org.apache.commons:commons-text:1.12.0")
|
||||||
|
}
|
||||||
@ -1,65 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<parent>
|
|
||||||
<groupId>com.knecon.fforesight</groupId>
|
|
||||||
<artifactId>layoutparser-service</artifactId>
|
|
||||||
<version>0.1-SNAPSHOT</version>
|
|
||||||
</parent>
|
|
||||||
|
|
||||||
<artifactId>layoutparser-service-processor</artifactId>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.iqser.red.service</groupId>
|
|
||||||
<artifactId>persistence-service-shared-api-v1</artifactId>
|
|
||||||
<version>2.36.0</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.knecon.fforesight</groupId>
|
|
||||||
<artifactId>tenant-commons</artifactId>
|
|
||||||
<version>${tennat-commons.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.knecon.fforesight</groupId>
|
|
||||||
<artifactId>layoutparser-service-internal-api</artifactId>
|
|
||||||
<version>${project.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.iqser.red.commons</groupId>
|
|
||||||
<artifactId>storage-commons</artifactId>
|
|
||||||
<version>${storage-commons.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.pdfbox</groupId>
|
|
||||||
<artifactId>pdfbox</artifactId>
|
|
||||||
<version>${pdfbox.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.apache.pdfbox</groupId>
|
|
||||||
<artifactId>pdfbox-tools</artifactId>
|
|
||||||
<version>${pdfbox.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.fasterxml.jackson.module</groupId>
|
|
||||||
<artifactId>jackson-module-afterburner</artifactId>
|
|
||||||
<version>${jackson.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.fasterxml.jackson.datatype</groupId>
|
|
||||||
<artifactId>jackson-datatype-jsr310</artifactId>
|
|
||||||
<version>${jackson.version}</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-starter-web</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>org.springframework.boot</groupId>
|
|
||||||
<artifactId>spring-boot-starter-amqp</artifactId>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
</project>
|
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Configuration
|
||||||
|
@ConfigurationProperties("layoutparser")
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class LayoutParserSettings {
|
||||||
|
|
||||||
|
boolean debug;
|
||||||
|
LayoutParsingType layoutParsingTypeOverride;
|
||||||
|
}
|
||||||
@ -0,0 +1,474 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
|
|
||||||
|
import io.micrometer.observation.Observation;
|
||||||
|
import io.micrometer.observation.ObservationRegistry;
|
||||||
|
import io.micrometer.observation.annotation.Observed;
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@SuppressWarnings("PMD.CloseResource")
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class LayoutParsingPipeline {
|
||||||
|
|
||||||
|
final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||||
|
final CvTableParsingAdapter cvTableParsingAdapter;
|
||||||
|
final LayoutParsingStorageService layoutParsingStorageService;
|
||||||
|
final SectionsBuilderService sectionsBuilderService;
|
||||||
|
final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||||
|
final RulingCleaningService rulingCleaningService;
|
||||||
|
final TableExtractionService tableExtractionService;
|
||||||
|
final DocuMineBlockificationService docuMineBlockificationService;
|
||||||
|
final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||||
|
final BlockificationPostprocessingService blockificationPostprocessingService;
|
||||||
|
final DocstrumBlockificationService docstrumBlockificationService;
|
||||||
|
final LayoutGridService layoutGridService;
|
||||||
|
final ObservationRegistry observationRegistry;
|
||||||
|
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||||
|
final GraphicExtractorService graphicExtractorService;
|
||||||
|
final OutlineExtractorService outlineExtractorService;
|
||||||
|
final SectionTreeBuilderService sectionTreeBuilderService;
|
||||||
|
final SectionTreeEnhancementService sectionTreeEnhancementService;
|
||||||
|
final LayoutParserSettings settings;
|
||||||
|
final ClassificationService classificationService;
|
||||||
|
|
||||||
|
@Value("${LAYOUT_PARSER_VERSION:}")
|
||||||
|
private String layoutParserVersion;
|
||||||
|
|
||||||
|
|
||||||
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
|
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
|
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||||
|
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||||
|
.orElse(originFile);
|
||||||
|
|
||||||
|
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||||
|
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
||||||
|
.orElse(new VisualLayoutParsingResponse());
|
||||||
|
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||||
|
.map(layoutParsingStorageService::getImagesFile)
|
||||||
|
.orElse(new ImageServiceResponse());
|
||||||
|
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||||
|
.map(layoutParsingStorageService::getTablesFile)
|
||||||
|
.orElse(new TableServiceResponse());
|
||||||
|
|
||||||
|
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
|
||||||
|
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
||||||
|
|
||||||
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
|
||||||
|
originFile,
|
||||||
|
imageServiceResponse,
|
||||||
|
tableServiceResponse,
|
||||||
|
visualLayoutParsingResponse,
|
||||||
|
layoutParsingRequest.identifier());
|
||||||
|
|
||||||
|
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
|
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);
|
||||||
|
|
||||||
|
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
|
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);
|
||||||
|
|
||||||
|
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
|
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
|
||||||
|
if (layoutParsingRequest.documentMarkdownFileStorageId()
|
||||||
|
.isPresent()) {
|
||||||
|
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
|
||||||
|
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
|
||||||
|
}
|
||||||
|
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
|
||||||
|
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||||
|
|
||||||
|
if (layoutParsingRequest.researchDocumentStorageId() != null) {
|
||||||
|
log.info("Building research document data for {}", layoutParsingRequest.identifier());
|
||||||
|
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
|
||||||
|
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!viewerDocumentFile.equals(originFile)) {
|
||||||
|
assert !viewerDocumentFile.exists() || viewerDocumentFile.delete();
|
||||||
|
}
|
||||||
|
assert !originFile.exists() || originFile.delete();
|
||||||
|
|
||||||
|
return LayoutParsingFinishedEvent.builder()
|
||||||
|
.identifier(layoutParsingRequest.identifier())
|
||||||
|
.numberOfPages(documentWithVisualization.document().getNumberOfPages())
|
||||||
|
.duration(System.currentTimeMillis() - start)
|
||||||
|
.message(format("""
|
||||||
|
Layout parsing has finished in %.02f s.
|
||||||
|
identifiers: %s
|
||||||
|
%s
|
||||||
|
Files have been saved with Ids:
|
||||||
|
Structure: %s
|
||||||
|
Text: %s
|
||||||
|
Positions: %s
|
||||||
|
PageData: %s
|
||||||
|
Simplified Text: %s
|
||||||
|
Viewer Doc: %s""",
|
||||||
|
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||||
|
layoutParsingRequest.identifier(),
|
||||||
|
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
|
||||||
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
|
layoutParsingRequest.simplifiedTextStorageId(),
|
||||||
|
layoutParsingRequest.viewerDocumentStorageId()))
|
||||||
|
.layoutParserVersion(layoutParserVersion)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
|
||||||
|
|
||||||
|
AtomicReference<DocumentWithVisualization> documentReference = new AtomicReference<>();
|
||||||
|
|
||||||
|
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
|
||||||
|
.contextualName("build-document-graph")
|
||||||
|
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
|
||||||
|
|
||||||
|
return documentReference.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
|
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
|
numberOfPages,
|
||||||
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
|
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
|
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||||
|
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
|
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||||
|
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Parses the layout of every page of the origin PDF into a
 * {@link ClassificationDocument}: extracts text, rulings, cells, graphics and
 * images per page, blockifies according to the parsing type, then classifies
 * the whole document and builds/enhances its section tree.
 *
 * @param layoutParsingType           which blockification/section strategy to use
 * @param originFile                  the PDF to parse (re-opened periodically, see loop)
 * @param imageServiceResponse        pre-computed image-service results (may be empty)
 * @param tableServiceResponse        pre-computed CV table results (may be empty)
 * @param visualLayoutParsingResponse pre-computed visual parsing results (may be empty)
 * @param identifier                  request identifiers, used for logging and the "debug" flag
 * @return the fully classified document, pages in reading order
 */
@SneakyThrows
@Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout")
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
                                          File originFile,
                                          ImageServiceResponse imageServiceResponse,
                                          TableServiceResponse tableServiceResponse,
                                          VisualLayoutParsingResponse visualLayoutParsingResponse,
                                          Map<String, String> identifier) {

    PDDocument originDocument = openDocument(originFile);
    addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));

    // Per-page lookups built once from the external service responses (keys are 1-based page numbers).
    Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
    Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
    Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);

    ClassificationDocument classificationDocument = new ClassificationDocument();

    // Debug visualizations are enabled by service settings or a "debug" key in the request identifiers.
    if (settings.isDebug() || identifier.containsKey("debug")) {
        classificationDocument.getLayoutDebugLayer().setActive(true);
    }

    List<ClassificationPage> classificationPages = new ArrayList<>();

    classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));

    long pageCount = originDocument.getNumberOfPages();

    // Page numbers are 1-based throughout this loop; PDFBox's getPage() is 0-based.
    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {

        if (pageNumber % 100 == 0) {
            // re-open document every once in a while to save on RAM. This has no significant performance impact.
            // This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
            originDocument.close();
            originDocument = openDocument(originFile);
        }

        // Log only on the first, last, and every 100th page to keep the log volume bounded.
        if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
            log.info("Extracting text on Page {} for {}", pageNumber, identifier);
        }

        classificationDocument.setPages(classificationPages);
        // A fresh stripper per page: it accumulates words/rulings/marked content internally.
        PDFLinesTextStripper stripper = new PDFLinesTextStripper();
        PDPage pdPage = originDocument.getPage(pageNumber - 1);
        stripper.setPageNumber(pageNumber);
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);
        stripper.setPdpage(pdPage);
        stripper.getText(originDocument);
        List<Word> words = stripper.getWords();

        // rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now

        // The legacy DocuMine path re-sorts words along detected lines before blockification.
        if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
            var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
            classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
            words = TextPositionOperations.sortWords(lines);
        }
        classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);

        PDRectangle pdr = pdPage.getMediaBox();

        List<Ruling> rulings = stripper.getRulings();
        classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
        CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);

        PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
        // Cells found from rulings alone; content is assigned later by extractTables().
        List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
        classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);

        TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);

        List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);

        // Path-element graphics are folded into the image map as GRAPHIC-type pseudo-images.
        pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
                .addAll(graphics.stream()
                        .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
                                ImageType.GRAPHIC,
                                false,
                                stripper.getPageNumber(),
                                ""))
                        .toList());

        // Blockification strategy depends on the parsing type (exhaustive over the enum).
        ClassificationPage classificationPage = switch (layoutParsingType) {
            case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
            case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
            case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
                    docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
            case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
                    docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
        };

        updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);

        blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);

        classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
        // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
        classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));

        // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
        if (pdfImages.containsKey(pageNumber)) {
            classificationPage.setImages(pdfImages.get(pageNumber));
            imageServiceResponseAdapter.findOcr(classificationPage);
        }

        // Signatures from visual parsing are merged into (or become) the page's image list.
        if (signatures.containsKey(pageNumber)) {
            if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
                classificationPage.setImages(signatures.get(pageNumber));
            } else {
                classificationPage.getImages().addAll(signatures.get(pageNumber));
            }
        }

        tableExtractionService.extractTables(emptyTableCells, classificationPage);

        buildPageStatistics(classificationPage);
        increaseDocumentStatistics(classificationPage, classificationDocument);

        classificationPages.add(classificationPage);
    }

    originDocument.close();

    classificationService.classify(classificationDocument, layoutParsingType, identifier);

    SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
    classificationDocument.setSectionTree(sectionTree);

    log.info("Building Sections for {}", identifier);

    // Debug types get flat paragraph sections; all others get the enhanced section tree.
    switch (layoutParsingType) {
        case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
        default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
    }

    return classificationDocument;
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void updateClassificationPage(PDPage pdPage,
|
||||||
|
PDRectangle pdr,
|
||||||
|
ClassificationPage classificationPage,
|
||||||
|
CleanRulings cleanRulings,
|
||||||
|
int pageNumber,
|
||||||
|
PageInformation pageInformation) {
|
||||||
|
|
||||||
|
int rotation = pdPage.getRotation();
|
||||||
|
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||||
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
|
classificationPage.setRotation(rotation);
|
||||||
|
classificationPage.setLandscape(isLandscape);
|
||||||
|
classificationPage.setPageNumber(pageNumber);
|
||||||
|
classificationPage.setPageWidth((float) pageInformation.width());
|
||||||
|
classificationPage.setPageHeight((float) pageInformation.height());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
|
||||||
|
|
||||||
|
for (TextDirection dir : TextDirection.values()) {
|
||||||
|
double averageRotation = words.stream()
|
||||||
|
.map(Word::getCharacters)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.map(Character::getTextPosition)
|
||||||
|
.filter(pos -> pos.getDir().equals(dir))
|
||||||
|
.mapToDouble(RedTextPosition::getExactDir).average()
|
||||||
|
.orElse(0);
|
||||||
|
|
||||||
|
if (averageRotation == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
AffineTransform rotateInstance = AffineTransform.getRotateInstance(averageRotation, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);
|
||||||
|
|
||||||
|
for (Word word : words) {
|
||||||
|
if (!dir.equals(word.getDir())) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
word.transform(rotateInstance);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
|
||||||
|
|
||||||
|
if (observationRegistry.getCurrentObservation() != null) {
|
||||||
|
observationRegistry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||||
|
observationRegistry.getCurrentObservation().highCardinalityKeyValue("fileSize", String.valueOf(size));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private PDDocument openDocument(File originFile) {
|
||||||
|
|
||||||
|
PDDocument document = Loader.loadPDF(originFile);
|
||||||
|
document.setAllSecurityToBeRemoved(true);
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||||
|
|
||||||
|
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
||||||
|
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||||
|
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||||
|
return markedContentBboxes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
|
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||||
|
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void buildPageStatistics(ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
||||||
|
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
||||||
|
if (textBlock instanceof TextPageBlock) {
|
||||||
|
if (((TextPageBlock) textBlock).getWords() == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (Word word : ((TextPageBlock) textBlock).getWords()) {
|
||||||
|
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
||||||
|
classificationPage.getFontCounter().add(word.getFont());
|
||||||
|
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
||||||
|
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,114 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor;
|
|
||||||
|
|
||||||
import static java.lang.String.format;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class LayoutParsingService {
|
|
||||||
|
|
||||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
|
||||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
|
||||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
|
||||||
private final PdfParsingService pdfParsingService;
|
|
||||||
private final ClassificationService classificationService;
|
|
||||||
private final SectionsBuilderService sectionsBuilderService;
|
|
||||||
|
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
|
||||||
|
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
|
||||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.pageFileStorageId());
|
|
||||||
}
|
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
|
|
||||||
}
|
|
||||||
|
|
||||||
Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
|
|
||||||
int numberOfPages = originDocument.getNumberOfPages();
|
|
||||||
originDocument.close();
|
|
||||||
|
|
||||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
|
||||||
|
|
||||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, DocumentDataMapper.toDocumentData(documentGraph));
|
|
||||||
|
|
||||||
return LayoutParsingFinishedEvent.builder()
|
|
||||||
.identifier(layoutParsingRequest.identifier())
|
|
||||||
.numberOfPages(numberOfPages)
|
|
||||||
.duration(System.currentTimeMillis() - start)
|
|
||||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
|
||||||
layoutParsingRequest.pageFileStorageId()))
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
|
||||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
|
||||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
|
||||||
|
|
||||||
classificationService.classifyDocument(classificationDocument);
|
|
||||||
|
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
|
||||||
|
|
||||||
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Document parseLayoutWithTimer(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
|
||||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
|
||||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
|
||||||
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
|
|
||||||
|
|
||||||
start = System.currentTimeMillis();
|
|
||||||
classificationService.classifyDocument(classificationDocument);
|
|
||||||
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
|
|
||||||
|
|
||||||
start = System.currentTimeMillis();
|
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
|
||||||
System.out.printf(", sections built %d ms", System.currentTimeMillis() - start);
|
|
||||||
|
|
||||||
start = System.currentTimeMillis();
|
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
|
||||||
System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start);
|
|
||||||
return document;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,10 +1,23 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor;
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
import org.springframework.context.annotation.ComponentScan;
|
import org.springframework.context.annotation.ComponentScan;
|
||||||
import org.springframework.context.annotation.Configuration;
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||||
|
|
||||||
|
import io.micrometer.observation.ObservationRegistry;
|
||||||
|
|
||||||
@Configuration
|
@Configuration
|
||||||
@ComponentScan
|
@ComponentScan
|
||||||
public class LayoutParsingServiceProcessorConfiguration {
|
public class LayoutParsingServiceProcessorConfiguration {
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
@Autowired
|
||||||
|
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
||||||
|
|
||||||
|
return new PDFTronViewerDocumentService(registry);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,31 +1,36 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor;
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.concurrent.CompletableFuture;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.springframework.core.task.TaskExecutor;
|
||||||
import org.apache.pdfbox.Loader;
|
|
||||||
import org.apache.pdfbox.io.MemoryUsageSetting;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
|
|
||||||
|
import io.micrometer.observation.annotation.Observed;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@ -36,68 +41,104 @@ public class LayoutParsingStorageService {
|
|||||||
private final StorageService storageService;
|
private final StorageService storageService;
|
||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
|
|
||||||
|
private final TaskExecutor taskExecutor;
|
||||||
|
|
||||||
public PDDocument getOriginFile(String storageId) throws IOException {
|
|
||||||
|
|
||||||
try (var originDocumentInputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
|
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||||
File tempFile = createTempFile("document", ".pdf");
|
public File getOriginFile(String storageId) throws IOException {
|
||||||
try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
|
|
||||||
IOUtils.copy(originDocumentInputStream, tempFileOutputStream);
|
File tempFile = createTempFile("document", ".pdf");
|
||||||
}
|
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||||
return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L));
|
return tempFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file")
|
||||||
|
public Optional<File> getViewerDocFile(String storageId) throws IOException {
|
||||||
|
|
||||||
|
if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
File tempFile = createTempFile("viewerDocument", ".pdf");
|
||||||
|
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||||
|
|
||||||
|
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
|
||||||
|
assert tempFile.delete();
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.of(tempFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public ImageServiceResponse getImagesFile(String storageId) {
|
||||||
|
|
||||||
|
try (InputStream inputStream = getObject(storageId)) {
|
||||||
|
|
||||||
|
ImageServiceResponse imageServiceResponse = objectMapper.readValue(inputStream, ImageServiceResponse.class);
|
||||||
|
inputStream.close();
|
||||||
|
return imageServiceResponse;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
|
@SneakyThrows
|
||||||
|
public TableServiceResponse getTablesFile(String storageId) {
|
||||||
|
|
||||||
try (InputStream inputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
|
try (var tableClassificationStream = getObject(storageId)) {
|
||||||
|
|
||||||
return objectMapper.readValue(inputStream, ImageServiceResponse.class);
|
TableServiceResponse tableServiceResponse = objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
|
||||||
|
tableClassificationStream.close();
|
||||||
|
return tableServiceResponse;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
@SneakyThrows
|
||||||
|
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
|
||||||
try (var tableClassificationStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
|
|
||||||
|
|
||||||
return objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
|
|
||||||
|
|
||||||
|
try (InputStream inputStream = getObject(storageId)) {
|
||||||
|
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData, DocumentData documentData) {
|
@SneakyThrows
|
||||||
|
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
|
||||||
|
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||||
|
|
||||||
|
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||||
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
|
documentData.getDocumentStructure());
|
||||||
|
|
||||||
|
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
|
||||||
|
|
||||||
|
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||||
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
|
documentData.getDocumentTextData());
|
||||||
|
|
||||||
|
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
|
||||||
|
|
||||||
|
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||||
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
|
documentData.getDocumentPositionData());
|
||||||
|
|
||||||
|
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
|
||||||
|
|
||||||
|
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
|
||||||
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
|
documentData.getDocumentPages());
|
||||||
|
|
||||||
|
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
|
||||||
|
|
||||||
|
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||||
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData());
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
|
||||||
|
|
||||||
PageData[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), PageData[].class);
|
|
||||||
AtomicTextBlockData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
|
||||||
AtomicTextBlockData[].class);
|
|
||||||
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
|
||||||
AtomicPositionBlockData[].class);
|
|
||||||
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
|
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
|
||||||
DocumentTreeData.class);
|
|
||||||
|
|
||||||
return DocumentData.builder()
|
|
||||||
.documentTreeData(tableOfContentsData)
|
|
||||||
.atomicPositionBlocks(atomicPositionBlockData)
|
|
||||||
.atomicTextBlocks(atomicTextBlockData)
|
|
||||||
.pages(pageData)
|
|
||||||
.build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -125,4 +166,43 @@ public class LayoutParsingStorageService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text")
|
||||||
|
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {
|
||||||
|
|
||||||
|
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private InputStream getObject(String storageId) {
|
||||||
|
|
||||||
|
File tempFile = File.createTempFile("temp", ".data");
|
||||||
|
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||||
|
Path path = Paths.get(tempFile.getPath());
|
||||||
|
return Files.newInputStream(path, StandardOpenOption.DELETE_ON_CLOSE);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document")
|
||||||
|
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {
|
||||||
|
|
||||||
|
try (var in = new FileInputStream(out)) {
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
|
||||||
|
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {
|
||||||
|
|
||||||
|
try (InputStream inputStream = new ByteArrayInputStream(markdownContent.getBytes(StandardCharsets.UTF_8))) {
|
||||||
|
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, inputStream);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,49 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class CvTableParsingAdapter {
|
|
||||||
|
|
||||||
public Map<Integer, List<TableCells>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {
|
|
||||||
|
|
||||||
Map<Integer, List<TableCells>> tableCells = new HashMap<>();
|
|
||||||
tableServiceResponse.getData()
|
|
||||||
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>())
|
|
||||||
.addAll(convertTableCells(tableData.getTableCells())));
|
|
||||||
|
|
||||||
return tableCells;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Collection<TableCells> convertTableCells(List<TableCells> tableCells) {
|
|
||||||
|
|
||||||
List<TableCells> cvParsedTableCells = new ArrayList<>();
|
|
||||||
|
|
||||||
tableCells.forEach(t -> cvParsedTableCells.add(TableCells.builder()
|
|
||||||
.y0(t.getY0())
|
|
||||||
.x1(t.getX1())
|
|
||||||
.y1(t.getY1())
|
|
||||||
.x0(t.getX0())
|
|
||||||
.width(t.getWidth())
|
|
||||||
.height(t.getHeight())
|
|
||||||
.build()));
|
|
||||||
|
|
||||||
return cvParsedTableCells;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class Classification {
|
|
||||||
|
|
||||||
private Map<String, Float> probabilities = new HashMap<>();
|
|
||||||
private String label;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class FilterGeometry {
|
|
||||||
|
|
||||||
private ImageSize imageSize;
|
|
||||||
private ImageFormat imageFormat;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class Filters {
|
|
||||||
|
|
||||||
private FilterGeometry geometry;
|
|
||||||
private Probability probability;
|
|
||||||
private boolean allPassed;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class Geometry {
|
|
||||||
|
|
||||||
private float width;
|
|
||||||
private float height;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class ImageFormat {
|
|
||||||
|
|
||||||
private float quotient;
|
|
||||||
private boolean tooTall;
|
|
||||||
private boolean tooWide;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class ImageMetadata {
|
|
||||||
|
|
||||||
private Classification classification;
|
|
||||||
private Position position;
|
|
||||||
private Geometry geometry;
|
|
||||||
private Filters filters;
|
|
||||||
private boolean alpha;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,12 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class ImageSize {
|
|
||||||
|
|
||||||
private float quotient;
|
|
||||||
private boolean tooLarge;
|
|
||||||
private boolean tooSmall;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class Position {
|
|
||||||
|
|
||||||
private float x1;
|
|
||||||
private float x2;
|
|
||||||
private float y1;
|
|
||||||
private float y2;
|
|
||||||
private int pageNumber;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,10 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class Probability {
|
|
||||||
|
|
||||||
private boolean unconfident;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class PageInfo {
|
|
||||||
|
|
||||||
private int number;
|
|
||||||
private int rotation;
|
|
||||||
private float width;
|
|
||||||
private float height;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
public class TableData {
|
|
||||||
|
|
||||||
private PageInfo pageInfo;
|
|
||||||
private List<TableCells> tableCells = new ArrayList<>();
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,80 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
@NoArgsConstructor
|
|
||||||
public abstract class AbstractPageBlock {
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
protected float minX;
|
|
||||||
@JsonIgnore
|
|
||||||
protected float maxX;
|
|
||||||
@JsonIgnore
|
|
||||||
protected float minY;
|
|
||||||
@JsonIgnore
|
|
||||||
protected float maxY;
|
|
||||||
@JsonIgnore
|
|
||||||
protected PageBlockType classification;
|
|
||||||
@JsonIgnore
|
|
||||||
protected int page;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private Orientation orientation = Orientation.NONE;
|
|
||||||
|
|
||||||
|
|
||||||
public abstract String getText();
|
|
||||||
|
|
||||||
|
|
||||||
public boolean isHeadline() {
|
|
||||||
|
|
||||||
return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean containsBlock(TextPageBlock other) {
|
|
||||||
|
|
||||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(AbstractPageBlock other) {
|
|
||||||
|
|
||||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(Rectangle other) {
|
|
||||||
|
|
||||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
|
|
||||||
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public float getHeight() {
|
|
||||||
|
|
||||||
return maxY - minY;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public float getWidth() {
|
|
||||||
|
|
||||||
return maxX - minX;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean intersectsY(AbstractPageBlock atc) {
|
|
||||||
|
|
||||||
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,77 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
|
|
||||||
@Getter
|
|
||||||
public class FloatFrequencyCounter {
|
|
||||||
|
|
||||||
Map<Float, Integer> countPerValue = new HashMap<>();
|
|
||||||
|
|
||||||
|
|
||||||
public void add(float value) {
|
|
||||||
|
|
||||||
if (!countPerValue.containsKey(value)) {
|
|
||||||
countPerValue.put(value, 1);
|
|
||||||
} else {
|
|
||||||
countPerValue.put(value, countPerValue.get(value) + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void addAll(Map<Float, Integer> otherCounter) {
|
|
||||||
|
|
||||||
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
|
|
||||||
if (countPerValue.containsKey(entry.getKey())) {
|
|
||||||
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
|
|
||||||
} else {
|
|
||||||
countPerValue.put(entry.getKey(), entry.getValue());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Float getMostPopular() {
|
|
||||||
|
|
||||||
Map.Entry<Float, Integer> mostPopular = null;
|
|
||||||
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
|
|
||||||
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
|
|
||||||
mostPopular = entry;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return mostPopular != null ? mostPopular.getKey() : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public List<Float> getHighterThanMostPopular() {
|
|
||||||
|
|
||||||
Float mostPopular = getMostPopular();
|
|
||||||
List<Float> higher = new ArrayList<>();
|
|
||||||
for (Float value : countPerValue.keySet()) {
|
|
||||||
if (value > mostPopular) {
|
|
||||||
higher.add(value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Float getHighest() {
|
|
||||||
|
|
||||||
Float highest = null;
|
|
||||||
for (Float value : countPerValue.keySet()) {
|
|
||||||
if (highest == null || value > highest) {
|
|
||||||
highest = value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return highest;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
|
||||||
|
|
||||||
public enum Orientation {
|
|
||||||
|
|
||||||
NONE,
|
|
||||||
LEFT,
|
|
||||||
RIGHT
|
|
||||||
}
|
|
||||||
@ -1,25 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.image;
|
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
|
|
||||||
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NonNull;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class ClassifiedImage {
|
|
||||||
|
|
||||||
@NonNull
|
|
||||||
private Rectangle2D position;
|
|
||||||
@NonNull
|
|
||||||
private ImageType imageType;
|
|
||||||
private boolean isAppendedToSection;
|
|
||||||
@NonNull
|
|
||||||
private boolean hasTransparency;
|
|
||||||
@NonNull
|
|
||||||
private int page;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,15 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
public class CleanRulings {
|
|
||||||
|
|
||||||
List<Ruling> horizontal;
|
|
||||||
List<Ruling> vertical;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,218 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@SuppressWarnings("all")
|
|
||||||
public class Rectangle extends Rectangle2D.Float {
|
|
||||||
|
|
||||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
|
||||||
/**
|
|
||||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
|
||||||
* <p>
|
|
||||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
|
||||||
*
|
|
||||||
* @deprecated with no replacement
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
|
||||||
@Override
|
|
||||||
public int compare(Rectangle o1, Rectangle o2) {
|
|
||||||
|
|
||||||
if (o1.equals(o2)) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
|
||||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
|
|
||||||
} else {
|
|
||||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
public Rectangle() {
|
|
||||||
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Rectangle(float top, float left, float width, float height) {
|
|
||||||
|
|
||||||
super();
|
|
||||||
this.setRect(left, top, width, height);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param rectangles
|
|
||||||
* @return minimum bounding box that contains all the rectangles
|
|
||||||
*/
|
|
||||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
|
||||||
|
|
||||||
float minx = java.lang.Float.MAX_VALUE;
|
|
||||||
float miny = java.lang.Float.MAX_VALUE;
|
|
||||||
float maxx = java.lang.Float.MIN_VALUE;
|
|
||||||
float maxy = java.lang.Float.MIN_VALUE;
|
|
||||||
|
|
||||||
for (Rectangle r : rectangles) {
|
|
||||||
minx = (float) Math.min(r.getMinX(), minx);
|
|
||||||
miny = (float) Math.min(r.getMinY(), miny);
|
|
||||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
|
||||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
|
||||||
}
|
|
||||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Delegates to the deprecated ILL_DEFINED_ORDER comparator; retained only
// for backwards compatibility from when Rectangle implemented Comparable.
public int compareTo(Rectangle other) {
    return ILL_DEFINED_ORDER.compare(this, other);
}
|
|
||||||
|
|
||||||
|
|
||||||
// I'm bad at Java and need this for fancy sorting in
// technology.tabula.TextChunk.
public int isLtrDominant() {
    // 0 = no dominant text direction here. ILL_DEFINED_ORDER treats -1 as
    // RTL-dominant — presumably overriders return -1/1; TODO confirm.
    return 0;
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getArea() {
|
|
||||||
|
|
||||||
return this.width * this.height;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float verticalOverlap(Rectangle other) {
|
|
||||||
|
|
||||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean verticallyOverlaps(Rectangle other) {
|
|
||||||
|
|
||||||
return verticalOverlap(other) > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float horizontalOverlap(Rectangle other) {
|
|
||||||
|
|
||||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean horizontallyOverlaps(Rectangle other) {
|
|
||||||
|
|
||||||
return horizontalOverlap(other) > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Vertical overlap of the two rectangles as a fraction of the shorter
 * rectangle's height.
 * NOTE(review): delta is 0 for zero-height rectangles, which makes the
 * division produce NaN/Infinity — confirm callers never pass degenerate
 * rectangles.
 */
public float verticalOverlapRatio(Rectangle other) {
    float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

    // Case 1: other starts above and its bottom falls inside this.
    if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
        rv = (other.getBottom() - this.getTop()) / delta;
    // Case 2: this starts above and its bottom falls inside other.
    } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
        rv = (this.getBottom() - other.getTop()) / delta;
    // Case 3: other is vertically contained within this.
    } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
        rv = (other.getBottom() - other.getTop()) / delta;
    // Case 4: this is vertically contained within other.
    } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
        rv = (this.getBottom() - this.getTop()) / delta;
    }
    // No case matched: no vertical overlap, ratio stays 0.
    return rv;

}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Jaccard-style overlap: intersection area divided by union area.
 * NOTE(review): when both rectangles have zero area the union is 0 and the
 * result is NaN — confirm callers tolerate that.
 */
public float overlapRatio(Rectangle other) {
    double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
    double unionArea = this.getArea() + other.getArea() - intersectionArea;

    return (float) (intersectionArea / unionArea);
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Grows THIS rectangle in place to the union of the two, then returns
 * {@code this} for chaining. Mutating, despite the value-style name.
 */
public Rectangle merge(Rectangle other) {
    this.setRect(this.createUnion(other));
    return this;
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getTop() {
|
|
||||||
|
|
||||||
return (float) this.getMinY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Moves the top edge while keeping the bottom edge fixed: the height shrinks
 * or grows by exactly the distance the edge moved.
 */
public void setTop(float top) {
    float deltaHeight = top - this.y;
    this.setRect(this.x, top, this.width, this.height - deltaHeight);
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getRight() {
|
|
||||||
|
|
||||||
return (float) this.getMaxX();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setRight(float right) {
|
|
||||||
|
|
||||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getLeft() {
|
|
||||||
|
|
||||||
return (float) this.getMinX();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Moves the left edge while keeping the right edge fixed: the width shrinks
 * or grows by exactly the distance the edge moved.
 */
public void setLeft(float left) {
    float deltaWidth = left - this.x;
    this.setRect(left, this.y, this.width - deltaWidth, this.height);
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getBottom() {
|
|
||||||
|
|
||||||
return (float) this.getMaxY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setBottom(float bottom) {
|
|
||||||
|
|
||||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Point2D[] getPoints() {
|
|
||||||
|
|
||||||
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
|
|
||||||
this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Reuses the superclass representation, replacing its trailing ']' with the
// derived bottom/right edges for easier debugging.
@Override
public String toString() {
    StringBuilder sb = new StringBuilder();
    String s = super.toString();
    sb.append(s.substring(0, s.length() - 1));
    sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
    return sb.toString();
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,437 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
|
||||||
|
|
||||||
import java.awt.geom.Line2D;
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.Formatter;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.TreeMap;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
@SuppressWarnings("all")
// A line segment ("ruling") on a PDF page, used for table detection.
// Most operations assume the ruling is exactly horizontal or vertical;
// oblique rulings throw UnsupportedOperationException from the axis-based
// accessors below.
public class Ruling extends Line2D.Float {

    // How far (in pixels) rulings are virtually extended when testing for
    // intersections with perpendicular rulings.
    private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;

    public Ruling(Point2D p1, Point2D p2) {
        super(p1, p2);
    }

    /**
     * Clips every ruling that touches {@code area} to that area.
     *
     * @param rulings candidate rulings
     * @param area    clip rectangle
     * @return the rulings intersecting the area, each cropped to it
     */
    public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
        ArrayList<Ruling> rv = new ArrayList<>();
        for (Ruling r : rulings) {
            if (r.intersects(area)) {
                rv.add(r.intersect(area));
            }
        }
        return rv;
    }

    // log(n) implementation of find_intersections
    // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
    /**
     * Sweep-line search for intersection points between horizontal and
     * vertical rulings.
     *
     * @param horizontals horizontal rulings
     * @param verticals   vertical rulings
     * @return map from intersection point to the (expanded) pair
     *         {horizontal, vertical} that crosses there, ordered
     *         top-to-bottom then left-to-right
     */
    public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {

        // Sweep event: a horizontal entering (HLEFT) / leaving (HRIGHT) the
        // sweep, or a vertical ruling, at a given x position.
        class SortObject {

            protected SOType type;
            protected float position;
            protected Ruling ruling;

            public SortObject(SOType type, float position, Ruling ruling) {
                this.type = type;
                this.position = position;
                this.ruling = ruling;
            }

        }

        List<SortObject> sos = new ArrayList<>();

        // Currently-active horizontals, ordered by their y coordinate.
        TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
            @Override
            public int compare(Ruling o1, Ruling o2) {
                return java.lang.Double.compare(o1.getTop(), o2.getTop());
            }
        });

        // Result map ordered by y first, then x.
        TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
            @Override
            public int compare(Point2D o1, Point2D o2) {
                if (o1.getY() > o2.getY()) {
                    return 1;
                }
                if (o1.getY() < o2.getY()) {
                    return -1;
                }
                if (o1.getX() > o2.getX()) {
                    return 1;
                }
                if (o1.getX() < o2.getX()) {
                    return -1;
                }
                return 0;
            }
        });

        // Each horizontal contributes enter/leave events, widened slightly so
        // near-misses still register as intersections.
        for (Ruling h : horizontals) {
            sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
            sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
        }

        for (Ruling v : verticals) {
            sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
        }

        // Sort events left-to-right; at (approximately) equal positions a
        // VERTICAL event sorts after HLEFT and before HRIGHT so touching
        // rulings still count as intersecting.
        Collections.sort(sos, new Comparator<SortObject>() {
            @Override
            public int compare(SortObject a, SortObject b) {
                int rv;
                if (DoubleComparisons.feq(a.position, b.position)) {
                    if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
                        rv = 1;
                    } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
                        rv = -1;
                    } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
                        rv = -1;
                    } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
                        rv = 1;
                    } else {
                        rv = java.lang.Double.compare(a.position, b.position);
                    }
                } else {
                    return java.lang.Double.compare(a.position, b.position);
                }
                return rv;
            }
        });

        // Sweep: a VERTICAL event is tested against every active horizontal.
        for (SortObject so : sos) {
            switch (so.type) {
                case VERTICAL:
                    for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
                        try {
                            Point2D i = h.getKey().intersectionPoint(so.ruling);
                            if (i == null) {
                                continue;
                            }
                            rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
                        } catch (UnsupportedOperationException e) {
                            // Oblique rulings cannot intersect cleanly; skip them.
                            log.info("Some line are oblique, ignoring...");
                            continue;
                        }
                    }
                    break;
                case HRIGHT:
                    tree.remove(so.ruling);
                    break;
                case HLEFT:
                    tree.put(so.ruling, true);
                    break;
            }
        }

        return rv;

    }

    // True for a non-degenerate, exactly vertical segment (x1 ≈ x2).
    public boolean vertical() {
        return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }

    // True for a non-degenerate, exactly horizontal segment (y1 ≈ y2).
    public boolean horizontal() {
        return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }

    // attributes that make sense only for non-oblique lines
    // these are used to have a single collapse method (in page, currently)

    // Neither vertical nor horizontal; includes zero-length segments.
    public boolean oblique() {
        return !(this.vertical() || this.horizontal());
    }

    // The fixed coordinate: x for a vertical ruling, y for a horizontal one.
    public float getPosition() {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getLeft() : this.getTop();
    }

    // The smaller coordinate along the ruling's own axis.
    public float getStart() {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getTop() : this.getLeft();
    }

    // Sets the smaller coordinate along the ruling's own axis.
    public void setStart(float v) {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(v);
        } else {
            this.setLeft(v);
        }
    }

    // The larger coordinate along the ruling's own axis.
    public float getEnd() {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getBottom() : this.getRight();
    }

    // Sets the larger coordinate along the ruling's own axis.
    public void setEnd(float v) {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setBottom(v);
        } else {
            this.setRight(v);
        }
    }

    // Sets both axis-aligned extremes in one call.
    public void setStartEnd(float start, float end) {
        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(start);
            this.setBottom(end);
        } else {
            this.setLeft(start);
            this.setRight(end);
        }
    }

    // NOTE: also returns true when BOTH rulings are oblique
    // (vertical() == horizontal() == false on each side).
    public boolean perpendicularTo(Ruling other) {
        return this.vertical() == other.horizontal();
    }

    /**
     * True if the rulings intersect exactly, or would after expansion:
     * perpendicular pairs use the fixed pixel expansion, parallel/colinear
     * pairs the caller-supplied amount.
     */
    public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
        if (this.intersectsLine(another)) {
            return true;
        }

        boolean rv = false;

        if (this.perpendicularTo(another)) {
            rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
        } else {
            rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
        }

        return rv;
    }

    // Euclidean length of the segment.
    public double length() {
        return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2));
    }

    /**
     * Clips this ruling to the given rectangle (Cohen–Sutherland).
     * Returns a new clipped Ruling, or {@code this} unchanged when no
     * clipping occurred.
     */
    public Ruling intersect(Rectangle2D clip) {
        Float clipee = (Float) this.clone();
        boolean clipped = new CohenSutherlandClipping(clip).clip(clipee);

        if (clipped) {
            return new Ruling(clipee.getP1(), clipee.getP2());
        } else {
            return this;
        }
    }

    /**
     * Returns a copy lengthened by {@code amount} at both ends along its own
     * axis. An oblique ruling is returned as an unchanged copy (expansion is
     * undefined for it).
     */
    public Ruling expand(float amount) {
        Ruling r = (Ruling) this.clone();
        try {
            r.setStart(this.getStart() - amount);
            r.setEnd(this.getEnd() + amount);
        } catch (UnsupportedOperationException e) {
            log.warn("Could not expand ruling!");
        }
        return r;
    }

    /**
     * Intersection point of this ruling with a perpendicular one, computed on
     * copies expanded by PERPENDICULAR_PIXEL_EXPAND_AMOUNT.
     *
     * @return the intersection point, or {@code null} when the expanded
     *         rulings do not cross or are not a horizontal/vertical pair
     */
    public Point2D intersectionPoint(Ruling other) {
        Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling horizontal, vertical;

        if (!this_l.intersectsLine(other_l)) {
            return null;
        }

        if (this_l.horizontal() && other_l.vertical()) {
            horizontal = this_l;
            vertical = other_l;
        } else if (this_l.vertical() && other_l.horizontal()) {
            vertical = this_l;
            horizontal = other_l;
        } else {
            log.warn("lines must be orthogonal, vertical and horizontal");
            return null;
        }
        // The crossing point is the vertical's x at the horizontal's y.
        return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
    }

    // Value equality on the two end points (unlike the inherited identity
    // comparison).
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }

        if (!(other instanceof Ruling)) {
            return false;
        }

        Ruling o = (Ruling) other;
        return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
    }

    // NOTE(review): equals() above is value-based but this hashCode() keeps
    // the inherited identity hash, breaking the equals/hashCode contract for
    // hashed collections — confirm Rulings are never used as HashMap/HashSet
    // keys (the sweep in findIntersections uses a TreeMap, which is safe).
    @Override
    public int hashCode() {
        return super.hashCode();
    }

    // y of the first end point (the "top" for vertical rulings).
    public float getTop() {
        return this.y1;
    }

    public void setTop(float v) {
        setLine(this.getLeft(), v, this.getRight(), this.getBottom());
    }

    // x of the first end point (the "left" for horizontal rulings).
    public float getLeft() {
        return this.x1;
    }

    public void setLeft(float v) {
        setLine(v, this.getTop(), this.getRight(), this.getBottom());
    }

    // y of the second end point.
    public float getBottom() {
        return this.y2;
    }

    public void setBottom(float v) {
        setLine(this.getLeft(), this.getTop(), this.getRight(), v);
    }

    // x of the second end point.
    public float getRight() {
        return this.x2;
    }

    public void setRight(float v) {
        setLine(this.getLeft(), this.getTop(), v, this.getBottom());
    }

    // Horizontal extent (x2 - x1); can be negative if the points are swapped.
    public float getWidth() {
        return this.getRight() - this.getLeft();
    }

    // Vertical extent (y2 - y1); can be negative if the points are swapped.
    public float getHeight() {
        return this.getBottom() - this.getTop();
    }

    // Angle of the P1→P2 direction in degrees, normalized to [0, 360).
    public double getAngle() {
        double angle = Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX()));

        if (angle < 0) {
            angle += 360;
        }
        return angle;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        Formatter formatter = new Formatter(sb);
        String rv = formatter.format("%s[minX=%f minY=%f maxX=%f maxY=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
        formatter.close();
        return rv;
    }

    // Sweep-line event types used by findIntersections.
    private enum SOType {
        VERTICAL,
        HRIGHT,
        HLEFT
    }

}
|
|
||||||
@ -1,100 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
|
||||||
import org.springframework.beans.BeanUtils;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
|
|
||||||
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
// Reduced snapshot of a PDFBox TextPosition that is cheap to serialize:
// only the fields needed for layout re-analysis are retained.
public class RedTextPosition {

    // String form of the glyph's text matrix (PDFBox Matrix.toString()).
    private String textMatrix;
    // Packed geometry: [xDirAdj, yDirAdj, widthDirAdj, heightDir] — see the
    // accessors below and fromTextPosition().
    private float[] position;

    @JsonIgnore
    private int rotation;

    @JsonIgnore
    private float pageHeight;

    @JsonIgnore
    private float pageWidth;

    // The character(s) this position represents.
    private String unicode;

    @JsonIgnore
    private float dir;

    // not used in reanalysis
    @JsonIgnore
    private float widthOfSpace;

    // not used in reanalysis
    @JsonIgnore
    private float fontSizeInPt;

    // not used in reanalysis
    @JsonIgnore
    private String fontName;

    /**
     * Builds a RedTextPosition from a PDFBox {@link TextPosition}.
     * Matching bean properties are copied reflectively; font name, font size,
     * text matrix and the packed position array are then set explicitly.
     */
    @SneakyThrows
    public static RedTextPosition fromTextPosition(TextPosition textPosition) {
        var pos = new RedTextPosition();
        BeanUtils.copyProperties(textPosition, pos);
        pos.setFontName(textPosition.getFont().getName());

        pos.setFontSizeInPt(textPosition.getFontSizeInPt());

        pos.setTextMatrix(textPosition.getTextMatrix().toString());

        var position = new float[4];

        position[0] = textPosition.getXDirAdj();
        position[1] = textPosition.getYDirAdj();
        position[2] = textPosition.getWidthDirAdj();
        position[3] = textPosition.getHeightDir();

        pos.setPosition(position);
        return pos;
    }

    // Direction-adjusted x coordinate (slot 0 of the packed array).
    @JsonIgnore
    public float getXDirAdj() {
        return position[0];
    }

    // Direction-adjusted y coordinate (slot 1).
    @JsonIgnore
    public float getYDirAdj() {
        return position[1];
    }

    // Direction-adjusted width (slot 2).
    @JsonIgnore
    public float getWidthDirAdj() {
        return position[2];
    }

    // Height in the text direction (slot 3).
    @JsonIgnore
    public float getHeightDir() {
        return position[3];
    }

}
|
|
||||||
@ -1,48 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
|
|
||||||
@Getter
// Accumulates TextPositionSequences (words) and renders them as a single
// whitespace-normalized string suitable for text search.
public class SearchableText {

    private final List<TextPositionSequence> sequences = new ArrayList<>();

    // Appends a single word/sequence.
    public void add(TextPositionSequence textPositionSequence) {
        sequences.add(textPositionSequence);
    }

    // Appends several words/sequences, preserving their order.
    public void addAll(List<TextPositionSequence> textPositionSequences) {
        sequences.addAll(textPositionSequences);
    }

    @Override
    public String toString() {
        return buildString(sequences);
    }

    /**
     * Joins the sequences with single spaces, then normalizes the result:
     * hyphenated line breaks are merged, remaining line breaks removed and
     * repeated whitespace collapsed.
     */
    public static String buildString(List<TextPositionSequence> sequences) {
        StringBuilder sb = new StringBuilder();
        for (TextPositionSequence word : sequences) {
            sb.append(word);
            // A space is appended after every word, including the last one.
            sb.append(' ');
        }
        String text = sb.toString();
        text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
        text = TextNormalizationUtilities.removeLineBreaks(text);
        text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
        return text;
    }

}
|
|
||||||
@ -1,17 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
// Plain text of a single document section, for simplified text export.
public class SimplifiedSectionText {

    // Index of the section within the document — presumably sequential;
    // TODO confirm whether numbering is 0- or 1-based at the call sites.
    private int sectionNumber;
    private String text;

}
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
@Data
|
|
||||||
@Builder
|
|
||||||
@NoArgsConstructor
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class SimplifiedText {
|
|
||||||
|
|
||||||
private int numberOfPages;
|
|
||||||
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,367 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
|
||||||
|
|
||||||
import static java.util.stream.Collectors.toSet;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.EqualsAndHashCode;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
|
|
||||||
@EqualsAndHashCode(callSuper = true)
|
|
||||||
@Data
|
|
||||||
@AllArgsConstructor
|
|
||||||
@Builder
|
|
||||||
@NoArgsConstructor
|
|
||||||
public class TextPageBlock extends AbstractPageBlock {
|
|
||||||
|
|
||||||
@Builder.Default
|
|
||||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private int rotation;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private String mostPopularWordFont;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private String mostPopularWordStyle;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float mostPopularWordFontSize;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float mostPopularWordHeight;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float mostPopularWordSpaceWidth;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float highestFontSize;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private PageBlockType classification;
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public TextDirection getDir() {
|
|
||||||
|
|
||||||
return sequences.get(0).getDir();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float getPageHeight() {
|
|
||||||
|
|
||||||
return sequences.get(0).getPageHeight();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float getPageWidth() {
|
|
||||||
|
|
||||||
return sequences.get(0).getPageWidth();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
|
||||||
|
|
||||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
|
||||||
sequences = new ArrayList<>(sequences);
|
|
||||||
return fromTextPositionSequences(sequences);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
|
||||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
|
||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
|
||||||
wordBlock.getMaxXDirAdj(),
|
|
||||||
wordBlock.getMinYDirAdj(),
|
|
||||||
wordBlock.getMaxYDirAdj(),
|
|
||||||
wordBlockList,
|
|
||||||
wordBlock.getRotation());
|
|
||||||
} else {
|
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null) {
|
|
||||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
|
||||||
.stream()
|
|
||||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
|
||||||
.collect(toSet())
|
|
||||||
.size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the minX value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the minX value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMinX() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return minY;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
return getPageWidth() - maxX;
|
|
||||||
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
|
|
||||||
return getPageWidth() - maxY;
|
|
||||||
} else {
|
|
||||||
return minX;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the maxX value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the maxX value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMaxX() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return maxY;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
return getPageWidth() - minX;
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
return getPageWidth() - minY;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
return maxX;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the minY value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the minY value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMinY() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return minX;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
return maxY;
|
|
||||||
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
return getPageHeight() - maxX;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
return getPageHeight() - maxY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the maxY value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the maxY value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMaxY() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return maxX;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
|
|
||||||
return minY;
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
return getPageHeight() - minX;
|
|
||||||
} else {
|
|
||||||
return getPageHeight() - minY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Creates a block with an explicit DirAdj bounding box, word list and page rotation.
 * Note: the given {@code sequences} list is stored by reference, not copied —
 * callers must not reuse a mutable list they keep modifying.
 */
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {

    this.minX = minX;
    this.maxX = maxX;
    this.minY = minY;
    this.maxY = maxY;
    this.sequences = sequences;
    this.rotation = rotation;
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Returns a new block whose bounding box is expanded to also enclose the given word.
 * NOTE(review): copy() shares this block's sequences list by reference, so the
 * returned block aliases this block's word list — confirm this is intended.
 *
 * @param r the word to include
 * @return a new block covering this block and {@code r}
 */
public TextPageBlock union(TextPositionSequence r) {

    TextPageBlock union = this.copy();
    union.add(r);
    return union;
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Returns a new block covering both this block and {@code r}.
 * NOTE(review): copy() shares this block's sequences list by reference and
 * add(TextPageBlock) appends to that shared list, so calling this method also
 * mutates THIS block's word list — looks like a defect; verify before relying
 * on this block afterwards.
 *
 * @param r the block to include
 * @return a new block covering this block and {@code r}
 */
public TextPageBlock union(TextPageBlock r) {

    TextPageBlock union = this.copy();
    union.add(r);
    return union;
}
|
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPageBlock r) {
|
|
||||||
|
|
||||||
if (r.getMinX() < minX) {
|
|
||||||
minX = r.getMinX();
|
|
||||||
}
|
|
||||||
if (r.getMaxX() > maxX) {
|
|
||||||
maxX = r.getMaxX();
|
|
||||||
}
|
|
||||||
if (r.getMinY() < minY) {
|
|
||||||
minY = r.getMinY();
|
|
||||||
}
|
|
||||||
if (r.getMaxY() > maxY) {
|
|
||||||
maxY = r.getMaxY();
|
|
||||||
}
|
|
||||||
sequences.addAll(r.getSequences());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPositionSequence r) {
|
|
||||||
|
|
||||||
if (r.getMinXDirAdj() < minX) {
|
|
||||||
minX = r.getMinXDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMaxXDirAdj() > maxX) {
|
|
||||||
maxX = r.getMaxXDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMinYDirAdj() < minY) {
|
|
||||||
minY = r.getMinYDirAdj();
|
|
||||||
}
|
|
||||||
if (r.getMaxYDirAdj() > maxY) {
|
|
||||||
maxY = r.getMaxYDirAdj();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Returns a shallow copy of this block.
 * NOTE(review): the sequences list is shared by reference, so mutations via
 * add(TextPageBlock) on the copy also affect this block — confirm intended.
 *
 * @return a new block with the same bounds, rotation and the SAME sequences list
 */
public TextPageBlock copy() {

    return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}
|
|
||||||
|
|
||||||
|
|
||||||
public void resize(float x1, float y1, float width, float height) {
|
|
||||||
|
|
||||||
set(x1, y1, x1 + width, y1 + height);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Sets the bounding box from two corner points. Min/max are normalized, so the
 * corners may be supplied in any order.
 *
 * @param x1 x of the first corner
 * @param y1 y of the first corner
 * @param x2 x of the opposite corner
 * @param y2 y of the opposite corner
 */
public void set(float x1, float y1, float x2, float y2) {

    this.minX = Math.min(x1, x2);
    this.maxX = Math.max(x1, x2);
    this.minY = Math.min(y1, y2);
    this.maxY = Math.max(y1, y2);
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Concatenates all word sequences of this block, inserting a single space
 * between two sequences when neither side already carries one.
 * NOTE(review): assumes every sequence is non-empty — an empty sequence would
 * make charAt(length-1)/charAt(0) throw; confirm the extractor guarantees this.
 */
@Override
public String toString() {

    StringBuilder builder = new StringBuilder();

    for (int i = 0; i < sequences.size(); i++) {
        String sequenceAsString = sequences.get(i).toString();
        // Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
        if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
            builder.append(' ');
        }
        builder.append(sequenceAsString);
    }

    return builder.toString();

}
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Renders this block as plain text: words on the same visual line are joined
 * with spaces, a vertical jump larger than one text height starts a new line,
 * and hyphenated line breaks are re-joined afterwards.
 *
 * @return the normalized text of the block
 */
@Override
@JsonIgnore
public String getText() {

    StringBuilder sb = new StringBuilder();

    TextPositionSequence previous = null;
    for (TextPositionSequence word : sequences) {
        if (previous != null) {
            // Baseline distance beyond one text height => new line; otherwise same line.
            if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
                sb.append('\n');
            } else {
                sb.append(' ');
            }
        }
        sb.append(word.toString());
        previous = word;
    }

    // Re-join words that were hyphenated across line breaks.
    return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());

}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,315 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
|
||||||
|
|
||||||
import java.awt.geom.AffineTransform;
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
|
||||||
|
|
||||||
import com.dslplatform.json.JsonAttribute;
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.NoArgsConstructor;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
/**
 * A word (a run of glyphs) extracted from a PDF page, exposed as a
 * {@link CharSequence} over the unicode of its glyphs.
 * Coordinates returned by the *DirAdj accessors are text-direction adjusted so
 * that {0,0} is the upper-left corner, independent of the page rotation;
 * {@link #getRectangle()} converts back into the rotated PDF coordinate system.
 */
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TextPositionSequence implements CharSequence {

    // Padding (in points) added around glyph heights — and as trailing width in
    // getMaxXDirAdj() — when computing bounding boxes.
    public static final int HEIGHT_PADDING = 2;
    // Page this word was extracted from.
    private int page;
    // Glyphs of this word in reading order.
    private List<RedTextPosition> textPositions = new ArrayList<>();

    // Text direction taken from the first glyph; assumed uniform for the word.
    private TextDirection dir;
    // Page rotation in degrees, taken from the first glyph.
    private int rotation;
    private float pageHeight;
    private float pageWidth;
    // True if the extractor flagged this word as the start of a paragraph.
    private boolean isParagraphStart;


    /** Creates an empty sequence bound to the given page. */
    public TextPositionSequence(int page) {

        this.page = page;
    }


    /**
     * Creates a sequence from PDFBox text positions. Direction, rotation and page
     * dimensions are taken from the first position; the list must not be empty.
     */
    public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {

        this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
        this.page = page;
        this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
        this.rotation = textPositions.get(0).getRotation();
        this.pageHeight = textPositions.get(0).getPageHeight();
        this.pageWidth = textPositions.get(0).getPageWidth();
        this.isParagraphStart = isParagraphStart;
    }


    /** Number of glyphs; each glyph maps to exactly one character position. */
    @Override
    public int length() {

        return textPositions.size();
    }


    /**
     * Returns the first unicode character of the glyph at {@code index}.
     * NOTE(review): glyphs whose unicode maps to more than one char (e.g.
     * ligatures) are truncated to their first char — confirm this is acceptable.
     */
    @Override
    public char charAt(int index) {

        RedTextPosition textPosition = textPositionAt(index);
        String text = textPosition.getUnicode();
        return text.charAt(0);
    }


    /** Like {@link #charAt(int)}, optionally lower-casing the result. */
    public char charAt(int index, boolean caseInSensitive) {

        RedTextPosition textPosition = textPositionAt(index);
        String text = textPosition.getUnicode();
        return caseInSensitive ? text.toLowerCase().charAt(0) : text.charAt(0);
    }


    /**
     * Returns a sub-word over [start, end). Page metadata is copied; the glyph
     * list is a {@link List#subList} view backed by this sequence's list, so
     * structural changes to either are visible in the other.
     * Note: isParagraphStart is not propagated to the sub-sequence.
     */
    @Override
    public TextPositionSequence subSequence(int start, int end) {

        var textPositionSequence = new TextPositionSequence();
        textPositionSequence.textPositions = textPositions.subList(start, end);
        textPositionSequence.page = page;
        textPositionSequence.dir = dir;
        textPositionSequence.rotation = rotation;
        textPositionSequence.pageHeight = pageHeight;
        textPositionSequence.pageWidth = pageWidth;

        return textPositionSequence;
    }


    /** Concatenates the first unicode char of every glyph. */
    @Override
    public String toString() {

        StringBuilder builder = new StringBuilder(length());
        for (int i = 0; i < length(); i++) {
            builder.append(charAt(i));
        }
        return builder.toString();
    }


    /** Returns the glyph at {@code index}. */
    public RedTextPosition textPositionAt(int index) {

        return textPositions.get(index);
    }


    /**
     * Appends a glyph and copies the page metadata from
     * {@code textPositionSequence} onto this sequence.
     */
    public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {

        this.textPositions.add(textPosition);
        this.page = textPositionSequence.getPage();
        this.dir = textPositionSequence.getDir();
        this.rotation = textPositionSequence.getRotation();
        this.pageHeight = textPositionSequence.getPageHeight();
        this.pageWidth = textPositionSequence.getPageWidth();
    }


    /**
     * Appends a PDFBox text position and refreshes direction/rotation/page size
     * from the FIRST glyph of the sequence (not the one just added).
     */
    public void add(TextPosition textPosition) {

        this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));

        this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
        this.rotation = textPositions.get(0).getRotation();
        this.pageHeight = textPositions.get(0).getPageHeight();
        this.pageWidth = textPositions.get(0).getPageWidth();

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted minX value
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMinXDirAdj() {

        return textPositions.get(0).getXDirAdj();

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     * HEIGHT_PADDING is added as trailing width padding here.
     *
     * @return the text direction adjusted maxX value
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMaxXDirAdj() {

        return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted minY value. The upper border of the bounding box of the word.
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMinYDirAdj() {

        return textPositions.get(0).getYDirAdj() - getTextHeight();

    }


    /**
     * This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
     * This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
     *
     * @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getMaxYDirAdj() {

        return textPositions.get(0).getYDirAdj();

    }


    /** Height of the first glyph plus HEIGHT_PADDING; used as the word's line height. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getTextHeight() {

        return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
    }


    /** Bounding-box height in DirAdj coordinates. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getHeight() {

        return getMaxYDirAdj() - getMinYDirAdj();
    }


    /** Bounding-box width in DirAdj coordinates (includes HEIGHT_PADDING, see getMaxXDirAdj). */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getWidth() {

        return getMaxXDirAdj() - getMinXDirAdj();
    }


    /**
     * Lower-cased font name of the first glyph with ",bold"/",italic" suffixes
     * stripped, so that style variants of one font compare equal.
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public String getFont() {

        return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
    }


    /**
     * Font style derived from the first glyph's font name:
     * "bold, italic", "bold", "italic" or "standard".
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public String getFontStyle() {

        String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();

        if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
            return "bold, italic";
        } else if (lowercaseFontName.contains("bold")) {
            return "bold";
        } else if (lowercaseFontName.contains("italic")) {
            return "italic";
        } else {
            return "standard";
        }

    }


    /** Font size in points, taken from the first glyph. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getFontSize() {

        return textPositions.get(0).getFontSizeInPt();
    }


    /** Width of a space character in this word's font, taken from the first glyph. */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    public float getSpaceWidth() {

        return textPositions.get(0).getWidthOfSpace();
    }


    /**
     * This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return bounding box of the word in Pdf Coordinate System
     */
    @JsonIgnore
    @JsonAttribute(ignore = true)
    @SneakyThrows
    public Rectangle getRectangle() {

        log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);

        float textHeight = getTextHeight();

        RedTextPosition firstTextPos = textPositions.get(0);
        RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);

        // Corners in DirAdj (upper-left origin) coordinates, padded vertically.
        Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
        Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);

        // Map the DirAdj corners back into PDF coordinates: rotate about a page
        // center, shift, then flip the y axis.
        AffineTransform transform = new AffineTransform();
        if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
            transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
            transform.translate(0f, pageHeight + textHeight);
            transform.scale(1., -1.);
        } else if (dir == TextDirection.QUARTER_CIRCLE) {
            // 90°: rotation center and translation both use pageWidth.
            transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
            transform.translate(0f, pageWidth + textHeight);
            transform.scale(1., -1.);
        } else {
            // 270°: rotation center uses pageHeight but translation uses pageWidth —
            // NOTE(review): asymmetric to the 90° branch; confirm for non-square pages.
            transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
            transform.translate(0f, pageWidth + textHeight);
            transform.scale(1., -1.);
        }

        bottomLeft = transform.transform(bottomLeft, null);
        topRight = transform.transform(topRight, null);

        return new Rectangle( //
                new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
                (float) (topRight.getX() - bottomLeft.getX()),
                (float) (topRight.getY() - bottomLeft.getY()),
                page);
    }

}
|
|
||||||
|
|
||||||
@ -1,280 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
|
|
||||||
|
|
||||||
/**
 * Groups the words of a page into coarse text blocks ("blockification").
 * All clustering is done in text-direction adjusted (DirAdj) coordinates where
 * {0,0} is the upper left; rulings are converted into the same space before
 * being used as split boundaries.
 */
@Service
@SuppressWarnings("all")
public class BlockificationService {

    // Max coordinate difference (points) for two values to count as equal.
    private static final float THRESHOLD = 1f;
    // Vertical gaps larger than word height * this factor split blocks.
    private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f;
    // Horizontal in-line gaps larger than this (points) split blocks.
    private static final int X_GAP_SPLIT_CONSTANT = 50;


    /**
     * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
     * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
     * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
     *
     * @param textPositions The words of a page.
     * @param horizontalRulingLines Horizontal table lines.
     * @param verticalRulingLines Vertical table lines.
     * @return ClassificationPage object that contains the Textblock and text statistics.
     */
    public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

        List<TextPageBlock> classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines);

        classificationTextBlocks = mergeFineGranularTextPageBlocks(classificationTextBlocks);

        return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList()));
    }


    /**
     * Merges consecutive fine-granular blocks that share a right edge (within
     * 1pt) and are vertically close (gap below 5pt) — typically lines of one
     * paragraph that the fine-granular pass split too eagerly.
     */
    private List<TextPageBlock> mergeFineGranularTextPageBlocks(List<TextPageBlock> classificationTextBlocks) {

        if (classificationTextBlocks.isEmpty()) {
            return new ArrayList<>();
        }
        List<List<TextPageBlock>> textBlocksToMerge = new LinkedList<>();
        List<TextPageBlock> currentTextBlocksToMerge = new LinkedList<>();
        textBlocksToMerge.add(currentTextBlocksToMerge);
        TextPageBlock previousTextBlock = null;
        for (TextPageBlock currentTextBlock : classificationTextBlocks) {
            if (previousTextBlock == null) {
                // First block always starts the first merge group.
                currentTextBlocksToMerge.add(currentTextBlock);
                previousTextBlock = currentTextBlock;
                continue;
            }
            boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1;
            boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5;
            if (alignsXRight && smallYGap) {
                currentTextBlocksToMerge.add(currentTextBlock);
            } else {
                // Start a new merge group.
                currentTextBlocksToMerge = new LinkedList<>();
                currentTextBlocksToMerge.add(currentTextBlock);
                textBlocksToMerge.add(currentTextBlocksToMerge);
            }
            previousTextBlock = currentTextBlock;
        }
        return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
    }


    /**
     * Merges vertically overlapping LEFT-oriented (resp. RIGHT-oriented) column
     * blocks into their predecessor, then merges adjacent LEFT/RIGHT blocks
     * whose bottom edges line up (two-column layouts).
     * NOTE(review): this private method is not invoked from blockify() in this
     * class — confirm it is still used before relying on or removing it.
     */
    private void assignOrientations(List<TextPageBlock> classificationTextBlocks) {

        Iterator<TextPageBlock> itty = classificationTextBlocks.iterator();

        TextPageBlock previousLeft = null;
        TextPageBlock previousRight = null;
        while (itty.hasNext()) {
            TextPageBlock block = (TextPageBlock) itty.next();

            if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
                if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
                    previousLeft.add(block);
                    itty.remove();
                    continue;
                }
            }

            if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
                if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
                    previousRight.add(block);
                    itty.remove();
                    continue;
                }
            }

            if (block.getOrientation().equals(Orientation.LEFT)) {
                previousLeft = block;
            } else if (block.getOrientation().equals(Orientation.RIGHT)) {
                previousRight = block;
            }
        }

        // Second pass: merge a block into its direct predecessor when both
        // bottoms align within THRESHOLD (LEFT+LEFT or LEFT followed by RIGHT).
        itty = classificationTextBlocks.iterator();
        TextPageBlock previous = null;
        while (itty.hasNext()) {
            TextPageBlock block = (TextPageBlock) itty.next();

            if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
                    block.getMaxY(),
                    previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
                    .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
                previous.add(block);
                itty.remove();
                continue;
            }

            previous = block;
        }
    }


    /**
     * Splits the word stream into fine-granular blocks. A new block starts when
     * the previous word was a paragraph start, the x position jumps backwards or
     * forwards beyond the thresholds, a large vertical gap appears, the text
     * restarts above the previous line, a ruling separates the words, or a new
     * line begins with a font change / list-item marker.
     * NOTE(review): indexOnPage and splitByDir are computed/declared but never
     * used; the regex Pattern is re-compiled on every call (could be a
     * static final constant); the orientation condition in the last else-if is
     * an OR of negations and looks always-true — verify intent.
     */
    private List<TextPageBlock> constructFineGranularTextPageBlocks(List<TextPositionSequence> textPositions,
                                                                    List<Ruling> horizontalRulingLines,
                                                                    List<Ruling> verticalRulingLines) {

        int indexOnPage = 0;
        List<TextPositionSequence> wordClusterToCombine = new ArrayList<>();
        List<TextPageBlock> classificationTextBlocks = new ArrayList<>();

        // Running DirAdj bounding box of the cluster being built.
        float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
        TextPositionSequence prev = null;

        // Matches list-item markers like "1.", "a)", "iv." (case-insensitive).
        var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);

        boolean wasSplitted = false;
        Float splitX1 = null;
        for (TextPositionSequence word : textPositions) {

            Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString());

            boolean yGap = word.getMinYDirAdj() - maxY > word.getHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
            boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
            boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
            boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
            boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
            boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
            boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
            boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
            boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle()
                    .equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize());
            boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight();
            boolean isListIdentifier = listIdentifierPattern.matches();

            if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) {
                // if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {

                Orientation prevOrientation = null;
                if (!classificationTextBlocks.isEmpty()) {
                    prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation();
                }

                TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);

                classificationTextBlocks.add(classificationTextBlock);
                wordClusterToCombine = new ArrayList<>();

                if (positiveXGapInline && !splitByRuling) {
                    // Large in-line gap: left column finished, remember the split x.
                    wasSplitted = true;
                    classificationTextBlock.setOrientation(Orientation.LEFT);
                    splitX1 = word.getMinXDirAdj();
                } else if (newLineAfterSplit && !splitByRuling) {
                    // Line break after a column split: the finished block is the right column.
                    wasSplitted = false;
                    classificationTextBlock.setOrientation(Orientation.RIGHT);
                    splitX1 = null;
                } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) {
                    classificationTextBlock.setOrientation(Orientation.LEFT);
                }

                // Reset the running bounding box for the next cluster.
                minX = 1000;
                maxX = 0;
                minY = 1000;
                maxY = 0;
                prev = null;
            }

            wordClusterToCombine.add(word);

            prev = word;
            if (word.getMinXDirAdj() < minX) {
                minX = word.getMinXDirAdj();
            }
            if (word.getMaxXDirAdj() > maxX) {
                maxX = word.getMaxXDirAdj();
            }
            if (word.getMinYDirAdj() < minY) {
                minY = word.getMinYDirAdj();
            }
            if (word.getMaxYDirAdj() > maxY) {
                maxY = word.getMaxYDirAdj();
            }
        }

        // Flush the last cluster (fromTextPositionSequences may return null for it).
        TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine);
        if (classificationTextBlock != null) {
            classificationTextBlocks.add(classificationTextBlock);
        }
        return classificationTextBlocks;
    }


    /** True if the two values differ by less than THRESHOLD. */
    private boolean equalsWithThreshold(float f1, float f2) {

        return Math.abs(f1 - f2) < THRESHOLD;
    }


    /**
     * Checks whether any ruling separates the current cluster's bounding box
     * from the given word, testing four connecting segments against both the
     * horizontal and vertical ruling sets.
     */
    private boolean isSplitByRuling(float minX,
                                    float minY,
                                    float maxX,
                                    float maxY,
                                    TextPositionSequence word,
                                    List<Ruling> horizontalRulingLines,
                                    List<Ruling> verticalRulingLines) {

        return isSplitByRuling(maxX,
                minY,
                word.getMinXDirAdj(),
                word.getMinYDirAdj(),
                verticalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()) //
                || isSplitByRuling(minX,
                minY,
                word.getMinXDirAdj(),
                word.getMaxYDirAdj(),
                horizontalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()) //
                || isSplitByRuling(maxX,
                minY,
                word.getMinXDirAdj(),
                word.getMinYDirAdj(),
                horizontalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()) //
                || isSplitByRuling(minX,
                minY,
                word.getMinXDirAdj(),
                word.getMaxYDirAdj(),
                verticalRulingLines,
                word.getDir().getDegrees(),
                word.getPageWidth(),
                word.getPageHeight()); //
    }


    /**
     * True if any ruling (converted into DirAdj space) intersects the segment
     * from (previousX2, previousY1) to (currentX1, currentY1).
     */
    private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

        for (Ruling ruling : rulingLines) {
            var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
            if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
                return true;
            }
        }
        return false;
    }

}
|
|
||||||
|
|
||||||
@ -1,164 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
public class BodyTextFrameService {
|
|
||||||
|
|
||||||
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.0f;
|
|
||||||
|
|
||||||
|
|
||||||
/**
 * Adjusts and sets the body text frame to a page.
 * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * The aspect ratio of the page is also regarded.
 *
 * @param page The page
 * @param bodyTextFrame frame that contains the main text on portrait pages
 * @param landscapeBodyTextFrame frame that contains the main text on landscape pages
 */
public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) {

    Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame;

    // Rotated landscape pages: swap the frame's axes; for 270 also mirror x into
    // the page height. NOTE(review): the 270 and 180 branches pass 0 as the
    // Rectangle's page number while the middle branch passes
    // page.getPageNumber() — confirm this inconsistency is intentional.
    if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
        textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
                textFrame.getHeight(),
                textFrame.getWidth(),
                0);
    } else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
        textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
    } else if (page.getRotation() == 180) {
        // Upside-down portrait: mirror the frame vertically.
        textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
                textFrame.getWidth(),
                textFrame.getHeight(),
                0);
    }
    page.setBodyTextFrame(textFrame);
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Calculates the frame that contains the main text, text outside the frame will be e.g. headers or footers.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
* The aspect ratio of the page is also regarded.
|
|
||||||
*
|
|
||||||
* @param pages List of all pages
|
|
||||||
* @param documentFontSizeCounter Statistics of the document
|
|
||||||
* @param landscape Calculate for landscape or portrait
|
|
||||||
* @return Rectangle of the text frame
|
|
||||||
*/
|
|
||||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
|
||||||
|
|
||||||
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
|
|
||||||
|
|
||||||
for (ClassificationPage page : pages) {
|
|
||||||
|
|
||||||
if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (AbstractPageBlock container : page.getTextBlocks()) {
|
|
||||||
|
|
||||||
if (container instanceof TextPageBlock) {
|
|
||||||
TextPageBlock textBlock = (TextPageBlock) container;
|
|
||||||
if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
|
||||||
if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) {
|
|
||||||
|
|
||||||
expandRectangle(textBlock, page, expansionsRectangle);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (container instanceof TablePageBlock) {
|
|
||||||
TablePageBlock table = (TablePageBlock) container;
|
|
||||||
for (List<Cell> row : table.getRows()) {
|
|
||||||
for (Cell cell : row) {
|
|
||||||
|
|
||||||
if (cell == null || cell.getTextBlocks() == null) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (TextPageBlock textBlock : cell.getTextBlocks()) {
|
|
||||||
expandRectangle(textBlock, page, expansionsRectangle);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
|
|
||||||
expansionsRectangle.maxX - expansionsRectangle.minX,
|
|
||||||
expansionsRectangle.maxY - expansionsRectangle.minY,
|
|
||||||
0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
|
||||||
|
|
||||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
|
||||||
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
|
|
||||||
expansionsRectangle.minX = textBlock.getPdfMinY();
|
|
||||||
}
|
|
||||||
if (textBlock.getPdfMaxY() > expansionsRectangle.maxX) {
|
|
||||||
expansionsRectangle.maxX = textBlock.getPdfMaxY();
|
|
||||||
}
|
|
||||||
if (textBlock.getPdfMinX() < expansionsRectangle.minY) {
|
|
||||||
expansionsRectangle.minY = textBlock.getPdfMinX();
|
|
||||||
}
|
|
||||||
if (textBlock.getPdfMaxX() > expansionsRectangle.maxY) {
|
|
||||||
expansionsRectangle.maxY = textBlock.getPdfMaxX();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (textBlock.getPdfMinX() < expansionsRectangle.minX) {
|
|
||||||
expansionsRectangle.minX = textBlock.getPdfMinX();
|
|
||||||
}
|
|
||||||
if (textBlock.getPdfMaxX() > expansionsRectangle.maxX) {
|
|
||||||
expansionsRectangle.maxX = textBlock.getPdfMaxX();
|
|
||||||
}
|
|
||||||
if (textBlock.getPdfMinY() < expansionsRectangle.minY) {
|
|
||||||
expansionsRectangle.minY = textBlock.getPdfMinY();
|
|
||||||
}
|
|
||||||
if (textBlock.getPdfMaxY() > expansionsRectangle.maxY) {
|
|
||||||
expansionsRectangle.maxY = textBlock.getPdfMaxY();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private class BodyTextFrameExpansionsRectangle {
|
|
||||||
|
|
||||||
float minX = 10000;
|
|
||||||
float maxX = -100;
|
|
||||||
float minY = 10000;
|
|
||||||
float maxY = -100;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,112 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
@Service
@RequiredArgsConstructor
public class ClassificationService {

    private final BodyTextFrameService bodyTextFrameService;


    /**
     * Classifies every text block of the document (header, footer, title, headline,
     * paragraph variants, other).
     * First computes one shared body text frame per orientation from document-wide
     * font statistics, then classifies each page against it.
     *
     * @param document the parsed document; its blocks are classified in place
     */
    public void classifyDocument(ClassificationDocument document) {

        Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
        Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
        // Font sizes above the document's dominant size; index in this list determines
        // the headline level. ("Highter" is a typo in the counter's API, kept as-is.)
        List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();

        log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());

        for (ClassificationPage page : document.getPages()) {
            bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
            classifyPage(page, document, headlineFontSizes);
        }
    }


    /**
     * Classifies all plain text blocks of a single page.
     * Non-text blocks (e.g. tables) are left untouched.
     *
     * @param page page whose blocks are classified in place
     * @param document document-wide statistics used for comparison
     * @param headlineFontSizes font sizes larger than the dominant body size, ordered;
     *        index + 1 is the headline level
     */
    public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        for (AbstractPageBlock textBlock : page.getTextBlocks()) {
            if (textBlock instanceof TextPageBlock) {
                classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
            }
        }
    }


    /**
     * Assigns a {@link PageBlockType} to one text block via an ordered heuristic chain.
     * The order of the branches matters: the first matching rule wins.
     *
     * @param textBlock block to classify (classification is set in place)
     * @param page the page the block belongs to (supplies body text frame and rotation)
     * @param document document-wide font/height/style statistics
     * @param headlineFontSizes candidate headline font sizes, ordered; index + 1 is the level
     */
    public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        var bodyTextFrame = page.getBodyTextFrame();

        // Without a dominant font size there is no baseline to compare against.
        if (document.getFontSizeCounter().getMostPopular() == null) {
            textBlock.setClassification(PageBlockType.OTHER);
            return;
        }
        // Position relative to the body text frame decides header/footer first.
        if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
            textBlock.setClassification(PageBlockType.HEADER);
        } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
            textBlock.setClassification(PageBlockType.FOOTER);
        } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
                document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
                .size() == 1)) {
            // First page only: a clearly oversized block (or the sole block on the page)
            // is a title candidate — unless it is purely numeric (e.g. a page number).
            if (!Pattern.matches("[0-9]+", textBlock.toString())) {
                textBlock.setClassification(PageBlockType.TITLE);
            }
        } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
                .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
                .getCountPerValue()
                .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
                .get(0)
                .getTextPositions()
                .get(0)
                .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {

            // Headline by font size: short, larger-than-body block that is bold
            // (or clearly larger when the document has no bold text at all).
            // The headline level is the 1-based index of the matching font size.
            for (int i = 1; i <= headlineFontSizes.size(); i++) {
                if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
                    textBlock.setClassification(PageBlockType.getHeadlineType(i));
                    document.setHeadlines(true);
                }
            }
        } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
                .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
                .get(0)
                .getTextPositions()
                .get(0)
                .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
            // Headline by emphasis: short bold block at body font size in a mostly
            // non-bold document; ranked one level below the size-based headlines.
            // Figure captions ("Figure ...") are explicitly excluded.
            textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
            document.setHeadlines(true);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
            // Bold body-sized text in a non-bold document that did not qualify as headline.
            textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
                .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
                .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
            // Matches the document's dominant font, style and size: regular paragraph.
            textBlock.setClassification(PageBlockType.PARAGRAPH);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
                .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
                .getMostPopular()
                .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
            // Short italic body-sized text in a mostly non-italic document.
            textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
            // Inside the body frame but matching none of the rules above.
            textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
        } else {
            textBlock.setClassification(PageBlockType.OTHER);
        }
    }

}
|
|
||||||
@ -1,136 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class PdfParsingService {
|
|
||||||
|
|
||||||
private final RulingCleaningService rulingCleaningService;
|
|
||||||
private final TableExtractionService tableExtractionService;
|
|
||||||
private final BlockificationService blockificationService;
|
|
||||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
|
||||||
|
|
||||||
|
|
||||||
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<TableCells>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
|
|
||||||
|
|
||||||
ClassificationDocument document = new ClassificationDocument();
|
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
|
||||||
|
|
||||||
originDocument.setAllSecurityToBeRemoved(true);
|
|
||||||
long pageCount = originDocument.getNumberOfPages();
|
|
||||||
|
|
||||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
|
||||||
parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
|
||||||
}
|
|
||||||
|
|
||||||
document.setPages(classificationPages);
|
|
||||||
|
|
||||||
return document;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
|
|
||||||
PDDocument pdDocument,
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells,
|
|
||||||
ClassificationDocument document,
|
|
||||||
List<ClassificationPage> classificationPages,
|
|
||||||
int pageNumber) {
|
|
||||||
|
|
||||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
|
||||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
|
||||||
stripper.setPageNumber(pageNumber);
|
|
||||||
stripper.setStartPage(pageNumber);
|
|
||||||
stripper.setEndPage(pageNumber);
|
|
||||||
stripper.setPdpage(pdPage);
|
|
||||||
stripper.getText(pdDocument);
|
|
||||||
|
|
||||||
PDRectangle pdr = pdPage.getMediaBox();
|
|
||||||
|
|
||||||
int rotation = pdPage.getRotation();
|
|
||||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
|
||||||
|
|
||||||
PDRectangle cropbox = pdPage.getCropBox();
|
|
||||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
|
||||||
stripper.getRulings(),
|
|
||||||
stripper.getMinCharWidth(),
|
|
||||||
stripper.getMaxCharHeight());
|
|
||||||
ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
|
||||||
|
|
||||||
classificationPage.setRotation(rotation);
|
|
||||||
classificationPage.setLandscape(isLandscape);
|
|
||||||
classificationPage.setPageNumber(pageNumber);
|
|
||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
|
||||||
|
|
||||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
|
||||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
|
||||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
|
||||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
|
||||||
buildPageStatistics(classificationPage);
|
|
||||||
increaseDocumentStatistics(classificationPage, document);
|
|
||||||
|
|
||||||
classificationPages.add(classificationPage);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
|
||||||
}
|
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
|
||||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
|
||||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void buildPageStatistics(ClassificationPage classificationPage) {
|
|
||||||
|
|
||||||
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
|
|
||||||
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
|
|
||||||
if (textBlock instanceof TextPageBlock) {
|
|
||||||
if (((TextPageBlock) textBlock).getSequences() == null) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
|
|
||||||
classificationPage.getTextHeightCounter().add(word.getTextHeight());
|
|
||||||
classificationPage.getFontCounter().add(word.getFont());
|
|
||||||
classificationPage.getFontSizeCounter().add(word.getFontSize());
|
|
||||||
classificationPage.getFontStyleCounter().add(word.getFontStyle());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,231 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
|
||||||
|
|
||||||
import java.awt.geom.Line2D;
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class RulingCleaningService {
|
|
||||||
|
|
||||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings, float minCharWidth, float maxCharHeight) {
|
|
||||||
|
|
||||||
if (!rulings.isEmpty()) {
|
|
||||||
snapPoints(rulings, minCharWidth, maxCharHeight);
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Ruling> vrs = new ArrayList<>();
|
|
||||||
for (Ruling vr : rulings) {
|
|
||||||
if (vr.vertical()) {
|
|
||||||
vrs.add(vr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (vrs.isEmpty()) {
|
|
||||||
vrs.addAll(extractVerticalRulings(tableCells));
|
|
||||||
}
|
|
||||||
List<Ruling> verticalRulingLines = collapseOrientedRulings(vrs);
|
|
||||||
|
|
||||||
List<Ruling> hrs = new ArrayList<>();
|
|
||||||
for (Ruling hr : rulings) {
|
|
||||||
if (hr.horizontal()) {
|
|
||||||
hrs.add(hr);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (hrs.isEmpty()) {
|
|
||||||
hrs.addAll(extractHorizontalRulings(tableCells));
|
|
||||||
}
|
|
||||||
List<Ruling> horizontalRulingLines = collapseOrientedRulings(hrs);
|
|
||||||
|
|
||||||
return CleanRulings.builder().vertical(verticalRulingLines).horizontal(horizontalRulingLines).build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
|
|
||||||
|
|
||||||
// collect points and keep a Line -> p1,p2 map
|
|
||||||
Map<Line2D.Float, Point2D[]> linesToPoints = new HashMap<>();
|
|
||||||
List<Point2D> points = new ArrayList<>();
|
|
||||||
for (Line2D.Float r : rulings) {
|
|
||||||
Point2D p1 = r.getP1();
|
|
||||||
Point2D p2 = r.getP2();
|
|
||||||
linesToPoints.put(r, new Point2D[]{p1, p2});
|
|
||||||
points.add(p1);
|
|
||||||
points.add(p2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// snap by X
|
|
||||||
points.sort(Comparator.comparingDouble(Point2D::getX));
|
|
||||||
|
|
||||||
List<List<Point2D>> groupedPoints = new ArrayList<>();
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
|
||||||
|
|
||||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
|
||||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
|
||||||
if (Math.abs(p.getX() - last.get(0).getX()) < xThreshold) {
|
|
||||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
|
||||||
} else {
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (List<Point2D> group : groupedPoints) {
|
|
||||||
float avgLoc = 0;
|
|
||||||
for (Point2D p : group) {
|
|
||||||
avgLoc += p.getX();
|
|
||||||
}
|
|
||||||
avgLoc /= group.size();
|
|
||||||
for (Point2D p : group) {
|
|
||||||
p.setLocation(avgLoc, p.getY());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// ---
|
|
||||||
|
|
||||||
// snap by Y
|
|
||||||
points.sort(Comparator.comparingDouble(Point2D::getY));
|
|
||||||
|
|
||||||
groupedPoints = new ArrayList<>();
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(points.get(0))));
|
|
||||||
|
|
||||||
for (Point2D p : points.subList(1, points.size() - 1)) {
|
|
||||||
List<Point2D> last = groupedPoints.get(groupedPoints.size() - 1);
|
|
||||||
if (Math.abs(p.getY() - last.get(0).getY()) < yThreshold) {
|
|
||||||
groupedPoints.get(groupedPoints.size() - 1).add(p);
|
|
||||||
} else {
|
|
||||||
groupedPoints.add(new ArrayList<>(Collections.singletonList(p)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (List<Point2D> group : groupedPoints) {
|
|
||||||
float avgLoc = 0;
|
|
||||||
for (Point2D p : group) {
|
|
||||||
avgLoc += p.getY();
|
|
||||||
}
|
|
||||||
avgLoc /= group.size();
|
|
||||||
for (Point2D p : group) {
|
|
||||||
p.setLocation(p.getX(), avgLoc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// ---
|
|
||||||
|
|
||||||
// finally, modify lines
|
|
||||||
for (Map.Entry<Line2D.Float, Point2D[]> ltp : linesToPoints.entrySet()) {
|
|
||||||
Point2D[] p = ltp.getValue();
|
|
||||||
ltp.getKey().setLine(p[0], p[1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Collection<? extends Ruling> extractVerticalRulings(List<TableCells> cvParsedTableCells) {
|
|
||||||
|
|
||||||
List<Ruling> vrs = new ArrayList<>();
|
|
||||||
|
|
||||||
if (cvParsedTableCells != null) {
|
|
||||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
|
||||||
Ruling leftLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
|
||||||
Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1());
|
|
||||||
vrs.add(leftLine);
|
|
||||||
vrs.add(rightLine);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return vrs;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Collection<? extends Ruling> extractHorizontalRulings(List<TableCells> cvParsedTableCells) {
|
|
||||||
|
|
||||||
List<Ruling> hrs = new ArrayList<>();
|
|
||||||
|
|
||||||
if (cvParsedTableCells != null) {
|
|
||||||
for (TableCells cvParsedTableCell : cvParsedTableCells) {
|
|
||||||
Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1());
|
|
||||||
Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0());
|
|
||||||
hrs.add(topLine);
|
|
||||||
hrs.add(baseLine);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return hrs;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Ruling createRuling(float tableCellX0, float tableCellX1, float tableCellY0, float tableCellY1) {
|
|
||||||
|
|
||||||
float x0 = tableCellX0;
|
|
||||||
float x1 = tableCellX1;
|
|
||||||
float y0 = tableCellY0;
|
|
||||||
float y1 = tableCellY1;
|
|
||||||
|
|
||||||
if (x1 < x0) {
|
|
||||||
x0 = tableCellX1;
|
|
||||||
x1 = tableCellX0;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (y1 < y0) {
|
|
||||||
y0 = tableCellY1;
|
|
||||||
y1 = tableCellY0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Ruling(new Point2D.Float(x0, y0), new Point2D.Float(x1, y1));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines) {
|
|
||||||
|
|
||||||
int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
|
|
||||||
return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Ruling> collapseOrientedRulings(List<Ruling> lines, int expandAmount) {
|
|
||||||
|
|
||||||
ArrayList<Ruling> rv = new ArrayList<>();
|
|
||||||
lines.sort((a, b) -> {
|
|
||||||
final float diff = a.getPosition() - b.getPosition();
|
|
||||||
return Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
|
|
||||||
});
|
|
||||||
|
|
||||||
for (Ruling next_line : lines) {
|
|
||||||
Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
|
|
||||||
// if current line colinear with next, and are "close enough": expand current line
|
|
||||||
if (last != null && DoubleComparisons.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
|
|
||||||
final float lastStart = last.getStart();
|
|
||||||
final float lastEnd = last.getEnd();
|
|
||||||
|
|
||||||
final boolean lastFlipped = lastStart > lastEnd;
|
|
||||||
final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
|
|
||||||
|
|
||||||
boolean differentDirections = nextFlipped != lastFlipped;
|
|
||||||
float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
|
|
||||||
float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
|
|
||||||
|
|
||||||
final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
|
|
||||||
final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
|
|
||||||
last.setStartEnd(newStart, newEnd);
|
|
||||||
assert !last.oblique();
|
|
||||||
} else if (next_line.length() == 0) {
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
rv.add(next_line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,344 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
|
||||||
|
|
||||||
@Service
|
|
||||||
public class TableExtractionService {
|
|
||||||
|
|
||||||
private static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (arg0, arg1) -> {
|
|
||||||
|
|
||||||
int rv = 0;
|
|
||||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
|
||||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
|
||||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
|
||||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
|
||||||
|
|
||||||
if (arg0X > arg1X) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0X < arg1X) {
|
|
||||||
rv = -1;
|
|
||||||
} else if (arg0Y > arg1Y) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0Y < arg1Y) {
|
|
||||||
rv = -1;
|
|
||||||
}
|
|
||||||
return rv;
|
|
||||||
};
|
|
||||||
private static final Comparator<Point2D> POINT_COMPARATOR = (arg0, arg1) -> {
|
|
||||||
|
|
||||||
int rv = 0;
|
|
||||||
float arg0X = DoubleComparisons.round(arg0.getX(), 2);
|
|
||||||
float arg0Y = DoubleComparisons.round(arg0.getY(), 2);
|
|
||||||
float arg1X = DoubleComparisons.round(arg1.getX(), 2);
|
|
||||||
float arg1Y = DoubleComparisons.round(arg1.getY(), 2);
|
|
||||||
|
|
||||||
if (arg0Y > arg1Y) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0Y < arg1Y) {
|
|
||||||
rv = -1;
|
|
||||||
} else if (arg0X > arg1X) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (arg0X < arg1X) {
|
|
||||||
rv = -1;
|
|
||||||
}
|
|
||||||
return rv;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Finds tables on a page and moves textblocks into cells of the found tables.
|
|
||||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
* <p>
|
|
||||||
* DirAdj (Text direction adjusted) values can not be used here.
|
|
||||||
*
|
|
||||||
* @param cleanRulings The lines used to build the table.
|
|
||||||
* @param page Page object that contains textblocks and statistics.
|
|
||||||
*/
|
|
||||||
public void extractTables(CleanRulings cleanRulings, ClassificationPage page) {
|
|
||||||
|
|
||||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
|
||||||
|
|
||||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
|
||||||
|
|
||||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
|
||||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
|
||||||
for (Cell cell : cells) {
|
|
||||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
|
||||||
textBlock.getPdfMinY(),
|
|
||||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
|
||||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
|
||||||
cell.addTextBlock(textBlock);
|
|
||||||
toBeRemoved.add(textBlock);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cells = new ArrayList<>(new HashSet<>(cells));
|
|
||||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
|
||||||
|
|
||||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
|
|
||||||
|
|
||||||
List<TablePageBlock> tables = new ArrayList<>();
|
|
||||||
for (Rectangle area : spreadsheetAreas) {
|
|
||||||
|
|
||||||
List<Cell> overlappingCells = new ArrayList<>();
|
|
||||||
for (Cell c : cells) {
|
|
||||||
if (c.hasMinimumSize() && c.intersects(area)) {
|
|
||||||
overlappingCells.add(c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
tables.add(new TablePageBlock(overlappingCells, area, page.getRotation()));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (TablePageBlock table : tables) {
|
|
||||||
int position = -1;
|
|
||||||
|
|
||||||
Iterator<AbstractPageBlock> itty = page.getTextBlocks().iterator();
|
|
||||||
while (itty.hasNext()) {
|
|
||||||
AbstractPageBlock textBlock = itty.next();
|
|
||||||
if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) {
|
|
||||||
position = page.getTextBlocks().indexOf(textBlock);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (position != -1) {
|
|
||||||
page.getTextBlocks().add(position, table);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
page.getTextBlocks().removeAll(toBeRemoved);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
|
||||||
|
|
||||||
for (Ruling r : horizontalRulingLines) {
|
|
||||||
if (r.getX2() < r.getX1()) {
|
|
||||||
double a = r.getX2();
|
|
||||||
r.x2 = (float) r.getX1();
|
|
||||||
r.x1 = (float) a;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Cell> cellsFound = new ArrayList<>();
|
|
||||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
|
||||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
|
||||||
intersectionPointsList.sort(POINT_COMPARATOR);
|
|
||||||
|
|
||||||
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
|
||||||
Point2D topLeft = intersectionPointsList.get(i);
|
|
||||||
Ruling[] hv = intersectionPoints.get(topLeft);
|
|
||||||
|
|
||||||
// CrossingPointsDirectlyBelow( topLeft );
|
|
||||||
List<Point2D> xPoints = new ArrayList<>();
|
|
||||||
// CrossingPointsDirectlyToTheRight( topLeft );
|
|
||||||
List<Point2D> yPoints = new ArrayList<>();
|
|
||||||
|
|
||||||
for (Point2D p : intersectionPointsList.subList(i, intersectionPointsList.size())) {
|
|
||||||
if (p.getX() == topLeft.getX() && p.getY() > topLeft.getY()) {
|
|
||||||
xPoints.add(p);
|
|
||||||
}
|
|
||||||
if (p.getY() == topLeft.getY() && p.getX() > topLeft.getX()) {
|
|
||||||
yPoints.add(p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
outer:
|
|
||||||
for (Point2D xPoint : xPoints) {
|
|
||||||
// is there a vertical edge b/w topLeft and xPoint?
|
|
||||||
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (Point2D yPoint : yPoints) {
|
|
||||||
// is there an horizontal edge b/w topLeft and yPoint ?
|
|
||||||
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
|
||||||
if (intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals(
|
|
||||||
intersectionPoints.get(yPoint)[1])) {
|
|
||||||
cellsFound.add(new Cell(topLeft, btmRight));
|
|
||||||
break outer;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO create cells for vertical ruling lines with aligned endpoints at the top/bottom of a grid
|
|
||||||
// that aren't connected with an horizontal ruler?
|
|
||||||
// see: https://github.com/jazzido/tabula-extractor/issues/78#issuecomment-41481207
|
|
||||||
|
|
||||||
return cellsFound;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
|
||||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
|
||||||
List<Rectangle> rectangles = new ArrayList<>();
|
|
||||||
Set<Point2D> pointSet = new HashSet<>();
|
|
||||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
|
||||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
|
||||||
int i = 0;
|
|
||||||
|
|
||||||
for (Rectangle cell : cells) {
|
|
||||||
for (Point2D pt : cell.getPoints()) {
|
|
||||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
|
||||||
pointSet.remove(pt);
|
|
||||||
} else {
|
|
||||||
pointSet.add(pt);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// X first sort
|
|
||||||
List<Point2D> pointsSortX = new ArrayList<>(pointSet);
|
|
||||||
pointsSortX.sort(X_FIRST_POINT_COMPARATOR);
|
|
||||||
// Y first sort
|
|
||||||
List<Point2D> pointsSortY = new ArrayList<>(pointSet);
|
|
||||||
pointsSortY.sort(POINT_COMPARATOR);
|
|
||||||
|
|
||||||
while (i < pointSet.size()) {
|
|
||||||
float currY = (float) pointsSortY.get(i).getY();
|
|
||||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortY.get(i).getY(), currY)) {
|
|
||||||
edgesH.put(pointsSortY.get(i), pointsSortY.get(i + 1));
|
|
||||||
edgesH.put(pointsSortY.get(i + 1), pointsSortY.get(i));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
i = 0;
|
|
||||||
while (i < pointSet.size()) {
|
|
||||||
float currX = (float) pointsSortX.get(i).getX();
|
|
||||||
while (i < pointSet.size() && DoubleComparisons.feq(pointsSortX.get(i).getX(), currX)) {
|
|
||||||
edgesV.put(pointsSortX.get(i), pointsSortX.get(i + 1));
|
|
||||||
edgesV.put(pointsSortX.get(i + 1), pointsSortX.get(i));
|
|
||||||
i += 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get all the polygons
|
|
||||||
List<List<PolygonVertex>> polygons = new ArrayList<>();
|
|
||||||
Point2D nextVertex;
|
|
||||||
while (!edgesH.isEmpty()) {
|
|
||||||
ArrayList<PolygonVertex> polygon = new ArrayList<>();
|
|
||||||
Point2D first = edgesH.keySet().iterator().next();
|
|
||||||
polygon.add(new PolygonVertex(first, Direction.HORIZONTAL));
|
|
||||||
edgesH.remove(first);
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
PolygonVertex curr = polygon.get(polygon.size() - 1);
|
|
||||||
PolygonVertex lastAddedVertex;
|
|
||||||
if (curr.direction == Direction.HORIZONTAL) {
|
|
||||||
nextVertex = edgesV.get(curr.point);
|
|
||||||
edgesV.remove(curr.point);
|
|
||||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.VERTICAL);
|
|
||||||
} else {
|
|
||||||
nextVertex = edgesH.get(curr.point);
|
|
||||||
edgesH.remove(curr.point);
|
|
||||||
lastAddedVertex = new PolygonVertex(nextVertex, Direction.HORIZONTAL);
|
|
||||||
}
|
|
||||||
polygon.add(lastAddedVertex);
|
|
||||||
|
|
||||||
if (lastAddedVertex.equals(polygon.get(0))) {
|
|
||||||
// closed polygon
|
|
||||||
polygon.remove(polygon.size() - 1);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (PolygonVertex vertex : polygon) {
|
|
||||||
edgesH.remove(vertex.point);
|
|
||||||
edgesV.remove(vertex.point);
|
|
||||||
}
|
|
||||||
polygons.add(polygon);
|
|
||||||
}
|
|
||||||
|
|
||||||
// calculate grid-aligned minimum area rectangles for each found polygon
|
|
||||||
for (List<PolygonVertex> poly : polygons) {
|
|
||||||
float top = Float.MAX_VALUE;
|
|
||||||
float left = Float.MAX_VALUE;
|
|
||||||
float bottom = Float.MIN_VALUE;
|
|
||||||
float right = Float.MIN_VALUE;
|
|
||||||
for (PolygonVertex pt : poly) {
|
|
||||||
top = (float) Math.min(top, pt.point.getY());
|
|
||||||
left = (float) Math.min(left, pt.point.getX());
|
|
||||||
bottom = (float) Math.max(bottom, pt.point.getY());
|
|
||||||
right = (float) Math.max(right, pt.point.getX());
|
|
||||||
}
|
|
||||||
rectangles.add(new Rectangle(top, left, right - left, bottom - top));
|
|
||||||
}
|
|
||||||
|
|
||||||
return rectangles;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private enum Direction {
|
|
||||||
HORIZONTAL,
|
|
||||||
VERTICAL
|
|
||||||
}
|
|
||||||
|
|
||||||
static class PolygonVertex {
|
|
||||||
|
|
||||||
Point2D point;
|
|
||||||
Direction direction;
|
|
||||||
|
|
||||||
|
|
||||||
PolygonVertex(Point2D point, Direction direction) {
|
|
||||||
|
|
||||||
this.direction = direction;
|
|
||||||
this.point = point;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object other) {
|
|
||||||
|
|
||||||
if (this == other) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (!(other instanceof PolygonVertex)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return this.point.equals(((PolygonVertex) other).point);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
|
|
||||||
return this.point.hashCode();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
return String.format("%s[point=%s,direction=%s]", this.getClass().getName(), this.point.toString(), this.direction.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,31 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
|
|
||||||
public final class TextNormalizationUtilities {

    // Utility class; never instantiated. (Replaces the lombok @UtilityClass
    // annotation, which was redundant: the class is already final and all
    // methods are already static.)
    private TextNormalizationUtilities() {
        throw new UnsupportedOperationException("utility class");
    }

    /**
     * Revert hyphenation due to line breaks.
     *
     * @param text Text to be processed.
     * @return Text without line-break hyphenation.
     */
    public static String removeHyphenLineBreaks(String text) {
        // A word fragment (no spaces/digits/hyphens) followed by a hyphen or a
        // soft hyphen (U+00AD) at a line break is re-joined with the next line.
        return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1");
    }

    /**
     * Replaces every newline character with a single space.
     *
     * @param text Text to be processed.
     * @return Text with newlines replaced by spaces.
     */
    public static String removeLineBreaks(String text) {
        // Literal replacement; no regex needed here.
        return text.replace("\n", " ");
    }

    /**
     * Collapses every run of two or more spaces into a single space.
     *
     * @param text Text to be processed.
     * @return Text without repeated spaces.
     */
    public static String removeRepeatingWhitespaces(String text) {
        // Fixed: the previous pattern " {2}" only replaced non-overlapping PAIRS
        // of spaces, so a run of 3 spaces collapsed to 2 instead of 1.
        return text.replaceAll(" {2,}", " ");
    }

}
|
|
||||||
@ -0,0 +1,98 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class DocstrumSegmentationService {
|
||||||
|
|
||||||
|
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
|
||||||
|
private final NearestNeighbourService nearestNeighbourService;
|
||||||
|
private final SpacingService spacingService;
|
||||||
|
private final LineBuilderService lineBuilderService;
|
||||||
|
private final ZoneBuilderService zoneBuilderService;
|
||||||
|
private final ReadingOrderService readingOrderService;
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
|
||||||
|
|
||||||
|
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
|
||||||
|
|
||||||
|
List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
|
||||||
|
directionCounts.put(TextDirection.ZERO, newZones.size());
|
||||||
|
List<Zone> zones = new ArrayList<>(newZones);
|
||||||
|
|
||||||
|
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
|
||||||
|
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
|
||||||
|
zones.addAll(newZones);
|
||||||
|
|
||||||
|
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
|
||||||
|
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
|
||||||
|
zones.addAll(newZones);
|
||||||
|
|
||||||
|
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
|
||||||
|
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
|
||||||
|
zones.addAll(newZones);
|
||||||
|
|
||||||
|
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
|
||||||
|
|
||||||
|
int total = directionCounts.values()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(i -> i).sum();
|
||||||
|
|
||||||
|
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
|
||||||
|
return true;
|
||||||
|
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||||
|
return true;
|
||||||
|
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||||
|
return true;
|
||||||
|
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
||||||
|
|
||||||
|
List<Character> characters = textPositions.stream()
|
||||||
|
.filter(t -> t.getDir() == direction)
|
||||||
|
.map(Word::getCharacters)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
nearestNeighbourService.findNearestNeighbors(characters);
|
||||||
|
|
||||||
|
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||||
|
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||||
|
|
||||||
|
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||||
|
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,31 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
public class AngleFilter {
|
||||||
|
|
||||||
|
protected double lowerAngle;
|
||||||
|
protected double upperAngle;
|
||||||
|
|
||||||
|
|
||||||
|
public AngleFilter(double lowerAngle, double upperAngle) {
|
||||||
|
|
||||||
|
this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle;
|
||||||
|
this.upperAngle = upperAngle >= Math.PI / 2 ? upperAngle - Math.PI : upperAngle;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean matches(Neighbor neighbor) {
|
||||||
|
|
||||||
|
return matches(neighbor.getAngle());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean matches(double angle) {
|
||||||
|
|
||||||
|
if (lowerAngle <= upperAngle) {
|
||||||
|
return lowerAngle <= angle && angle < upperAngle;
|
||||||
|
} else {
|
||||||
|
return lowerAngle <= angle || angle < upperAngle;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,279 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@NoArgsConstructor
|
||||||
|
public abstract class BoundingBox {
|
||||||
|
|
||||||
|
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
|
||||||
|
// should be used when determining reading order or other tasks which require coordinates in a harmonized system.
|
||||||
|
protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.
|
||||||
|
|
||||||
|
// PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
|
||||||
|
// This rotates completely in 90 degree steps with page rotation.
|
||||||
|
// Needs to be used when writing to a PDF.
|
||||||
|
// Also, these are definitely correct and should be used whenever possible.
|
||||||
|
protected Rectangle2D bBoxPdf;
|
||||||
|
|
||||||
|
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||||
|
|
||||||
|
|
||||||
|
public double getX() {
|
||||||
|
|
||||||
|
return bBox.getX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getY() {
|
||||||
|
|
||||||
|
return bBox.getY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMinX() {
|
||||||
|
|
||||||
|
return bBox.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMinY() {
|
||||||
|
|
||||||
|
return bBox.getMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMinX() {
|
||||||
|
|
||||||
|
return bBoxPdf.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMaxX() {
|
||||||
|
|
||||||
|
return bBoxPdf.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMinY() {
|
||||||
|
|
||||||
|
return bBoxPdf.getMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMaxY() {
|
||||||
|
|
||||||
|
return bBoxPdf.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getWidth() {
|
||||||
|
|
||||||
|
return bBox.getWidth();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHeight() {
|
||||||
|
|
||||||
|
return bBox.getHeight();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMaxX() {
|
||||||
|
|
||||||
|
return bBox.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMaxY() {
|
||||||
|
|
||||||
|
return bBox.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getArea() {
|
||||||
|
|
||||||
|
return (bBox.getHeight() * bBox.getWidth());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(BoundingBox contained) {
|
||||||
|
|
||||||
|
return contains(contained, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(BoundingBox contained, double tolerance) {
|
||||||
|
|
||||||
|
return getPdfMinX() <= contained.getPdfMinX() + tolerance
|
||||||
|
&& getPdfMinY() <= contained.getPdfMinY() + tolerance
|
||||||
|
&& getPdfMaxX() >= contained.getPdfMaxX() - tolerance
|
||||||
|
&& getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersects(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsX(other) && this.intersectsY(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {
|
||||||
|
|
||||||
|
return this.intersectsX(other, xThreshold) && this.intersectsY(other, yThreshold);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsPdf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXPdf(other) && this.intersectsYPdf(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) {
|
||||||
|
|
||||||
|
return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYPdf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsY(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYPdf(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsXPdf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsX(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsXPdf(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
|
||||||
|
|
||||||
|
this.bBox = components.stream()
|
||||||
|
.map(BoundingBox::getBBox)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
this.bBoxPdf = components.stream()
|
||||||
|
.map(BoundingBox::getBBoxPdf)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalOverlap(BoundingBox other) {
|
||||||
|
|
||||||
|
return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
|
||||||
|
|
||||||
|
if (o1.equals(o2)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
|
||||||
|
return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
|
||||||
|
} else {
|
||||||
|
return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistance(BoundingBox other) {
|
||||||
|
|
||||||
|
double rect1Right = getMaxX();
|
||||||
|
double rect1Left = getMinX();
|
||||||
|
double rect2Right = other.getMaxX();
|
||||||
|
double rect2Left = other.getMinX();
|
||||||
|
|
||||||
|
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||||
|
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistance(BoundingBox other) {
|
||||||
|
|
||||||
|
double rect1Top = getMaxY();
|
||||||
|
double rect1Bottom = getMinY();
|
||||||
|
double rect2Top = other.getMaxY();
|
||||||
|
double rect2Bottom = other.getMinY();
|
||||||
|
|
||||||
|
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||||
|
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean rightOf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsY(other) && other.getMaxX() <= this.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean leftOf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsY(other) && other.getMinX() >= this.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isAbove(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsX(other) && other.getMinY() >= this.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isBelow(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,86 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||||
|
public class Character {
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Include
|
||||||
|
private final double x;
|
||||||
|
@EqualsAndHashCode.Include
|
||||||
|
private final double y;
|
||||||
|
private final RedTextPosition textPosition;
|
||||||
|
|
||||||
|
@Setter
|
||||||
|
private List<Neighbor> neighbors = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
public Character(RedTextPosition chunk) {
|
||||||
|
|
||||||
|
this.x = chunk.getBBoxDirAdj().getCenterX();
|
||||||
|
this.y = chunk.getBBoxDirAdj().getCenterY();
|
||||||
|
this.textPosition = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHeight() {
|
||||||
|
|
||||||
|
return textPosition.getHeightDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double distance(Character character) {
|
||||||
|
|
||||||
|
double dx = getX() - character.getX();
|
||||||
|
double dy = getY() - character.getY();
|
||||||
|
return Math.sqrt(dx * dx + dy * dy);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistance(Character character) {
|
||||||
|
|
||||||
|
return Math.abs(getX() - character.getX());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistance(Character character) {
|
||||||
|
|
||||||
|
return Math.abs(getY() - character.getY());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double overlappingDistance(Character other) {
|
||||||
|
|
||||||
|
double[] xs = new double[4];
|
||||||
|
double s = Math.sin(-0);
|
||||||
|
double c = Math.cos(-0);
|
||||||
|
xs[0] = c * x - s * y;
|
||||||
|
xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDirAdj());
|
||||||
|
xs[2] = c * other.x - s * other.y;
|
||||||
|
xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDirAdj());
|
||||||
|
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||||
|
Arrays.sort(xs);
|
||||||
|
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double angle(Character character) {
|
||||||
|
|
||||||
|
if (getX() > character.getX()) {
|
||||||
|
return FastAtan2.fastAtan2(getY() - character.getY(), getX() - character.getX());
|
||||||
|
} else {
|
||||||
|
return FastAtan2.fastAtan2(character.getY() - getY(), character.getX() - getX());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,324 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
WIP, mostly working, needs to be tested a bit more
|
||||||
|
*/
|
||||||
|
public class ColumnDetector {
|
||||||
|
|
||||||
|
public static final double MAX_VALUE_THRESHOLD = 0.5;
|
||||||
|
final static int bins_num = 512;
|
||||||
|
final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there
|
||||||
|
final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those.
|
||||||
|
public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
|
||||||
|
public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
|
||||||
|
public static final double NEAR_GLOBAL_THRESHOLD = 0.5;
|
||||||
|
double minY;
|
||||||
|
double maxY;
|
||||||
|
double midY;
|
||||||
|
double[] histogram;
|
||||||
|
double min;
|
||||||
|
double max;
|
||||||
|
double resolution;
|
||||||
|
double sum;
|
||||||
|
int N;
|
||||||
|
|
||||||
|
|
||||||
|
public ColumnDetector(double min, double max, double minY, double maxY) {
|
||||||
|
|
||||||
|
this.min = min;
|
||||||
|
this.max = max;
|
||||||
|
this.minY = minY;
|
||||||
|
this.maxY = maxY;
|
||||||
|
this.midY = maxY - minY;
|
||||||
|
this.resolution = (max - min) / bins_num;
|
||||||
|
this.histogram = new double[bins_num];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void add(BoundingBox zone) {
|
||||||
|
|
||||||
|
N++;
|
||||||
|
double weight = computeWeight(zone);
|
||||||
|
int start = (int) ((zone.getMinX() - min) / resolution);
|
||||||
|
int end = (int) ((zone.getMaxX() - min) / resolution);
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
histogram[i] += weight;
|
||||||
|
sum += histogram[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double computeWeight(BoundingBox zone) {
|
||||||
|
|
||||||
|
double areaWeight = zone.getBBox().getHeight();
|
||||||
|
|
||||||
|
double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());
|
||||||
|
|
||||||
|
double distanceWeight;
|
||||||
|
if (relativeDistance < 0.6) {
|
||||||
|
distanceWeight = 1;
|
||||||
|
} else if (relativeDistance < 0.8) {
|
||||||
|
distanceWeight = 0.8;
|
||||||
|
} else {
|
||||||
|
distanceWeight = 0.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return areaWeight * distanceWeight;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double relativeDistanceToMiddle(double y) {
|
||||||
|
|
||||||
|
double range = (maxY - minY) / 2;
|
||||||
|
double mid = minY + range;
|
||||||
|
|
||||||
|
return Math.abs(y - mid) / range;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double[] computeDerivative() {
|
||||||
|
|
||||||
|
int length = histogram.length;
|
||||||
|
double[] derivative = new double[length];
|
||||||
|
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
if (i == 0) {
|
||||||
|
derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
|
||||||
|
} else if (i == length - 1) {
|
||||||
|
derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
|
||||||
|
} else {
|
||||||
|
derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return derivative;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double calcMean(double[] arr, int start, int end) {
|
||||||
|
|
||||||
|
if (start == end) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
double sum = 0;
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
sum += arr[i];
|
||||||
|
}
|
||||||
|
return sum / (end - start);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values.
|
||||||
|
For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider.
|
||||||
|
Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative.
|
||||||
|
*/
|
||||||
|
public List<Double> determineColumnsWithDerivative(double[] derivative) {
|
||||||
|
|
||||||
|
assert derivative.length == histogram.length;
|
||||||
|
|
||||||
|
Set<Integer> columnIndices = new HashSet<>();
|
||||||
|
double mean = calcMean(histogram, 0, histogram.length);
|
||||||
|
double maxDvValue = calcMax(derivative);
|
||||||
|
double minDvValue = calcMin(derivative);
|
||||||
|
|
||||||
|
if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
|
||||||
|
Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);
|
||||||
|
|
||||||
|
List<Integer> columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean);
|
||||||
|
columnIndices.addAll(columnsRightOfMinima);
|
||||||
|
|
||||||
|
List<Integer> columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean);
|
||||||
|
columnIndices.addAll(columnsLeftOfMaxima);
|
||||||
|
|
||||||
|
return columnIndices.stream()
|
||||||
|
.sorted(Comparator.naturalOrder())
|
||||||
|
.map(this::calculateXCoordinateFromIdx)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {
|
||||||
|
|
||||||
|
List<Integer> columnsLeftOfMaxima = new ArrayList<>();
|
||||||
|
|
||||||
|
for (int i = 0; i < derivativeMaxima.size(); i++) {
|
||||||
|
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||||
|
boolean maximumFound = false;
|
||||||
|
int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
|
||||||
|
int endIdx = (int) Math.max(globalStartIdx,
|
||||||
|
Math.min(maximaIdx - 1,
|
||||||
|
maximaIdx - 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the left edge;
|
||||||
|
|
||||||
|
for (int j = maximaIdx; j >= endIdx; j--) {
|
||||||
|
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||||
|
maximumFound = true;
|
||||||
|
consecutiveZeroes.add(j);
|
||||||
|
} else if (maximumFound) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (maximumFound) {
|
||||||
|
int midIdx = consecutiveZeroes.size() / 2;
|
||||||
|
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||||
|
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||||
|
columnsLeftOfMaxima.add(middleMinimumIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return columnsLeftOfMaxima;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {
|
||||||
|
|
||||||
|
List<Integer> columnIndixes = new LinkedList<>();
|
||||||
|
for (int i = 0; i < derivativeMinima.size(); i++) {
|
||||||
|
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||||
|
boolean minimumFound = false;
|
||||||
|
int minimaIdx = derivativeMinima.get(i) + 1; // the highest derivative will always be at least one step earlier than the lowest value.
|
||||||
|
int endIdx = (int) Math.min(globalEndIdx,
|
||||||
|
Math.max(minimaIdx + 1,
|
||||||
|
minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge;
|
||||||
|
|
||||||
|
for (int j = minimaIdx; j < endIdx; j++) {
|
||||||
|
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||||
|
minimumFound = true;
|
||||||
|
consecutiveZeroes.add(j);
|
||||||
|
} else if (minimumFound) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (minimumFound) {
|
||||||
|
int midIdx = consecutiveZeroes.size() / 2;
|
||||||
|
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||||
|
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||||
|
columnIndixes.add(middleMinimumIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return columnIndixes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calcMax(double[] array) {
|
||||||
|
|
||||||
|
double max = Double.NEGATIVE_INFINITY;
|
||||||
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] > max) {
|
||||||
|
max = array[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return max;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calcMin(double[] array) {
|
||||||
|
|
||||||
|
double min = Double.POSITIVE_INFINITY;
|
||||||
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] < min) {
|
||||||
|
min = array[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return min;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {
|
||||||
|
|
||||||
|
List<Integer> nearGlobalDvMaximaIdx = new LinkedList<>();
|
||||||
|
List<Integer> nearGlobalDvMinimaIdx = new LinkedList<>();
|
||||||
|
for (int i = globalStartIdx; i < globalEndIdx; i++) {
|
||||||
|
if (derivative[i] <= minDvValue * NEAR_GLOBAL_THRESHOLD) {
|
||||||
|
nearGlobalDvMinimaIdx.add(i);
|
||||||
|
}
|
||||||
|
if (derivative[i] >= maxDvValue * NEAR_GLOBAL_THRESHOLD) {
|
||||||
|
nearGlobalDvMaximaIdx.add(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx);
|
||||||
|
nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx);
|
||||||
|
|
||||||
|
return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private record Extrema(List<Integer> maxima, List<Integer> minima) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Double calculateXCoordinateFromIdx(int globalMinIdx) {
|
||||||
|
|
||||||
|
return min + ((globalMinIdx + 1) * resolution);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static List<Integer> removeConsecutive(List<Integer> numbers) {
|
||||||
|
|
||||||
|
List<Integer> result = new ArrayList<>();
|
||||||
|
if (numbers == null || numbers.isEmpty()) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.add(numbers.get(0)); // Add the first number
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
if (numbers.get(i) != numbers.get(i - 1) + 1) {
|
||||||
|
result.add(numbers.get(i)); // Add non-consecutive numbers
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void kernelSmooth(double[] kernel) {
|
||||||
|
|
||||||
|
double[] newFrequencies = new double[histogram.length];
|
||||||
|
int shift = (kernel.length - 1) / 2;
|
||||||
|
for (int i = 0; i < kernel.length; i++) {
|
||||||
|
int jStart = Math.max(0, i - shift);
|
||||||
|
int jEnd = Math.min(histogram.length, histogram.length + i - shift);
|
||||||
|
for (int j = jStart; j < jEnd; j++) {
|
||||||
|
newFrequencies[j - i + shift] += kernel[i] * histogram[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
histogram = newFrequencies;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double[] createGaussianKernel(int length, double stdDeviation) {
|
||||||
|
|
||||||
|
int r = length / 2;
|
||||||
|
|
||||||
|
int size = 2 * r + 1;
|
||||||
|
double[] kernel = new double[size];
|
||||||
|
double sum = 0;
|
||||||
|
double b = 2 * (stdDeviation) * (stdDeviation);
|
||||||
|
double a = 1 / Math.sqrt(Math.PI * b);
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
|
||||||
|
sum += kernel[i];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
kernel[i] /= sum;
|
||||||
|
}
|
||||||
|
return kernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,90 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
/**
 * Fixed-resolution histogram over a closed value range with optional Gaussian smoothing.
 * Values are binned on {@link #add}, and {@link #getPeakValue} reports the position of
 * the highest bin (plateau midpoint when several adjacent bins tie).
 */
public class Histogram {

    /** Small padding so that values exactly at the range edges fall into a valid bin. */
    private static final double EPSILON = 1.0e-6;
    /** Lower edge of bin 0 (the requested minimum, padded by EPSILON). */
    private final double min;
    /** Effective width of a single bin after padding. */
    private final double resolution;
    /** Per-bin weights; replaced wholesale by smoothing. */
    private double[] frequencies;


    /**
     * Creates a histogram covering [minValue, maxValue] with approximately the requested
     * bin width; the actual width is adjusted so the padded range divides evenly.
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        int binCount = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        double paddedRange = maxValue - minValue + 2 * EPSILON;
        this.min = minValue - EPSILON;
        this.resolution = paddedRange / binCount;
        this.frequencies = new double[binCount];
    }


    /**
     * Convolves the bin values with the given kernel (centered), clamping at the array
     * edges, and replaces the frequencies with the smoothed result.
     */
    public void kernelSmooth(double[] kernel) {

        int half = (kernel.length - 1) / 2;
        double[] smoothed = new double[frequencies.length];
        for (int k = 0; k < kernel.length; k++) {
            int from = Math.max(0, k - half);
            int to = Math.min(frequencies.length, frequencies.length + k - half);
            for (int src = from; src < to; src++) {
                smoothed[src - k + half] += kernel[k] * frequencies[src];
            }
        }
        frequencies = smoothed;
    }


    /**
     * Builds a normalized (sums to 1) Gaussian kernel whose window spans roughly
     * {@code length} value-units, with the standard deviation given in value-units;
     * both are converted to bins via the histogram resolution.
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int radius = (int) Math.round(length / resolution) / 2;

        int width = 2 * radius + 1;
        double variance2 = 2 * (stdDeviation / resolution) * (stdDeviation / resolution);
        double norm = 1 / Math.sqrt(Math.PI * variance2);
        double[] kernel = new double[width];
        double total = 0;
        for (int i = 0; i < width; i++) {
            kernel[i] = norm * Math.exp(-(i - radius) * (i - radius) / variance2);
            total += kernel[i];
        }
        for (int i = 0; i < width; i++) {
            kernel[i] /= total; // normalize so smoothing preserves total mass
        }
        return kernel;
    }


    /** Convenience wrapper: smooths the histogram with a Gaussian kernel. */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /** Adds one observation by incrementing the bin the value falls into. */
    public void add(double value) {

        int bin = (int) ((value - min) / resolution);
        frequencies[bin] += 1.0;
    }


    /** Number of bins. */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Returns the value (in original units) at the center of the highest bin; when the
     * maximum forms a plateau of (nearly) equal adjacent bins, the plateau midpoint is
     * returned instead.
     */
    public double getPeakValue() {

        final double PLATEAU_EPS = 0.0001;
        int peakStart = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[peakStart]) {
                peakStart = i;
            }
        }
        int peakEnd = peakStart + 1;
        while (peakEnd < frequencies.length && Math.abs(frequencies[peakEnd] - frequencies[peakStart]) < PLATEAU_EPS) {
            peakEnd++;
        }
        return ((double) peakStart + peakEnd) / 2 * resolution + min;
    }

}
|
||||||
@ -0,0 +1,194 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD_ITALIC;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.ITALIC;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.STANDARD;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.EnumMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
|
||||||
|
/**
 * A text line produced by docstrum clustering: a straight segment (x0,y0)-(x1,y1)
 * fitted through its characters, the words split off along that segment, and the
 * line's dominant font style. Equality is based only on the segment endpoints.
 */
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox {

    // Fraction of the page-level word spacing used as the per-line word-split threshold.
    private static final double WORD_DISTANCE_MULTIPLIER = 0.17;

    @EqualsAndHashCode.Include
    private final double x0;
    @EqualsAndHashCode.Include
    private final double y0;

    @EqualsAndHashCode.Include
    private final double x1;
    @EqualsAndHashCode.Include
    private final double y1;

    // Dominant style across the line's words; set by computeFontStyle().
    private FontStyle fontStyle;

    private final List<Word> words;


    /**
     * Builds a line from characters (expected ordered left-to-right — callers sort by x;
     * x0/x1 are taken from the first/last element). With two or more characters a
     * least-squares line is fitted through their (x, y) anchors; a single character gets
     * a short horizontal segment around it. Afterwards the characters are split into
     * words, the bounding box is rebuilt and the dominant font style is determined —
     * this call order matters, as each step feeds the next.
     *
     * @param characters  non-empty, x-ordered characters of the line
     * @param wordSpacing page-level word spacing; scaled by WORD_DISTANCE_MULTIPLIER
     */
    public Line(List<Character> characters, double wordSpacing) {

        if (characters.size() >= 2) {
            // linear regression
            double sx = 0.0;
            double sxx = 0.0;
            double sxy = 0.0;
            double sy = 0.0;
            for (Character character : characters) {
                sx += character.getX();
                sxx += character.getX() * character.getX();
                sxy += character.getX() * character.getY();
                sy += character.getY();
            }
            // NOTE(review): if all characters share the same x, the denominator is 0 and b
            // becomes NaN/Infinity — confirm this cannot happen upstream (vertical text?).
            double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
            double a = (sy - b * sx) / characters.size();

            this.x0 = characters.get(0).getX();
            this.y0 = a + b * this.x0;
            this.x1 = characters.get(characters.size() - 1).getX();
            this.y1 = a + b * this.x1;
        } else {
            // Single character (an empty list would throw IndexOutOfBoundsException here):
            // synthesize a segment one third of the glyph width to each side.
            Character character = characters.get(0);
            double dx = character.getTextPosition().getWidthDirAdj() / 3;
            // NOTE(review): Math.tan(0) is always 0, so the fallback segment is horizontal —
            // presumably a placeholder for a real skew angle; confirm.
            double dy = dx * Math.tan(0);
            this.x0 = character.getX() - dx;
            this.x1 = character.getX() + dx;
            this.y0 = character.getY() - dy;
            this.y1 = character.getY() + dy;
        }
        this.words = new ArrayList<>();
        computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER);
        buildBBox();
        computeFontStyle();
    }


    /**
     * Builds a line directly from pre-formed words; the segment endpoints are taken
     * from the corners of the words' combined bounding box.
     */
    public Line(List<Word> words) {

        this.words = words;
        buildBBox();
        x0 = getMinX();
        y0 = getMinY();
        x1 = getMaxX();
        y1 = getMaxY();
        computeFontStyle();
    }


    /**
     * Majority vote over the words' font styles (Word style constants mapped onto
     * FontStyle); falls back to REGULAR when there are no words.
     */
    private void computeFontStyle() {

        EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
        for (FontStyle fontStyle : FontStyle.values()) {
            fontStyleCounter.put(fontStyle, new AtomicInteger(0));
        }
        for (Word word : words) {
            switch (word.getFontStyle()) {
                case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
                case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
                case ITALIC -> fontStyleCounter.get(FontStyle.ITALIC).getAndIncrement();
                case BOLD_ITALIC -> fontStyleCounter.get(FontStyle.BOLD_ITALIC).getAndIncrement();
            }
        }
        fontStyle = fontStyleCounter.entrySet()
                .stream()
                .max(Comparator.comparing(entry -> entry.getValue().get()))
                .map(Map.Entry::getKey).orElse(FontStyle.REGULAR);
    }


    /** Angle of the fitted segment in radians (fast atan2 approximation). */
    public double getAngle() {

        return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
    }


    /** Euclidean length of the fitted segment. */
    public double getLength() {

        return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
    }


    /**
     * Smallest angle between this line and j, folded into [0, PI/2]
     * (direction-insensitive).
     */
    public double angularDifference(Line j) {

        double diff = Math.abs(getAngle() - j.getAngle());
        if (diff <= Math.PI / 2) {
            return diff;
        } else {
            return Math.PI - diff;
        }
    }


    /**
     * Signed horizontal relation of the two segments' x-extents: the overlap length
     * (positive) when they overlap, otherwise the gap between them negated.
     */
    public double horizontalDistance(Line other) {

        double[] xs = new double[4];
        xs[0] = x0;
        xs[1] = x1;
        xs[2] = other.x0;
        xs[3] = other.x1;
        boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
        Arrays.sort(xs);
        // after sorting, the middle interval is the overlap (or the gap)
        return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
    }


    /** Absolute distance between the two segments' vertical midpoints. */
    public double verticalDistance(Line other) {

        double ym = (y0 + y1) / 2;
        double yn = (other.y0 + other.y1) / 2;
        return Math.abs(ym - yn);
    }


    /**
     * Splits the (x-ordered) characters into words: a new word starts whenever the
     * horizontal gap between consecutive glyphs exceeds wordSpacing.
     */
    private void computeWords(List<Character> characters, double wordSpacing) {

        // Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
        // If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
        // I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
        Word word = new Word();
        Character previous = null;
        for (Character current : characters) {
            if (previous != null) {
                // gap = left edge of current minus right edge of previous (DirAdj space)
                double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
                if (dist > wordSpacing) {
                    words.add(word);
                    word = new Word();
                }
            }
            word.add(current);
            previous = current;
        }
        words.add(word);
    }


    /** Rebuilds this line's bounding box from its words. */
    private void buildBBox() {

        this.setToBBoxOfComponents(words);
    }


    /** Words joined by single spaces, without surrounding whitespace. */
    public String toString() {

        StringBuilder sb = new StringBuilder();
        words.forEach(word -> sb.append(word.toString()).append(" "));
        return sb.toString().trim();
    }

}
|
||||||
|
|
||||||
@ -0,0 +1,43 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
|
public class Neighbor {
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
private final double distance;
|
||||||
|
private Double angle;
|
||||||
|
private final Character originCharacter;
|
||||||
|
@Getter
|
||||||
|
private final Character character;
|
||||||
|
|
||||||
|
|
||||||
|
public Neighbor(Character neighbor, Character origin) {
|
||||||
|
|
||||||
|
this.distance = neighbor.distance(origin);
|
||||||
|
this.character = neighbor;
|
||||||
|
this.originCharacter = origin;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHorizontalDistance() {
|
||||||
|
|
||||||
|
return character.horizontalDistance(originCharacter);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getVerticalDistance() {
|
||||||
|
|
||||||
|
return character.verticalDistance(originCharacter);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getAngle() {
|
||||||
|
|
||||||
|
if (angle != null) {
|
||||||
|
return angle;
|
||||||
|
}
|
||||||
|
return this.character.angle(this.originCharacter);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,180 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
@SuperBuilder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@EqualsAndHashCode(callSuper = false)
|
||||||
|
public abstract class TextBoundingBox extends BoundingBox {
|
||||||
|
|
||||||
|
protected Rectangle2D bBoxDirAdj;
|
||||||
|
|
||||||
|
protected TextDirection dir;
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
|
||||||
|
|
||||||
|
super.setToBBoxOfComponents(components);
|
||||||
|
this.bBoxDirAdj = components.stream()
|
||||||
|
.filter(c -> c instanceof TextBoundingBox)
|
||||||
|
.map(c -> (TextBoundingBox) c)
|
||||||
|
.map(TextBoundingBox::getBBoxDirAdj)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
Set<TextDirection> textDirections = components.stream()
|
||||||
|
.filter(c -> c instanceof TextBoundingBox)
|
||||||
|
.map(c -> (TextBoundingBox) c)
|
||||||
|
.map(TextBoundingBox::getDir)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
|
||||||
|
if (textDirections.isEmpty()) {
|
||||||
|
dir = TextDirection.ZERO;
|
||||||
|
} else if (textDirections.size() > 1) {
|
||||||
|
throw new IllegalArgumentException("More than one text direction found");
|
||||||
|
} else {
|
||||||
|
dir = textDirections.iterator().next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getXDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getYDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getWidthDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getWidth();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getHeightDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getHeight();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMaxXDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMaxYDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getCenterYDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getCenterY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getCenterXDirAdj() {
|
||||||
|
|
||||||
|
return this.bBoxDirAdj.getCenterX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistanceDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
double rect1Right = getMaxXDirAdj();
|
||||||
|
double rect1Left = getXDirAdj();
|
||||||
|
double rect2Right = other.getMaxXDirAdj();
|
||||||
|
double rect2Left = other.getXDirAdj();
|
||||||
|
|
||||||
|
if (rect1Left > rect2Right || rect2Left > rect1Right) {
|
||||||
|
return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistanceDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
double rect1Top = getMaxYDirAdj();
|
||||||
|
double rect1Bottom = getYDirAdj();
|
||||||
|
double rect2Top = other.getMaxYDirAdj();
|
||||||
|
double rect2Bottom = other.getYDirAdj();
|
||||||
|
|
||||||
|
if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
|
||||||
|
return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {
|
||||||
|
|
||||||
|
return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsXDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isAboveDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return other.isBelow(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isBelowDirAdj(TextBoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,37 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {
|
||||||
|
|
||||||
|
public UnionFind(Set<T> elements) {
|
||||||
|
|
||||||
|
super(elements);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Collection<Set<T>> getGroups() {
|
||||||
|
|
||||||
|
Map<T, Set<T>> setRep = new LinkedHashMap<>();
|
||||||
|
for (T t : getParentMap().keySet()) {
|
||||||
|
T representative = find(t);
|
||||||
|
if (!setRep.containsKey(representative)) {
|
||||||
|
setRep.put(representative, new LinkedHashSet<>());
|
||||||
|
}
|
||||||
|
setRep.get(representative).add(t);
|
||||||
|
}
|
||||||
|
|
||||||
|
return setRep.values();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Collection<T> getElements() {
|
||||||
|
|
||||||
|
return getParentMap().keySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,33 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@EqualsAndHashCode(callSuper = false)
|
||||||
|
public class Zone extends TextBoundingBox {
|
||||||
|
|
||||||
|
private List<Line> lines;
|
||||||
|
|
||||||
|
|
||||||
|
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||||
|
public Zone(List<Line> lines) {
|
||||||
|
|
||||||
|
this.lines = lines;
|
||||||
|
setToBBoxOfComponents(lines);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
lines.forEach(line -> sb.append(line.toString()).append("\n"));
|
||||||
|
return sb.toString().trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,58 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class LineBuilderService {
|
||||||
|
|
||||||
|
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||||
|
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
|
||||||
|
private static final double ANGLE_TOLERANCE = Math.toRadians(5);
|
||||||
|
|
||||||
|
|
||||||
|
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||||
|
|
||||||
|
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||||
|
double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;
|
||||||
|
|
||||||
|
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
|
||||||
|
|
||||||
|
AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||||
|
|
||||||
|
characters.forEach(character -> {
|
||||||
|
character.getNeighbors()
|
||||||
|
.forEach(neighbor -> {
|
||||||
|
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||||
|
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||||
|
|
||||||
|
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|
||||||
|
|| !angleFilter.matches(neighbor) //
|
||||||
|
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|
||||||
|
|| rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
unionFind.union(character, neighbor.getCharacter());
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return unionFind.getGroups()
|
||||||
|
.stream()
|
||||||
|
.map(lineCharacters -> lineCharacters.stream()
|
||||||
|
.sorted(Comparator.comparingDouble(Character::getX))
|
||||||
|
.toList())
|
||||||
|
.map(lineCharacters -> new Line(lineCharacters, characterSpacing))
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,36 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
import org.tinspin.index.Index;
|
||||||
|
import org.tinspin.index.kdtree.KDTree;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class NearestNeighbourService {
|
||||||
|
|
||||||
|
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||||
|
|
||||||
|
|
||||||
|
public void findNearestNeighbors(List<Character> characters) {
|
||||||
|
KDTree<Character> kdTree = KDTree.create(2);
|
||||||
|
characters.forEach(c -> kdTree.insert(new double[]{c.getX(), c.getY()}, c));
|
||||||
|
|
||||||
|
for(Character c : characters) {
|
||||||
|
Index.PointIteratorKnn<Character> iterator = kdTree.queryKnn(new double[]{c.getX(), c.getY()}, NUMBER_OF_NEIGHBOURS + 1);
|
||||||
|
// skip the first as this is identity
|
||||||
|
if(iterator.hasNext()) {
|
||||||
|
iterator.next();
|
||||||
|
}
|
||||||
|
while(iterator.hasNext()) {
|
||||||
|
c.getNeighbors().add(new Neighbor(iterator.next().value(), c));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@ -0,0 +1,192 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class ReadingOrderService {
|
||||||
|
|
||||||
|
private static final double THRESHOLD = 5;
|
||||||
|
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||||
|
|
||||||
|
private static final Comparator<TextBoundingBox> COMPARATOR = //
|
||||||
|
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
|
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||||
|
|
||||||
|
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
|
||||||
|
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
|
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||||
|
|
||||||
|
|
||||||
|
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
|
||||||
|
|
||||||
|
if (zones.isEmpty() || zones.size() == 1) {
|
||||||
|
return zones;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (xyReadingOrder) {
|
||||||
|
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<Long, Integer> histogram = new HashMap<>();
|
||||||
|
for (Zone zone : zones) {
|
||||||
|
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||||
|
long minY = Math.round(bbox.getMinY());
|
||||||
|
long maxY = Math.round(bbox.getMaxY());
|
||||||
|
for (long i = minY; i <= maxY; i++) {
|
||||||
|
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (histogram.values()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(Integer::intValue).average()
|
||||||
|
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||||
|
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
|
||||||
|
} else {
|
||||||
|
|
||||||
|
return resolveMultiColumnReadingOder(zones, useDirAdjCoords);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||||
|
|
||||||
|
if (useDirAdjCoords) {
|
||||||
|
return zones.stream()
|
||||||
|
.collect(Collectors.groupingBy(TextBoundingBox::getDir)).values()
|
||||||
|
.stream()
|
||||||
|
.flatMap(words -> words.stream()
|
||||||
|
.sorted(COMPARATOR_DIR_ADJ))
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
zones.sort(COMPARATOR);
|
||||||
|
return zones;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||||
|
|
||||||
|
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
||||||
|
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
||||||
|
|
||||||
|
double minX = Double.POSITIVE_INFINITY;
|
||||||
|
double maxX = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
|
for (Zone zone : zones) {
|
||||||
|
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||||
|
if (bbox.getX() < minX) {
|
||||||
|
minX = zone.getXDirAdj();
|
||||||
|
}
|
||||||
|
if (bbox.getMaxX() > maxX) {
|
||||||
|
maxX = zone.getMaxXDirAdj();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double midLineXCoordinate = (minX + maxX) / 2;
|
||||||
|
|
||||||
|
List<Zone> leftOf = new ArrayList<>();
|
||||||
|
List<Zone> rightOf = new ArrayList<>();
|
||||||
|
List<Zone> middle = new ArrayList<>();
|
||||||
|
|
||||||
|
for (Zone zone : zones) {
|
||||||
|
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||||
|
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
|
||||||
|
leftOf.add(zone);
|
||||||
|
} else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) {
|
||||||
|
rightOf.add(zone);
|
||||||
|
} else {
|
||||||
|
middle.add(zone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (useDirAdjCoords) {
|
||||||
|
leftOf.sort(COMPARATOR_DIR_ADJ);
|
||||||
|
rightOf.sort(COMPARATOR_DIR_ADJ);
|
||||||
|
middle.sort(COMPARATOR_DIR_ADJ);
|
||||||
|
} else {
|
||||||
|
leftOf.sort(COMPARATOR);
|
||||||
|
rightOf.sort(COMPARATOR);
|
||||||
|
middle.sort(COMPARATOR);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||||
|
for (Zone leftZone : leftOf) {
|
||||||
|
boolean intersects = false;
|
||||||
|
for (Zone rightZone : rightOf) {
|
||||||
|
if (leftZone.intersectsY(rightZone)) {
|
||||||
|
intersects = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// early stopping
|
||||||
|
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!intersects) {
|
||||||
|
leftNotIntersecting.add(leftZone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Zone> rightNotIntersecting = new ArrayList<>();
|
||||||
|
for (Zone rightZone : rightOf) {
|
||||||
|
boolean intersects = false;
|
||||||
|
for (Zone leftZone : leftOf) {
|
||||||
|
if (rightZone.intersectsY(leftZone)) {
|
||||||
|
intersects = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// early stopping
|
||||||
|
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!intersects) {
|
||||||
|
rightNotIntersecting.add(rightZone);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
leftOf.removeAll(leftNotIntersecting);
|
||||||
|
rightOf.removeAll(rightNotIntersecting);
|
||||||
|
|
||||||
|
middle.addAll(leftNotIntersecting);
|
||||||
|
middle.addAll(rightNotIntersecting);
|
||||||
|
*/
|
||||||
|
List<Zone> sortedZones = new ArrayList<>();
|
||||||
|
sortedZones.addAll(leftOf);
|
||||||
|
sortedZones.addAll(rightOf);
|
||||||
|
|
||||||
|
ListIterator<Zone> itty = middle.listIterator();
|
||||||
|
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
Zone current = itty.next();
|
||||||
|
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
|
||||||
|
for (int i = 0; i < sortedZones.size(); i++) {
|
||||||
|
if (bbox.getY() < sortedZones.get(i).getY()) {
|
||||||
|
sortedZones.add(i, current);
|
||||||
|
itty.remove();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sortedZones.addAll(middle);
|
||||||
|
|
||||||
|
return sortedZones;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,56 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Histogram;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class SpacingService {
|
||||||
|
|
||||||
|
private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
|
||||||
|
private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
|
||||||
|
private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
|
||||||
|
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||||
|
|
||||||
|
|
||||||
|
public double computeCharacterSpacing(List<Character> characters) {
|
||||||
|
|
||||||
|
return computeSpacing(characters, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double computeLineSpacing(List<Character> characters) {
|
||||||
|
|
||||||
|
return computeSpacing(characters, Math.PI / 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double computeSpacing(List<Character> characters, double angle) {
|
||||||
|
|
||||||
|
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
|
for (Character character : characters) {
|
||||||
|
for (Neighbor neighbor : character.getNeighbors()) {
|
||||||
|
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
|
||||||
|
AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||||
|
for (Character character : characters) {
|
||||||
|
for (Neighbor neighbor : character.getNeighbors()) {
|
||||||
|
if (angleFilter.matches(neighbor)) {
|
||||||
|
histogram.add(neighbor.getDistance());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
|
||||||
|
return histogram.getPeakValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,126 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
|
|
||||||
|
@Service
public class ZoneBuilderService {

    // Normal zone-growing ellipse: lines may overlap horizontally by up to half a
    // character spacing and be at most 1.2 line spacings apart vertically.
    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;

    // Looser "merge" criterion: large horizontal overlap allowed ...
    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7;

    // ... but only if the lines are vertically very close.
    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;

    // Lower clamp for the per-pair distance scale derived from line height.
    private static final double MIN_LINE_SIZE_SCALE = 0.9;

    // Upper clamp for the per-pair distance scale.
    private static final double MAX_LINE_SIZE_SCALE = 2.5;

    // Lines whose orientations differ by more than this never join a zone.
    private static final double ANGLE_TOLERANCE = Math.toRadians(5);

    // NOTE(review): unused — MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER is what the code reads.
    private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;


    /**
     * Clusters lines into zones (paragraph-level blocks) via union-find: every pair of
     * lines with compatible orientation whose scaled horizontal/vertical distances fall
     * inside either the normal or the merge thresholds — and with no ruling line between
     * them — is united; each resulting group becomes one {@link Zone}.
     *
     * @param characterSpacing estimated character spacing (scales horizontal distance)
     * @param lineSpacing      estimated line spacing (scales vertical distance)
     * @param rulings          ruling lines that act as hard zone boundaries
     */
    public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {

        double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
        double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
        double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;

        UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));

        double meanHeight = calculateMeanHeight(lines);

        // All ordered pairs are visited (O(n^2)); union() is symmetric so this is safe.
        lines.forEach(outerLine -> {
            lines.forEach(innerLine -> {

                if (innerLine == outerLine //
                        || unionFind.inSameSet(outerLine, innerLine)//
                        || outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
                    return;
                }

                // if (!innerLine.getFontStyle().equals(outerLine.getFontStyle()) //
                // && !outerLine.intersectsY(innerLine, -2f)) {
                // return;
                // }

                // Distances are scaled by the smaller line's height relative to the page mean,
                // clamped, so small print tightens and large print loosens the thresholds.
                double horizontalScale = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight;
                horizontalScale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(horizontalScale, MAX_LINE_SIZE_SCALE));
                double verticalScale = horizontalScale;

                // if (innerLine.toString().endsWith(":")
                // || outerLine.toString().endsWith(":")
                // || numericalIdentifierPattern.matcher(innerLine.toString()).matches()
                // || numericalIdentifierPattern.matcher(outerLine.toString()).matches()) {
                //
                // horizontalScale *= 5;
                // verticalScale /= 10;
                // }

                double horizontalDistance = outerLine.horizontalDistance(innerLine) / horizontalScale;
                double verticalDistance = outerLine.verticalDistance(innerLine) / verticalScale;

                // Reject when the pair satisfies neither the normal nor the merge criterion.
                // Kept in negated form deliberately: with NaN distances every comparison is
                // false, so a NaN pair is rejected rather than united.
                if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
                        && (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
                    return;
                }

                // Ruling lines (table/column separators) are hard boundaries.
                if (rulings.lineBetween(outerLine, innerLine)) {
                    return;
                }

                unionFind.union(outerLine, innerLine);

            });
        });

        return unionFind.getGroups()
                .stream()
                .map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
                .toList();
    }


    /**
     * Mean line height weighted by line length, so long lines dominate the estimate.
     * NOTE(review): divides by the summed weights — yields NaN for an empty list or
     * all-zero lengths; callers appear to pass non-empty line sets — confirm.
     */
    private double calculateMeanHeight(List<Line> lines) {

        double meanHeight = 0.0;
        double weights = 0.0;
        for (Line line : lines) {
            double weight = line.getLength();
            meanHeight += line.getHeightDirAdj() * weight;
            weights += weight;
        }
        meanHeight /= weights;
        return meanHeight;
    }


    /**
     * Re-segments a zone's words into lines (characters from different original lines
     * may now sit on the same visual row) and builds the final {@link Zone}.
     * {@code characterSpacing}/{@code lineSpacing} are currently unused here.
     */
    private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {

        Set<Word> words = lines.stream()
                .map(Line::getWords)
                .flatMap(Collection::stream)
                .collect(Collectors.toSet());
        Collection<Set<Word>> groupedLines = TextPositionOperations.groupByLine(words);

        List<Line> sortedLines = TextPositionOperations.sortLines(groupedLines);
        return new Zone(sortedLines);
    }

}
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
|
||||||
|
|
||||||
|
/**
 * Numeric helpers for tolerance-based {@code double} comparison.
 */
public final class DoubleUtils {

    private DoubleUtils() {
        // utility class — no instances
    }

    /**
     * Compares two doubles, treating values within {@code precision} of each other
     * as equal; otherwise orders them like {@link Double#compare(double, double)}.
     *
     * <p>NaN needs no special pre-check: {@code Math.abs(NaN - x) < precision} is
     * always false, so NaN operands fall through to {@code Double.compare}, which
     * orders NaN greater than every other value (and NaN equal to NaN) — identical
     * to the previous explicit branch.
     *
     * @param precision non-negative tolerance below which the values count as equal
     * @return 0 when within tolerance, otherwise the sign of {@code Double.compare(d1, d2)}
     */
    public static int compareDouble(double d1, double d2, double precision) {

        if (Math.abs(d1 - d2) < precision) {
            return 0;
        }

        return Double.compare(d1, d2);
    }

}
|
||||||
@ -0,0 +1,76 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
|
||||||
|
|
||||||
|
/**
 * Fast table-based approximation of {@link Math#atan2(double, double)}.
 *
 * <p>Precomputes atan values for ratios {@code 0..1} in steps of {@code 1/SIZE},
 * plus mirrored/shifted variants for each octant, so a lookup replaces the
 * transcendental call. The ratio argument of each lookup is always the smaller
 * magnitude over the larger, keeping the table index in {@code [0, SIZE]}.
 * Worst-case angular error is on the order of {@code 1/SIZE} radians.
 */
public final class FastAtan2 {

    /** Number of table intervals; the ratio resolution is 1/SIZE. */
    private static final int SIZE = 1000;
    /** Table length (SIZE intervals need SIZE + 1 sample points). */
    private static final int TABLE_LENGTH = SIZE + 1;
    // Full double-precision PI. The previous version cast Math.PI to float,
    // needlessly truncating the shifted octants' results to float precision.
    private static final double PI = Math.PI;
    private static final double HALF_PI = PI / 2;

    // First octant (x >= y >= 0) and its mirror/negation variants.
    private static final double[] ATAN2 = new double[TABLE_LENGTH];
    private static final double[] ATAN2_PM = new double[TABLE_LENGTH];
    private static final double[] ATAN2_MP = new double[TABLE_LENGTH];
    private static final double[] ATAN2_MM = new double[TABLE_LENGTH];

    // "Reversed" octants (|y| > |x|), indexed by x/y instead of y/x.
    private static final double[] ATAN2_R = new double[TABLE_LENGTH];
    private static final double[] ATAN2_RPM = new double[TABLE_LENGTH];
    private static final double[] ATAN2_RMP = new double[TABLE_LENGTH];
    private static final double[] ATAN2_RMM = new double[TABLE_LENGTH];

    static {
        for (int i = 0; i <= SIZE; i++) {
            double ratio = (double) i / SIZE;
            double v = Math.atan2(ratio, 1);

            ATAN2[i] = v;
            ATAN2_PM[i] = PI - v;
            ATAN2_MP[i] = -v;
            ATAN2_MM[i] = -PI + v;

            ATAN2_R[i] = HALF_PI - v;
            ATAN2_RPM[i] = HALF_PI + v;
            ATAN2_RMP[i] = -HALF_PI + v;
            ATAN2_RMM[i] = -HALF_PI - v;
        }
    }

    private FastAtan2() {
        // utility class — no instances
    }

    /**
     * Approximates {@code Math.atan2(y, x)} via table lookup.
     *
     * @return angle in radians in {@code (-PI, PI]}, accurate to roughly 1/SIZE rad
     */
    public static double fastAtan2(double y, double x) {

        if (y < 0) {
            if (x < 0) {
                // third quadrant; (y < x) here means |y| > |x|
                if (y < x) {
                    return ATAN2_RMM[(int) (x / y * SIZE)];
                }
                return ATAN2_MM[(int) (y / x * SIZE)];
            }
            // fourth quadrant — work with |y| in a local instead of reassigning the parameter
            double absY = -y;
            if (absY > x) {
                return ATAN2_RMP[(int) (x / absY * SIZE)];
            }
            return ATAN2_MP[(int) (absY / x * SIZE)];
        }

        if (x < 0) {
            // second quadrant
            double absX = -x;
            if (y > absX) {
                return ATAN2_RPM[(int) (absX / y * SIZE)];
            }
            return ATAN2_PM[(int) (y / absX * SIZE)];
        }

        // first quadrant
        if (y > x) {
            return ATAN2_R[(int) (x / y * SIZE)];
        }
        return ATAN2[(int) (y / x * SIZE)];
    }

}
|
||||||
@ -1,246 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
|
||||||
|
|
||||||
import static java.lang.String.format;
|
|
||||||
import static java.util.stream.Collectors.groupingBy;
|
|
||||||
import static java.util.stream.Collectors.toList;
|
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Header;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Headline;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Paragraph;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
|
|
||||||
@UtilityClass
public class DocumentGraphFactory {


    /**
     * Builds the semantic document graph from a classified document: pages, sections,
     * images, headers and footers become graph nodes attached to a shared DocumentTree.
     * The call order matters — pages first, then section content, then header/footer.
     */
    public Document buildDocumentGraph(ClassificationDocument document) {

        Document documentGraph = new Document();
        Context context = new Context(documentGraph);

        document.getPages().forEach(context::buildAndAddPageWithCounter);
        document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
        addSections(document, context);
        addHeaderAndFooterToEachPage(document, context);

        documentGraph.setNumberOfPages(context.pages.size());
        documentGraph.setPages(context.pages.keySet());
        documentGraph.setDocumentTree(context.documentTree);
        // NOTE(review): self-assignment — setTextBlock(getTextBlock()) is a no-op;
        // confirm whether a freshly-built text block was intended here.
        documentGraph.setTextBlock(documentGraph.getTextBlock());
        return documentGraph;
    }


    // Delegates per-section node creation to SectionNodeFactory (no parent node at top level).
    private void addSections(ClassificationDocument document, Context context) {

        document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getPageBlocks(), section.getImages(), context));
    }


    /**
     * Adds a Headline or Paragraph node for the given text block (plus any blocks to
     * merge into it), wires it into the page main body and the document tree.
     *
     * @param parentNode        tree parent the new node is attached under
     * @param originalTextBlock the primary classified text block
     * @param textBlocksToMerge additional blocks merged into the same node
     */
    public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {

        Page page = context.getPage(originalTextBlock.getPage());

        GenericSemanticNode node;
        if (originalTextBlock.isHeadline()) {
            node = Headline.builder().documentTree(context.getDocumentTree()).build();
        } else {
            node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
        }

        page.getMainBody().add(node);

        // merge candidates plus the original, sorted into one text position sequence
        List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
        textBlocks.add(originalTextBlock);
        AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
        List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
        node.setLeafTextBlock(textBlock);
        node.setTreeId(treeId);
    }


    /**
     * Adds an Image node for a classified image under the given section.
     */
    public void addImage(Section section, ClassifiedImage image, Context context) {

        Rectangle2D position = image.getPosition();
        Page page = context.getPage(image.getPage());
        Image imageNode = Image.builder()
                .id(IdBuilder.buildId(Set.of(page), List.of(position)))
                .imageType(image.getImageType())
                .position(position)
                .transparent(image.isHasTransparency())
                .page(page)
                .documentTree(context.getDocumentTree())
                .build();
        page.getMainBody().add(imageNode);

        List<Integer> tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
        imageNode.setTreeId(tocId);
    }


    /**
     * Ensures every page gets exactly one header and one footer node —
     * real ones where the classifier found them, empty placeholders otherwise.
     * Page indices are 1-based.
     */
    private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) {

        Map<Integer, List<TextPageBlock>> headers = document.getHeaders()
                .stream()
                .map(ClassificationHeader::getTextBlocks)
                .flatMap(List::stream)
                .collect(groupingBy(AbstractPageBlock::getPage, toList()));

        Map<Integer, List<TextPageBlock>> footers = document.getFooters()
                .stream()
                .map(ClassificationFooter::getTextBlocks)
                .flatMap(List::stream)
                .collect(groupingBy(AbstractPageBlock::getPage, toList()));

        for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
            if (headers.containsKey(pageIndex)) {
                addHeader(headers.get(pageIndex), context);
            } else {
                addEmptyHeader(pageIndex, context);
            }
        }

        for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) {
            if (footers.containsKey(pageIndex)) {
                addFooter(footers.get(pageIndex), context);
            } else {
                addEmptyFooter(pageIndex, context);
            }
        }
    }


    // Builds a Footer node from the page's classified footer blocks and attaches it.
    private void addFooter(List<TextPageBlock> textBlocks, Context context) {

        Page page = context.getPage(textBlocks.get(0).getPage());
        Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
                footer,
                context,
                page);
        List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
        footer.setTreeId(tocId);
        footer.setLeafTextBlock(textBlock);
        page.setFooter(footer);
    }


    // Builds a Header node from the page's classified header blocks and attaches it.
    // NOTE(review): passes the literal 0 where addFooter passes `context` — presumably a
    // different buildAtomicTextBlock overload taking the text-block counter (0 is reserved
    // for headers, see buildAndAddPageWithCounter); confirm against TextBlockFactory.
    public void addHeader(List<TextPageBlock> textBlocks, Context context) {

        Page page = context.getPage(textBlocks.get(0).getPage());
        Header header = Header.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
        List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
        header.setTreeId(tocId);
        header.setLeafTextBlock(textBlock);
        page.setHeader(header);
    }


    // Placeholder footer for pages where the classifier found none.
    private void addEmptyFooter(int pageIndex, Context context) {

        Page page = context.getPage(pageIndex);
        Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
        List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
        footer.setTreeId(tocId);
        footer.setLeafTextBlock(textBlock);
        page.setFooter(footer);
    }


    // Placeholder header; same 0-vs-context overload asymmetry as addHeader above.
    private void addEmptyHeader(int pageIndex, Context context) {

        Page page = context.getPage(pageIndex);
        Header header = Header.builder().documentTree(context.getDocumentTree()).build();
        AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
        List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
        header.setTreeId(tocId);
        header.setLeafTextBlock(textBlock);
        page.setHeader(header);
    }


    /**
     * Mutable build state shared by the factories while the graph is assembled:
     * the tree under construction, per-page text-block counters, collected
     * sections/images, and the text-block factory.
     */
    @Getter
    @Builder
    @AllArgsConstructor
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public final class Context {

        DocumentTree documentTree;
        // value is the next text-block number on that page (see counter methods below)
        Map<Page, Integer> pages;
        List<Section> sections;
        List<ClassifiedImage> images;
        TextBlockFactory textBlockFactory;


        public Context(Document document) {

            documentTree = new DocumentTree(document);
            pages = new HashMap<>();
            sections = new LinkedList<>();
            images = new LinkedList<>();
            textBlockFactory = new TextBlockFactory();
        }


        public void buildAndAddPageWithCounter(ClassificationPage classificationPage) {

            Page page = Page.fromClassificationPage(classificationPage);
            //this counter counts the TextBlocks per page
            //initial value is set to 1, because 0 is reserved for Header
            pages.put(page, 1);
        }


        // Returns the current counter for the page and post-increments it.
        public int getAndIncrementTextBlockNumberOnPage(Page page) {

            Integer textBlockNumberOnPage = pages.get(page);
            pages.merge(page, 1, Integer::sum);
            return textBlockNumberOnPage;
        }


        /**
         * Looks up the page with the given 1-based number.
         *
         * @throws NoSuchElementException when no such page was registered
         */
        public Page getPage(int pageIndex) {

            return pages.keySet()
                    .stream()
                    .filter(page -> page.getNumber() == pageIndex)
                    .findFirst()
                    .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
        }

    }

}
|
|
||||||
@ -1,183 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
|
||||||
|
|
||||||
import static java.lang.String.format;
|
|
||||||
import static java.util.Collections.emptyList;
|
|
||||||
import static java.util.stream.Collectors.groupingBy;
|
|
||||||
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
|
|
||||||
@UtilityClass
|
|
||||||
public class SectionNodeFactory {
|
|
||||||
|
|
||||||
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
|
|
||||||
|
|
||||||
if (pageBlocks.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
|
|
||||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
|
||||||
|
|
||||||
context.getSections().add(section);
|
|
||||||
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
|
|
||||||
|
|
||||||
section.setTreeId(getTreeId(parentNode, context, section));
|
|
||||||
|
|
||||||
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
|
|
||||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
|
||||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
|
|
||||||
} else {
|
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
|
|
||||||
}
|
|
||||||
|
|
||||||
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
|
|
||||||
|
|
||||||
if (parentNode == null) {
|
|
||||||
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
|
||||||
} else {
|
|
||||||
return context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, section);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
|
||||||
|
|
||||||
if (pageBlocks.get(0).isHeadline()) {
|
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
|
|
||||||
pageBlocks.remove(0);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
|
|
||||||
|
|
||||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
|
||||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
|
||||||
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
|
||||||
|
|
||||||
if (alreadyMerged.contains(abstractPageBlock)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
remainingBlocks.removeAll(alreadyMerged);
|
|
||||||
|
|
||||||
if (abstractPageBlock instanceof TextPageBlock) {
|
|
||||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
|
|
||||||
alreadyMerged.addAll(textBlocks);
|
|
||||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
|
||||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
|
||||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
|
||||||
alreadyMerged.addAll(tablesToMerge);
|
|
||||||
TableNodeFactory.addTable(section, tablesToMerge, context);
|
|
||||||
} else {
|
|
||||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
|
|
||||||
|
|
||||||
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This function splits the list of PageBlocks around TablePageBlocks, such that SubSections can be created, that don't include tables.
|
|
||||||
* This is needed so we can execute rules on sections, that do not contain tables.
|
|
||||||
* See: <a href="https://knecon.atlassian.net/wiki/spaces/RED/pages/14765218/Document+Structure">document structure wiki</a>
|
|
||||||
*
|
|
||||||
* @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
|
|
||||||
* @return List of Lists of AbstractPageBlocks, which include either a single Headline ClassificationTextBlock and a TablePageBlock or only ClassificationTextBlocks.
|
|
||||||
*/
|
|
||||||
private List<List<AbstractPageBlock>> splitPageBlocksIntoSubSections(List<AbstractPageBlock> pageBlocks) {
|
|
||||||
|
|
||||||
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
|
|
||||||
movePrecedingHeadlineToTableList(splitList);
|
|
||||||
return splitList.stream().filter(list -> !list.isEmpty()).toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void movePrecedingHeadlineToTableList(List<List<AbstractPageBlock>> splitList) {
|
|
||||||
|
|
||||||
for (int i = 0; i < splitList.size(); i++) {
|
|
||||||
if (listIsTablesOnly(splitList.get(i)) && i > 0) {
|
|
||||||
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
|
||||||
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
|
||||||
if (lastPageBlockInPreviousList.isHeadline()) {
|
|
||||||
previousList.remove(i - 1);
|
|
||||||
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
|
|
||||||
|
|
||||||
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock
|
|
||||||
* @return List of Lists of AbstractPageBlocks, which are exclusively of type ClassificationTextBlock or TablePageBlock
|
|
||||||
*/
|
|
||||||
private List<List<AbstractPageBlock>> splitIntoCoherentList(List<AbstractPageBlock> pageBlocks) {
|
|
||||||
|
|
||||||
List<List<AbstractPageBlock>> splitList = new LinkedList<>();
|
|
||||||
List<AbstractPageBlock> currentList = new LinkedList<>();
|
|
||||||
splitList.add(currentList);
|
|
||||||
|
|
||||||
Class<? extends AbstractPageBlock> lastPageBlockClass = pageBlocks.get(0).getClass();
|
|
||||||
for (AbstractPageBlock pageBlock : pageBlocks) {
|
|
||||||
if (lastPageBlockClass.isInstance(pageBlock)) {
|
|
||||||
currentList.add(pageBlock);
|
|
||||||
} else {
|
|
||||||
currentList = new LinkedList<>();
|
|
||||||
currentList.add(pageBlock);
|
|
||||||
splitList.add(currentList);
|
|
||||||
lastPageBlockClass = pageBlock.getClass();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return splitList;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
|
||||||
|
|
||||||
return pageBlocks.stream()
|
|
||||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
|
||||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
|
||||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
|
||||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
|
||||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
|
||||||
.toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
|
|
||||||
|
|
||||||
Page page = context.getPage(pageNumber);
|
|
||||||
page.getMainBody().add(section);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,136 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
|
||||||
|
|
||||||
import static java.util.Collections.emptyList;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
|
|
||||||
@UtilityClass
public class TableNodeFactory {

    // A multi-block cell's contents are collapsed into one text block only if the cell
    // covers less than this fraction of the page area
    // (see cellAreaIsSmallerThanPageAreaTimesThreshold).
    public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;


    /**
     * Merges the given table page blocks into a single {@link Table} node: concatenates
     * their rows, registers the table on every page involved, creates its tree entry
     * under {@code parentNode}, adds one {@link TableCell} node per cell, and finally
     * marks the first row as headers if no cell was classified as a header.
     */
    public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {

        setPageNumberInCells(tablesToMerge);
        Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
        // Rows of all merged tables, in table order; column count is taken from the first row.
        List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
        Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.get(0).size()).numberOfRows(mergedRows.size()).build();

        pages.forEach(page -> addTableToPage(page, parentNode, table));

        List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
        table.setTreeId(treeId);
        addTableCells(mergedRows, table, context);

        ifTableHasNoHeadersSetFirstRowAsHeaders(table);
    }


    // Propagates each table's page number into its cells and their text blocks.
    private void setPageNumberInCells(List<TablePageBlock> tablesToMerge) {

        // For some reason I can't figure out, in some table cells, the ClassificationTextBlocks have 0 as page number
        // So I am fixing this here, but this should actually be fixed upstream.
        tablesToMerge.forEach(table -> table.getRows()
                .stream()
                .flatMap(Collection::stream)
                .peek(cell -> cell.setPageNumber(table.getPage()))
                .forEach(cell -> setPageNumberInTextBlocksWithPageNumberSetTo0(table, cell)));
    }


    // Only text blocks with the sentinel page number 0 are patched; others are left as-is.
    private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {

        cell.getTextBlocks().stream()//
                .filter(tb -> tb.getPage() == 0)//
                .forEach(tb -> tb.setPage(table.getPage()));
    }


    // Adds the table to the page's main body; if the parent node is not itself part of
    // the page's main body, the page is also recorded on the parent node.
    @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
    private void addTableToPage(Page page, SemanticNode parentNode, Table table) {

        if (!page.getMainBody().contains(parentNode)) {
            parentNode.getPages().add(page);
        }

        page.getMainBody().add(table);
    }


    // Fallback header detection: if classification marked no header cells, treat row 0 as headers.
    private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {

        if (table.streamHeaders().findAny().isEmpty()) {
            table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
        }
    }


    private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {

        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
            for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
                addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
            }
        }
    }


    /**
     * Creates the TableCell node for one cell and attaches its content. Content handling
     * depends on the cell: empty cell -> empty text block; single text block -> one atomic
     * text block; first block is a headline -> treated as a nested section; small cell
     * (area below threshold) -> all blocks merged into one atomic text block; otherwise
     * each block becomes its own paragraph/headline under the cell.
     */
    @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
    private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {

        Page page = context.getPage(cell.getPageNumber());

        TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
        page.getMainBody().add(tableCell);

        List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
        tableCell.setTreeId(treeId);

        TextBlock textBlock;
        if (cell.getTextBlocks().isEmpty()) {
            // NOTE(review): the empty block is parented to tableNode here, while every
            // other branch parents to tableCell — looks inconsistent; confirm intent.
            tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
        } else if (cell.getTextBlocks().size() == 1) {
            textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
            tableCell.setLeafTextBlock(textBlock);
        } else if (firstTextBlockIsHeadline(cell)) {
            SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
        } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
            List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
            textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
            tableCell.setLeafTextBlock(textBlock);
        } else {
            cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
        }
    }


    private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {

        return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
    }


    private boolean firstTextBlockIsHeadline(Cell cell) {

        return cell.getTextBlocks().get(0).isHeadline();
    }

}
|
|
||||||
@ -1,74 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
|
||||||
public class TextBlockFactory {
|
|
||||||
|
|
||||||
int stringOffset;
|
|
||||||
long textBlockIdx;
|
|
||||||
|
|
||||||
|
|
||||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
|
||||||
|
|
||||||
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
|
|
||||||
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
|
||||||
|
|
||||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences);
|
|
||||||
int offset = stringOffset;
|
|
||||||
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
|
|
||||||
long idx = textBlockIdx;
|
|
||||||
textBlockIdx++;
|
|
||||||
String orientation;
|
|
||||||
int textDirection;
|
|
||||||
if (sequences.isEmpty()) {
|
|
||||||
orientation = null;
|
|
||||||
textDirection = 0;
|
|
||||||
} else {
|
|
||||||
orientation = sequences.get(0).getDir().toString();
|
|
||||||
textDirection = sequences.get(0).getRotation();
|
|
||||||
}
|
|
||||||
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
|
||||||
searchTextWithTextPositionDto.getLineBreaks(),
|
|
||||||
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
|
||||||
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
|
||||||
searchTextWithTextPositionDto.getPositions(),
|
|
||||||
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
|
|
||||||
idx,
|
|
||||||
parent,
|
|
||||||
numberOnPage,
|
|
||||||
page,
|
|
||||||
offset,
|
|
||||||
orientation,
|
|
||||||
textDirection);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
|
||||||
|
|
||||||
long idx = textBlockIdx;
|
|
||||||
textBlockIdx++;
|
|
||||||
return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) {
|
|
||||||
|
|
||||||
long idx = textBlockIdx;
|
|
||||||
textBlockIdx++;
|
|
||||||
return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,139 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.graph;
|
|
||||||
|
|
||||||
import static java.lang.String.format;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import lombok.EqualsAndHashCode;
|
|
||||||
import lombok.Setter;
|
|
||||||
|
|
||||||
/**
 * A mutable half-open integer interval {@code [start, end)}, typically over string indices.
 * Invariant: {@code start <= end} — enforced by the constructor AND by the setters.
 *
 * FIX: the previous Lombok {@code @Setter} generated setters that bypassed the invariant
 * the constructor checks, allowing {@code start > end} via {@code setStart}/{@code setEnd}.
 * The setters are now hand-written and validate. equals/hashCode (previously from Lombok's
 * {@code @EqualsAndHashCode}) are hand-written value comparisons over start and end.
 *
 * NOTE(review): {@link #compareTo} is a partial order — overlapping, non-nested boundaries
 * compare as 0 — so it is not consistent with equals; do not use Boundary in sorted
 * sets/maps. Behavior kept unchanged here.
 */
public class Boundary implements Comparable<Boundary> {

    private int start;
    private int end;


    public Boundary(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        this.start = start;
        this.end = end;
    }


    /** @throws IllegalArgumentException if the new start would exceed the current end */
    public void setStart(int start) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        this.start = start;
    }


    /** @throws IllegalArgumentException if the new end would fall below the current start */
    public void setEnd(int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        this.end = end;
    }


    /** Number of indices covered, i.e. {@code end - start}. */
    public int length() {

        return end - start;
    }


    public int start() {

        return start;
    }


    public int end() {

        return end;
    }


    /** True if {@code boundary} lies entirely within this boundary (inclusive at both edges). */
    public boolean contains(Boundary boundary) {

        return start <= boundary.start() && boundary.end() <= end;
    }


    public boolean containedBy(Boundary boundary) {

        return boundary.contains(this);
    }


    /**
     * True if the interval [start, end) lies entirely within this boundary.
     *
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean contains(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return this.start <= start && end <= this.end;
    }


    /**
     * True if this boundary lies entirely within the interval [start, end).
     *
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean containedBy(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return start <= this.start && this.end <= end;
    }


    /** True if {@code index} falls inside the half-open range (start inclusive, end exclusive). */
    public boolean contains(int index) {

        return start <= index && index < end;
    }


    /** True if the two boundaries share at least one index. */
    public boolean intersects(Boundary boundary) {

        return boundary.start() < this.end && this.start < boundary.end();
    }


    /**
     * Splits this boundary at the given indices into consecutive sub-boundaries.
     * Indices equal to the running start are skipped so no zero-length boundary is produced.
     * NOTE(review): indices are assumed to be in ascending order — an out-of-order index
     * makes the corresponding Boundary constructor throw; confirm callers sort first.
     *
     * @throws IndexOutOfBoundsException if any index lies outside this boundary
     */
    public List<Boundary> split(List<Integer> splitIndices) {

        if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
            throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
        }
        List<Boundary> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int splitIndex : splitIndices) {

            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new Boundary(previousIndex, end));
        return splitBoundaries;
    }


    /**
     * Returns the smallest boundary covering all given boundaries.
     *
     * @throws IllegalArgumentException if the collection is empty
     */
    public static Boundary merge(Collection<Boundary> boundaries) {

        int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
        int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
        return new Boundary(minStart, maxEnd);
    }


    @Override
    public boolean equals(Object o) {

        if (this == o) {
            return true;
        }
        if (!(o instanceof Boundary other)) {
            return false;
        }
        return start == other.start && end == other.end;
    }


    @Override
    public int hashCode() {

        return 31 * start + end;
    }


    @Override
    public String toString() {

        return format("Boundary [%d|%d)", start, end);
    }


    /**
     * Strictly-before / strictly-after ordering; any other relationship (overlap,
     * nesting, equality, shared edge) yields 0. Behavior intentionally unchanged.
     */
    @Override
    public int compareTo(Boundary boundary) {

        if (end < boundary.end() && start < boundary.start()) {
            return -1;
        }
        if (start > boundary.start() && end > boundary.end()) {
            return 1;
        }

        return 0;
    }

}
|
|
||||||
@ -1,217 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.graph;
|
|
||||||
|
|
||||||
import static java.lang.String.format;
|
|
||||||
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Builder;
|
|
||||||
import lombok.Data;
|
|
||||||
import lombok.EqualsAndHashCode;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
/**
 * Tree of {@link SemanticNode}s rooted at the {@link Document}. An entry's id is the
 * path of child indices from the root: the root itself has the empty id, its i-th
 * child has id [i], that child's j-th child has id [i, j], and so on.
 */
@Data
@EqualsAndHashCode
public class DocumentTree {

    private final Entry root;


    public DocumentTree(Document document) {

        root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
    }


    /**
     * Concatenates the leaf text blocks of all leaf nodes, in depth-first document order,
     * into a single {@link TextBlock}.
     */
    public TextBlock buildTextBlock() {

        return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
    }


    /** Appends {@code node} as a new direct child of the root and returns its tree id. */
    public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
    }


    /** Appends {@code node} as a new child of {@code parentNode} and returns its tree id. */
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    // Overload for Table nodes; delegates to the same implementation.
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    /** Appends {@code tableCell} as a new child of {@code parentTable} and returns its tree id. */
    public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {

        return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
    }


    // The new child's id is the parent's id extended by the parent's current child count.
    @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
    private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {

        if (!entryExists(parentId)) {
            throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
        }

        Entry parent = getEntryById(parentId);
        List<Integer> newId = new LinkedList<>(parentId);
        newId.add(parent.children.size());
        parent.children.add(Entry.builder().treeId(newId).node(node).build());

        return newId;
    }


    // NOTE(review): the first path element is dereferenced without a bounds check, so an
    // out-of-range first index raises IndexOutOfBoundsException instead of returning
    // false — confirm callers never pass such an id.
    private boolean entryExists(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root != null;
        }
        Entry entry = root.children.get(treeId.get(0));
        for (int id : treeId.subList(1, treeId.size())) {
            if (id >= entry.children.size() || 0 > id) {
                return false;
            }
            entry = entry.children.get(id);
        }
        return true;
    }


    public Entry getParentEntryById(List<Integer> treeId) {

        return getEntryById(getParentId(treeId));
    }


    /** True for every entry except the root (the empty id). */
    public boolean hasParentById(List<Integer> treeId) {

        return !treeId.isEmpty();
    }


    public Stream<SemanticNode> childNodes(List<Integer> treeId) {

        return getEntryById(treeId).children.stream().map(Entry::getNode);
    }


    /** Direct children of the entry whose node type equals {@code nodeType}. */
    public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {

        return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
    }


    // Parent id = the path without its last element; the root has no parent.
    private static List<Integer> getParentId(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            throw new UnsupportedOperationException("Root has no parent!");
        }
        if (treeId.size() < 2) {
            return Collections.emptyList();
        }
        return treeId.subList(0, treeId.size() - 1);
    }


    /** Resolves an entry by walking the id's child indices from the root. */
    public Entry getEntryById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root;
        }
        Entry entry = root.children.get(treeId.get(0));
        for (int id : treeId.subList(1, treeId.size())) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /** Direct children of the root. */
    public Stream<Entry> mainEntries() {

        return root.children.stream();
    }


    /** All entries, root included, in depth-first pre-order. */
    public Stream<Entry> allEntriesInOrder() {

        return Stream.of(root).flatMap(DocumentTree::flatten);
    }


    /** All entries below {@code parentId} (exclusive), in depth-first pre-order. */
    public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {

        return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
    }


    @Override
    public String toString() {

        return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
    }


    // Pre-order flattening: the entry itself followed by its recursively flattened children.
    private static Stream<Entry> flatten(Entry entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
    }


    /** The top-level ancestor of the given id (or the document root for the empty id). */
    public SemanticNode getHighestParentById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root.node;
        }
        return root.children.get(treeId.get(0)).node;
    }


    /** One node of the tree: its id path, its semantic node, and its child entries. */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
    public static class Entry {

        List<Integer> treeId;
        SemanticNode node;
        @Builder.Default
        List<Entry> children = new LinkedList<>();


        @Override
        public String toString() {

            return node.toString();
        }


        public NodeType getType() {

            return node.getType();
        }

    }

}
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.graph.entity;
|
|
||||||
|
|
||||||
// Kinds of entity records handled by the layout parser. Semantics are inferred from the
// constant names (confirmed entity, recommendation, and their false counterparts) —
// NOTE(review): verify against call sites before relying on these descriptions.
public enum EntityType {
    ENTITY,
    RECOMMENDATION,
    FALSE_POSITIVE,
    FALSE_RECOMMENDATION
}
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user