Compare commits

...

270 Commits

Author SHA1 Message Date
Dominique Eifländer
ef23ee0ade Merge branch 'RED-10752-main' into 'main'
RED-10752: Enabled prometheus

See merge request fforesight/layout-parser!267
2025-01-29 13:34:01 +01:00
Dominique Eifländer
af31f52b47 RED-10752: Enabled prometheus 2025-01-29 11:09:29 +01:00
Kilian Schüttler
b5152112ee Merge branch 'RM-231' into 'main'
RM-231: missing whitespace in name

See merge request fforesight/layout-parser!264
2025-01-14 13:04:10 +01:00
Kilian Schuettler
85ea4ef455 RM-231: missing whitespace in name 2025-01-14 12:59:01 +01:00
Kilian Schüttler
01f8c01fff Merge branch 'RED-10714' into 'main'
RED-10714: fix IndexOutOfBoundsException

See merge request fforesight/layout-parser!262
2025-01-10 12:33:18 +01:00
Kilian Schuettler
0b6a292c75 RED-10714: fix IndexOutOfBoundsException 2025-01-10 12:12:14 +01:00
Maverick Studer
e24020589c Merge branch 'feature/RED-9998' into 'main'
RED-9998: App version history (for conditional re-analyzing the layout of a file)

See merge request fforesight/layout-parser!259
2024-12-12 09:58:46 +01:00
Maverick Studer
c619b845e8 RED-9998: App version history (for conditional re-analyzing the layout of a file) 2024-12-12 09:58:46 +01:00
Kilian Schüttler
ed0371ca11 Merge branch 'RED-10127' into 'main'
RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines

See merge request fforesight/layout-parser!257
2024-12-06 14:49:48 +01:00
Kilian Schuettler
89b5be8d67 RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines 2024-12-06 13:41:44 +01:00
Kilian Schuettler
077ce60c9d RED-9139: update document version 2024-11-15 16:48:56 +01:00
Kilian Schüttler
ab171be6e2 Merge branch 'feature/RED-9139' into 'main'
RED-9139: more robust TOC detection

See merge request fforesight/layout-parser!253
2024-11-14 16:50:52 +01:00
Kilian Schuettler
664b47b4c3 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:49 +01:00
Kilian Schuettler
8005c1f25f RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
42185a95a0 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
51b42efaf6 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6a50d45947 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
073ac12cf7 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
84b054a4cc RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
905b65a5fa RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
7617c1f308 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
2b3936c09b RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6e5b1f1978 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
cf846d18bc RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
25c46f16ac RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
96acefed78 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
366241e6c6 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
7f472ccc52 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6f807c7d94 RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree
* added protobuf compile script
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6e04c15f3d RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree
* added protobuf compile script
2024-11-14 16:39:48 +01:00
Kilian Schuettler
1384584e2f RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-14 16:39:46 +01:00
Kilian Schuettler
e58011e111 RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-14 16:39:21 +01:00
Kilian Schüttler
a821570065 Merge branch 'RED-9139-bp' into 'main'
RED-9139: more robust TOC detection

See merge request fforesight/layout-parser!254
2024-11-13 10:54:39 +01:00
Kilian Schüttler
7ee1f9e360 RED-9139: more robust TOC detection 2024-11-13 10:54:39 +01:00
Kilian Schüttler
f9b25c8157 Merge branch 'RED-10249' into 'main'
RED-10249: regex found incorrectly due to wrong text sorting

See merge request fforesight/layout-parser!252
2024-11-04 12:51:38 +01:00
Kilian Schüttler
c90874da7a RED-10249: regex found incorrectly due to wrong text sorting 2024-11-04 12:51:37 +01:00
Kilian Schüttler
4683c696a5 Merge branch 'RED-10247' into 'main'
RED-10247: dictionary entry not found in footer due to wrong text sorting

See merge request fforesight/layout-parser!251
2024-10-25 18:30:35 +02:00
Kilian Schuettler
95c02ce3cf RED-10247: dictionary entry not found in footer due to wrong text sorting 2024-10-25 17:18:14 +02:00
Kilian Schüttler
b2d62e32fe Merge branch 'RED-10270-fp' into 'main'
RED-10270: fix NumberFormatException

See merge request fforesight/layout-parser!248
2024-10-24 17:14:47 +02:00
Kilian Schuettler
65c1f03ea3 RED-10270: fix NumberFormatException 2024-10-24 10:59:05 +02:00
Kilian Schüttler
2219519a2b Merge branch 'RED-10127' into 'main'
RED-10127: rename TextPositionSequence to Word

See merge request fforesight/layout-parser!244
2024-10-18 12:20:15 +02:00
Kilian Schüttler
af05218e37 RED-10127: rename TextPositionSequence to Word 2024-10-18 12:20:15 +02:00
Kilian Schüttler
736f531df3 Merge branch 'hotfix' into 'main'
Hotfix

See merge request fforesight/layout-parser!243
2024-10-18 12:12:15 +02:00
Kilian Schüttler
c64445d54b Hotfix 2024-10-18 12:12:15 +02:00
Kilian Schüttler
af29233b10 Merge branch 'feature/RED-10127' into 'main'
RED-10127: add more units

See merge request fforesight/layout-parser!242
2024-10-15 09:57:21 +02:00
Kilian Schuettler
5f04b45554 RED-10127: add more units 2024-10-15 09:47:39 +02:00
Kilian Schüttler
6c41533f0b Merge branch 'feature/RED-10127' into 'main'
RED-10127: improve list classification

See merge request fforesight/layout-parser!240
2024-10-14 17:34:33 +02:00
Kilian Schuettler
9d2596e5ef RED-10127: improve list classification
* add one more format to list identification
* add 'ppb' to known units
* special case for headlines continuing with 14C after the identifier (quite often in some specific files)
2024-10-14 17:21:44 +02:00
Kilian Schüttler
e7b01161ac Merge branch 'feature/RED-10127' into 'main'
RED-10127: add list classification

See merge request fforesight/layout-parser!237
2024-10-10 10:50:10 +02:00
Kilian Schüttler
7b073eb4f3 RED-10127: add list classification 2024-10-10 10:50:10 +02:00
Dominique Eifländer
4b0c041d84 Merge branch 'feature/RED-10127' into 'main'
RED-10127: improve headline detection

See merge request fforesight/layout-parser!235
2024-10-09 08:48:48 +02:00
Kilian Schüttler
6c7442ac6d RED-10127: improve headline detection 2024-10-09 08:48:48 +02:00
Maverick Studer
23e23328ee Merge branch 'RED-10126' into 'main'
RM-187: Footers are recognized in the middle of the page

See merge request fforesight/layout-parser!233
2024-10-08 14:27:45 +02:00
Maverick Studer
9d1ffdd779 RM-187: Footers are recognized in the middle of the page 2024-10-08 14:27:44 +02:00
Maverick Studer
3109a30ae1 Merge branch 'RED-9123-proto' into 'main'
RED-9123: Improve performance of re-analysis (Spike)

See merge request fforesight/layout-parser!232
2024-10-07 12:28:10 +02:00
Maverick Studer
fe2ed1807e RED-9123: Improve performance of re-analysis (Spike) 2024-10-07 12:28:10 +02:00
Maverick Studer
31de229fa5 Merge branch 'feature/RED-9010' into 'main'
RED-9010: remove redaction log

See merge request fforesight/layout-parser!231
2024-09-19 11:34:32 +02:00
Maverick Studer
8a80abfff1 RED-9010: remove redaction log 2024-09-19 11:34:32 +02:00
Dominique Eifländer
7c08905eda Merge branch 'RED-9975-main' into 'main'
RED-9975: Fixed missing section numbers in layout grid

See merge request fforesight/layout-parser!230
2024-09-18 11:29:51 +02:00
Dominique Eifländer
4f40c9dbc9 RED-9975: Fixed missing section numbers in layout grid 2024-09-18 11:22:37 +02:00
Dominique Eifländer
32381b4472 Merge branch 'RED-9974' into 'main'
Red 9974: improve headline classification, fix font size calculation

See merge request fforesight/layout-parser!226
2024-09-16 14:06:48 +02:00
Kilian Schüttler
469da38952 Red 9974: improve headline classification, fix font size calculation 2024-09-16 14:06:48 +02:00
Dominique Eifländer
0f8c4674b3 Merge branch 'hotfix' into 'main'
hotfix: viewerDocService doesn't remove existing marked content

See merge request fforesight/layout-parser!225
2024-09-12 09:12:54 +02:00
Kilian Schuettler
8e165a41d7 hotfix: viewerDocService doesn't remove existing marked content 2024-09-11 16:34:21 +02:00
Kilian Schüttler
ed7a701ad9 Merge branch 'RED-9975' into 'main'
RED-9975: improve SuperSection handling

See merge request fforesight/layout-parser!223
2024-09-11 13:38:09 +02:00
Kilian Schüttler
393103e074 RED-9975: improve SuperSection handling 2024-09-11 13:38:09 +02:00
Dominique Eifländer
bd02066e2c Merge branch 'RED-9976-main' into 'main'
RED-9976: Removed sorting that scrambles text in PDFTextStripper

See merge request fforesight/layout-parser!222
2024-09-10 13:02:36 +02:00
Dominique Eifländer
fec19f4afb RED-9976: Removed sorting that scrambles text in PDFTextStripper 2024-09-10 12:50:37 +02:00
Kilian Schüttler
c726a643f0 Merge branch 'hotfix' into 'main'
Hotfix: unmerge super large tables

See merge request fforesight/layout-parser!220
2024-09-05 15:05:21 +02:00
Kilian Schüttler
519e95735c Hotfix: unmerge super large tables 2024-09-05 15:05:21 +02:00
Maverick Studer
b52af2637f Merge branch 'RED-9942-2' into 'main'
RED-9942: File only with images not recognised

See merge request fforesight/layout-parser!218
2024-09-05 10:49:12 +02:00
Maverick Studer
46ea7edc4c RED-9942: File only with images not recognised 2024-09-05 10:49:12 +02:00
Kilian Schüttler
9650195afd Merge branch 'hotfix-fp' into 'main'
hotfix: add Java advanced imaging

See merge request fforesight/layout-parser!217
2024-09-04 15:43:56 +02:00
Kilian Schuettler
ce628a99f7 hotfix: add Java advanced imaging 2024-09-04 15:18:12 +02:00
Maverick Studer
b66afe135c Merge branch 'RED-9524' into 'main'
RED-9524: File processing does not annotate images

See merge request fforesight/layout-parser!214
2024-09-04 13:27:06 +02:00
Maverick Studer
dc892d0fec RED-9524: File processing does not annotate images 2024-09-04 13:27:06 +02:00
Kilian Schüttler
af45f2cd8c Merge branch 'RED-9964' into 'main'
RED-9964: fix errors with images

See merge request fforesight/layout-parser!212
2024-09-04 09:16:59 +02:00
Kilian Schuettler
befb6b1df6 RED-9964: fix errors with images 2024-09-03 16:37:48 +02:00
Maverick Studer
61efb4cae9 Merge branch 'update-tc' into 'main'
Update tenant-commons for dlq fix

See merge request fforesight/layout-parser!211
2024-09-03 13:50:02 +02:00
maverickstuder
4a06059258 Update tenant-commons for dlq fix 2024-09-03 13:15:08 +02:00
Dominique Eifländer
292e5b215e Merge branch 'RED-9988-main' into 'main'
RED-9988: Fixed NPE when image representation is not present

See merge request fforesight/layout-parser!210
2024-09-02 09:56:53 +02:00
Dominique Eifländer
7c2db6c3c5 RED-9988: Fixed NPE when image representation is not present 2024-09-02 09:51:59 +02:00
Dominique Eifländer
4395074b21 Merge branch 'RED-9975' into 'main'
Red 9975: fix outline detection

See merge request fforesight/layout-parser!206
2024-09-02 09:02:36 +02:00
Kilian Schüttler
8e14b74da2 Red 9975: fix outline detection 2024-09-02 09:02:36 +02:00
Kilian Schüttler
3b91639ea9 Merge branch 'RED-9964-fp' into 'main'
RED-9964: don't merge tables on non-consecutive pages

See merge request fforesight/layout-parser!205
2024-08-30 14:00:48 +02:00
Kilian Schüttler
c5178ea5c2 RED-9964: don't merge tables on non-consecutive pages 2024-08-30 14:00:48 +02:00
Dominique Eifländer
cf39d4dfcc Merge branch 'RED-9974' into 'main'
RED-9974: Improved headline detection for documine old

See merge request fforesight/layout-parser!202
2024-08-30 10:57:20 +02:00
Dominique Eifländer
bb40345f79 RED-9974: Improved headline detection for documine old 2024-08-30 10:36:22 +02:00
Kilian Schüttler
e3e9d16145 Merge branch 'RED-9975' into 'main'
RED-9975: activate outline detection

See merge request fforesight/layout-parser!201
2024-08-29 14:27:00 +02:00
Kilian Schuettler
f6ca5a3c17 RED-9975: activate outline detection 2024-08-29 14:18:29 +02:00
Maverick Studer
15e3dced35 Merge branch 'tenants-retry' into 'main'
Tenants retry logic and queue renames

See merge request fforesight/layout-parser!197
2024-08-29 13:46:54 +02:00
Maverick Studer
933054b332 Tenants retry logic and queue renames 2024-08-29 13:46:54 +02:00
Kilian Schüttler
ab86714cb3 Merge branch 'RED-9975' into 'main'
RED-9975: activate outline detection

See merge request fforesight/layout-parser!198
2024-08-29 12:25:42 +02:00
Kilian Schuettler
8626b106d0 RED-9975: activate outline detection 2024-08-29 12:16:07 +02:00
Maverick Studer
52e948e66c Merge branch 'RED-9331' into 'main'
RED-9331: Explore possibilities for fair upload / analysis processing per tenant

See merge request fforesight/layout-parser!182
2024-08-27 09:27:37 +02:00
Maverick Studer
3b33405cbf RED-9331: Explore possibilities for fair upload / analysis processing per tenant 2024-08-27 09:27:37 +02:00
Maverick Studer
b2fa14dde2 Merge branch 'AZURE_NER' into 'main'
RED-9918: Azure entity recognition (Spike)

See merge request fforesight/layout-parser!196
2024-08-26 14:34:46 +02:00
Maverick Studer
62e07686d7 RED-9918: Azure entity recognition (Spike) 2024-08-26 14:34:46 +02:00
Dominique Eifländer
3eb97d614f Merge branch 'RED-9760-NPE' into 'main'
RED-9760: Fixed nullpointer in TextPageBlock

See merge request fforesight/layout-parser!194
2024-08-13 13:24:48 +02:00
Dominique Eifländer
81469413b0 RED-9760: Fixed nullpointer in TextPageBlock 2024-08-13 13:18:50 +02:00
Dominique Eifländer
2993676a6f Merge branch 'RED-9670' into 'main'
RED-9760: change compareDouble to something sensible

See merge request fforesight/layout-parser!193
2024-08-12 16:02:51 +02:00
Kilian Schüttler
8e115dcd8a RED-9760: change compareDouble to something sensible 2024-08-12 16:02:50 +02:00
Dominique Eifländer
173911b840 Merge branch 'hotfix-reading-order' into 'main'
hotfix: threshold adjustments

See merge request fforesight/layout-parser!192
2024-08-12 14:59:20 +02:00
Kilian Schuettler
b0ae00aa02 hotfix: threshold adjustments 2024-08-12 14:52:18 +02:00
Dominique Eifländer
00bf9f279e Merge branch 'hotfix-reading-order' into 'main'
hotfix: use center coordinates

See merge request fforesight/layout-parser!191
2024-08-09 15:51:42 +02:00
Kilian Schuettler
d16377a24a hotfix: line comparison with center coordinates 2024-08-09 15:45:23 +02:00
Dominique Eifländer
81179ee744 Merge branch 'RED-9760-dcold' into 'main'
RED-9760: Changed lineSeparation threshold for documine old

See merge request fforesight/layout-parser!190
2024-08-09 14:50:39 +02:00
Dominique Eifländer
1953b5924f RED-9760: Changed lineSeparation threshold for documine old 2024-08-09 14:42:14 +02:00
Kilian Schüttler
6f6e8d5d4e Merge branch 'hotfix-reading-order' into 'main'
hotfix reading order

See merge request fforesight/layout-parser!187
2024-08-09 11:49:12 +02:00
Kilian Schüttler
69bcd4f68d hotfix reading order 2024-08-09 11:49:12 +02:00
Timo Bejan
b900cfaf31 Merge branch 'CLARI-140' into 'main'
CLARI-140 - case issue

See merge request fforesight/layout-parser!189
2024-08-08 21:49:40 +02:00
Timo Bejan
cdc2081785 CLARI-140 - case issue 2024-08-08 22:40:11 +03:00
Timo Bejan
a9287ec406 Merge branch 'CLARI-139' into 'main'
CLARI-139 - fixed outline error for unparsable object

See merge request fforesight/layout-parser!188
2024-08-08 16:36:41 +02:00
Timo Bejan
5b6a706c28 CLARI-139 - fixed outline error for unparsable object 2024-08-08 16:20:14 +03:00
Timo Bejan
28d8ad0a3f Merge branch 'CLARI-128' into 'main'
Fixed Index out of bounds exception in blockificationpostprocessingservice -...

See merge request fforesight/layout-parser!186
2024-07-30 16:56:21 +02:00
Timo Bejan
0c1583c1be Fixed Index out of bounds exception in blockificationpostprocessingservice - this code should be documented btw, there are also probably other use-cases where the code doesn't work 2024-07-30 17:45:05 +03:00
Andrei Isvoran
7633566d9b Merge branch 'RED-9607-fp' into 'main'
RED-9607 - Correctly determine text position sequence based on file rotation

See merge request fforesight/layout-parser!184
2024-07-25 14:11:00 +02:00
Andrei Isvoran
cc4f09711e RED-9607 - Correctly determine text position sequence based on file rotation 2024-07-24 16:35:11 +03:00
Kilian Schüttler
370165dc59 Merge branch 'document-data-markdown' into 'main'
CLARI: document-data-markdown

See merge request fforesight/layout-parser!181
2024-07-18 17:19:44 +02:00
Maverick Studer
8c052c38d7 CLARI: document-data-markdown 2024-07-18 17:19:43 +02:00
Kilian Schüttler
ea18d3d307 Merge branch 'RED-8800' into 'main'
RED-8800: adjust coordinates in BE to ignore cropbox

See merge request fforesight/layout-parser!179
2024-07-15 17:45:13 +02:00
Kilian Schüttler
2726fc3fe1 RED-8800: adjust coordinates in BE to ignore cropbox 2024-07-15 17:45:13 +02:00
Kilian Schüttler
033279e261 Merge branch 'RED-9353' into 'main'
RED-9353: refactor PDFTronViewerDocumentService

See merge request fforesight/layout-parser!178
2024-07-15 12:54:17 +02:00
Kilian Schüttler
ec0dd032c9 RED-9353: refactor PDFTronViewerDocumentService 2024-07-15 12:54:17 +02:00
Andrei Isvoran
598fa7f1c7 Merge branch 'RED-9496-shutdown-fp' into 'main'
RED-9496 - Implement graceful shutdown

See merge request fforesight/layout-parser!176
2024-07-04 14:26:00 +02:00
Andrei Isvoran
65b1f7d179 RED-9496 - Implement graceful shutdown 2024-07-04 14:21:20 +03:00
Kilian Schüttler
3173610be5 Merge branch 'CLARI-003' into 'main'
CLARI-003: add treeId to StructureObject

See merge request fforesight/layout-parser!176
2024-07-02 11:37:22 +02:00
Kilian Schuettler
e920eb5a78 CLARI-003: add treeId to StructureObject 2024-07-01 13:56:16 +02:00
Kilian Schüttler
7e4baea7e5 Merge branch 'RED-9353' into 'main'
RED-9353: use azure ocr service

See merge request fforesight/layout-parser!175
2024-07-01 11:13:27 +02:00
Kilian Schüttler
66d3433e04 RED-9353: use azure ocr service 2024-07-01 11:13:26 +02:00
Yannik Hampe
a2f559af51 Merge branch 'RED-3813' into 'main'
RED-3813: Recategorize same image as experimental feature

See merge request fforesight/layout-parser!155
2024-06-26 13:42:42 +02:00
Yannik Hampe
39f527a57c Merge branch 'main' into 'RED-3813'
# Conflicts:
#   layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
2024-06-26 09:10:59 +02:00
yhampe
5c2844fe31 RED-3813: Recategorize same image as experimental feature
fixed failing test
2024-06-26 09:08:37 +02:00
Kilian Schüttler
b216f02e15 Merge branch 'RED-9194' into 'main'
RED-9194: roll back single digit headline change

See merge request fforesight/layout-parser!171
2024-06-21 15:13:40 +02:00
Kilian Schuettler
2e2f30ba35 RED-9194: roll back single digit headline change 2024-06-21 14:42:30 +02:00
Kilian Schuettler
9f7ed974ec RED-9194: roll back single digit headline change 2024-06-21 14:41:30 +02:00
Kilian Schuettler
570a348a77 RED-9194: roll back single digit headline change 2024-06-21 14:39:47 +02:00
Maverick Studer
859dba2ecf Merge branch 'RED-9374' into 'main'
hotfix for table/paragraph section creation on document start before first headline

See merge request fforesight/layout-parser!170
2024-06-18 17:36:04 +02:00
Maverick Studer
1c5d755111 hotfix for table/paragraph section creation on document start before first headline 2024-06-18 17:36:04 +02:00
Maverick Studer
133e06460f Merge branch 'RED-9374' into 'main'
RED-9374: Ner Entities are at wrong locations

See merge request fforesight/layout-parser!169
2024-06-18 16:31:24 +02:00
Maverick Studer
da91fcff97 RED-9374: Ner Entities are at wrong locations 2024-06-18 16:31:24 +02:00
Kilian Schüttler
79795e408a Merge branch 'RED-9194' into 'main'
RED-9194: allow single digit headline identifiers

See merge request fforesight/layout-parser!168
2024-06-07 09:09:25 +02:00
Kilian Schuettler
b719db86ab RED-9194: allow single digit headline identifiers 2024-06-06 16:32:05 +02:00
Maverick Studer
797602e373 Merge branch 'thread-safe-hcs-fields' into 'main'
fixed issue with thread-safety of local fields in the HeadlineClassificationService

See merge request fforesight/layout-parser!167
2024-06-06 14:51:24 +02:00
maverickstuder
3d2f66cf10 fixed issue with thread-safety of local fields in the HeadlineClassificationService:
* HeadlineClassificationService is no singleton anymore
* instead initialize it in the ClassificationService and pass it to the classifyMethods as required
2024-06-06 14:39:23 +02:00
Kilian Schüttler
e304a9f2d7 Merge branch 'RED-7074-le' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!166
2024-06-06 13:22:14 +02:00
Maverick Studer
c05f67cf44 RED-7074: Design Subsection section tree structure algorithm 2024-06-06 13:22:14 +02:00
yhampe
9ecf9ca19f RED-3813: Recategorize same image as experimental feature
now writing hash into structure
2024-06-05 14:20:33 +02:00
Corina Olariu
3a2ee903af Merge branch 'RED-9206-2' into 'main'
RED-9206 - Sections are no longer correctly separated from each other in the test file

See merge request fforesight/layout-parser!165
2024-06-05 13:39:15 +02:00
Corina Olariu
072a8aa3da RED-9206 - Sections are no longer correctly separated from each other in the test file
- add REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH case
2024-06-05 14:26:54 +03:00
Corina Olariu
b5cfa7b63d Merge branch 'RED-9206' into 'main'
RED-9206 - Sections are no longer correctly separated from each other in the test file

See merge request fforesight/layout-parser!163
2024-06-05 13:13:45 +02:00
Corina Olariu
5f5a6258c5 Merge branch 'main' into RED-9206 2024-06-05 13:34:14 +03:00
Maverick Studer
ac0e83725a Merge branch 'RED-7074-lgs' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!164
2024-06-05 12:28:00 +02:00
Maverick Studer
5d33ad570e RED-7074: Design Subsection section tree structure algorithm 2024-06-05 12:28:00 +02:00
Corina Olariu
fd698a78fc RED-9206 - Sections are no longer correctly separated from each other in the test file
- introduce new layout parsing type: REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH to include changes from REDACT_MANAGER apart from duplicate paragraph.
- updated junit tests
-
2024-06-04 20:55:37 +03:00
Maverick Studer
c3edeb3c7d Merge branch 'RED-7074-test' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!162
2024-06-04 15:07:40 +02:00
Maverick Studer
fc06dba2ce RED-7074: Design Subsection section tree structure algorithm 2024-06-04 15:07:40 +02:00
Maverick Studer
b6742c1e89 Merge branch 'RED-7074_2' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!160
2024-05-28 14:48:21 +02:00
Maverick Studer
efb1a748af RED-7074: Design Subsection section tree structure algorithm 2024-05-28 14:48:21 +02:00
yhampe
9be672c728 RED-3813: Recategorize same image as experimental feature
working on pushing properties to persistence service
2024-05-28 13:51:45 +02:00
Maverick Studer
23985b14be Merge branch 'RED-7074_2' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!159
2024-05-24 13:30:25 +02:00
Maverick Studer
48b7a22e2b RED-7074: Design Subsection section tree structure algorithm 2024-05-24 13:30:25 +02:00
Corina Olariu
546341ee75 Merge branch 'RED-9177' into 'main'
RED-9177 - Layout parser fails to process file

See merge request fforesight/layout-parser!158
2024-05-22 13:26:10 +02:00
Corina Olariu
0ed1481517 RED-9177 - Layout parser fails to process file
- use originFile as viewerDocumentFile
- return layoutGridOCGName in case the name is found and not check further properties
2024-05-22 13:02:42 +03:00
Andrei Isvoran
b2a47f66ae Merge branch 'RED-9149-header' into 'main'
RED-9149 - Remove header detection

See merge request fforesight/layout-parser!157
2024-05-20 14:12:04 +02:00
Andrei Isvoran
3835d03036 RED-9149 - Remove header detection 2024-05-20 14:59:34 +03:00
yhampe
a5fcebce30 RED-3813: Recategorize same image as experimental feature
added representation to image and DocumentStructure
2024-05-17 07:34:05 +02:00
Dominique Eifländer
b867deb9f9 Merge branch 'CLARI-hotfix' into 'main'
hotfix for clarifynd

See merge request fforesight/layout-parser!154
2024-05-15 14:08:07 +02:00
Kilian Schuettler
8648ed0952 hotfix for clarifynd 2024-05-15 14:02:02 +02:00
Kilian Schüttler
53f786b539 Merge branch 'RED-9149' into 'main'
RED-9149 - Header and footer detection by page-association

See merge request fforesight/layout-parser!150
2024-05-13 14:57:33 +02:00
Andrei Isvoran
40465e8778 RED-9149 - Improvements 2024-05-13 15:13:37 +03:00
Andrei Isvoran
a76b2ace3f RED-9149 - Address comments 2024-05-13 13:18:33 +03:00
Andrei Isvoran
aeaca2f278 RED-9149 - Header and footer extraction by page-association 2024-05-10 16:04:06 +03:00
Andrei Isvoran
f1dbcc24a2 RED-9149 - Header and footer extraction by page-association 2024-05-10 15:49:08 +03:00
Andrei Isvoran
fda25852d1 RED-9149 - Header and footer extraction by page-association 2024-05-10 15:17:41 +03:00
Dominique Eifländer
471fadbcca Merge branch 'RED-8933-4.1' into 'main'
RED-8933: Fixed bugs in DocumineClassificationService

See merge request fforesight/layout-parser!148
2024-05-08 13:31:17 +02:00
Dominique Eifländer
87001090d5 RED-8933: Fixed bugs in DocumineClassificationService 2024-05-08 13:01:23 +02:00
Timo Bejan
ea355429c2 Merge branch 'RED-8825-fix' into 'main'
RED-8825: minor fixes

See merge request fforesight/layout-parser!146
2024-05-07 17:47:07 +02:00
Kilian Schuettler
6a65d7f9fc RED-8825: minor fixes
* also added overrides via env variables
2024-05-07 17:37:42 +02:00
Kilian Schuettler
e935cc7b14 RED-8825: some fixes, and experimental column detector 2024-05-06 14:24:39 +02:00
Kilian Schüttler
07733d0855 Merge branch 'RED-8825' into 'main'
RED-8825: improve layoutparsing

See merge request fforesight/layout-parser!132
2024-05-03 12:03:03 +02:00
Kilian Schuettler
abb249e966 RED-8825: general layoutparsing improvements
* fix checkstyle
2024-05-03 00:15:31 +02:00
Kilian Schuettler
bcd1eb9afa RED-8825: general layoutparsing improvements
* added test for table line classification
2024-05-03 00:13:48 +02:00
Kilian Schuettler
60acbac53f RED-8825: general layoutparsing improvements
* fixing a bunch of coordinates
2024-05-03 00:06:29 +02:00
Kilian Schuettler
a3decd292d RED-8825: general layoutparsing improvements
* fix RulingCleaningService
2024-05-02 23:00:22 +02:00
Kilian Schuettler
b6f0a21886 RED-8825: general layoutparsing improvements
* refactor all coordinates
2024-05-02 21:01:25 +02:00
Kilian Schuettler
d61cac8b4f RED-8825: general layoutparsing improvements
* fix tests
2024-04-30 16:06:22 +02:00
Kilian Schuettler
ae46c5f1ca RED-8825: general layoutparsing improvements
* fix tests
2024-04-30 11:55:18 +02:00
Kilian Schuettler
f0a70a5242 RED-8825: general improvements
* some more refactoring
 * fixed text ruling classification for vertical text
 * shrunk min graphics size
2024-04-30 11:09:23 +02:00
Kilian Schuettler
15ea385f4d RED-8825: general improvements
* some more refactoring
 * fixed text ruling classification for vertical text
 * shrunk min graphics size
2024-04-30 10:44:32 +02:00
Kilian Schuettler
08be18db2d RED-8825: general improvements
* some more refactoring
2024-04-29 20:09:53 +02:00
Kilian Schuettler
64209255cb RED-8825: general improvements
* classify rulings as underline/strikethrough
* improve performance of CleanRulings.lineBetween
* use lineBetween where possible
* wip, still todo:
 - Header/Footer by Ruling for all rotations
 - actually the ticket, optimizing layoutparsing for documine
2024-04-29 17:24:15 +02:00
Kilian Schuettler
4761d2e1a2 RED-8825: general improvements
* classify rulings as underline/strikethrough
* improve performance of CleanRulings.lineBetween
* use lineBetween where possible
* wip, still todo:
 - Header/Footer by Ruling for all rotations
 - actually the ticket, optimizing layoutparsing for documine
2024-04-29 17:22:33 +02:00
Kilian Schuettler
1916e626df RED-8825: general improvements
* classify rulings as underline/strikethrough
* improve performance of CleanRulings.lineBetween
* use lineBetween where possible
* wip, still todo:
 - Header/Footer by Ruling for all rotations
 - actually the ticket, optimizing layoutparsing for documine
2024-04-29 17:15:19 +02:00
Kilian Schuettler
e4663ac8db RED-8825: added split by ruling into every step of docstrum 2024-04-29 15:54:56 +02:00
Kilian Schuettler
6a691183dc RED-8825: improve layoutparsing
* added improved debugging capabilities to viewer-doc
* refactored coordinates (wip)
* refactored line intersection algorithm
* removed cropbox correction from pdfbox text positions
2024-04-29 15:54:56 +02:00
Kilian Schuettler
3dd215288a RED-8825: improve layoutparsing
* added improved debugging capabilities to viewer-doc
* refactored coordinates (wip)
* refactored line intersection algorithm
* removed cropbox correction from pdfbox text positions
2024-04-29 15:54:53 +02:00
Kilian Schüttler
6fb1a0bef3 Merge branch 'RED-8992' into 'main'
RED-8992 - Enable to add annotation on header with line breaks

See merge request fforesight/layout-parser!143
2024-04-25 13:03:40 +02:00
Corina Olariu
4e7c3f584b RED-8992 - Enable to add annotation on header with line breaks
- don't reorder textblocks classified as headers and footers
- add unit test
2024-04-25 11:23:10 +03:00
Yannik Hampe
84bdb4d1ed Merge branch 'RED-8701' into 'main'
RED-8701 - Move files to customer data repositories

See merge request fforesight/layout-parser!137
2024-04-25 09:06:35 +02:00
Dominique Eifländer
75ab4df592 Merge branch 'RED-8932' into 'main'
RED-8932 Fixed not merged headline with identifier

See merge request fforesight/layout-parser!141
2024-04-24 11:55:01 +02:00
Dominique Eifländer
8442e60055 RED-8932 Fixed not merged headline with identifier 2024-04-24 11:45:38 +02:00
Corina Olariu
0ef67fc07b RED-8701 - Move files to customer data repositories
- update junit tests and syngenta submodule
2024-04-23 14:54:56 +03:00
Corina Olariu
ea02f31a84 Merge branch 'main' into RED-8701
# Conflicts:
#	layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
2024-04-23 14:20:00 +03:00
Dominique Eifländer
58acbab85f Merge branch 'RED-8826' into 'main'
Red 8826

See merge request fforesight/layout-parser!138
2024-04-23 13:12:51 +02:00
Kilian Schüttler
d38d023485 Merge branch 'RED-7384' into 'main'
Red 7384

See merge request fforesight/layout-parser!140
2024-04-23 12:13:21 +02:00
Kilian Schüttler
c1afe9b11f Red 7384 2024-04-23 12:13:19 +02:00
Corina Olariu
bdcb9aeda4 RED-8701 - Move files to customer data repositories
- update junit tests
2024-04-23 11:49:29 +03:00
Corina Olariu
6a86036a78 Merge branch 'main' into RED-8701 2024-04-23 11:46:59 +03:00
Corina Olariu
a358d7565e RED-8701 - Move files to customer data repositories
- update junit tests
2024-04-23 11:12:57 +03:00
Corina Olariu
069a6c0b49 RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 10:44:23 +03:00
Dominique Eifländer
683f7f1fb8 RED-8826: Do not classify textblocks in graphics as headlines 2024-04-23 09:28:28 +02:00
Corina Olariu
7eab3a4088 RED-8701 - Move files to customer data repositories
- remove customer files from project
2024-04-22 14:57:51 +03:00
Corina Olariu
970fc99ed1 RED-8701 - Move files to customer data repositories
- update junit test
2024-04-22 14:14:47 +03:00
Corina Olariu
48c54f63a0 RED-8701 - Move files to customer data repositories
- update submodules
2024-04-22 13:57:39 +03:00
Corina Olariu
20e4e5ddff RED-8701 - Move files to customer data repositories
- update unit tests with the new path to submodules for customer files
2024-04-22 13:37:27 +03:00
Dominique Eifländer
b53930328a RED-8826: Implemented graphics detection 2024-04-19 15:05:17 +02:00
Dominique Eifländer
c947d552d2 Merge branch 'RED-8995-fp' into 'main'
RED-8995: unclassified text might be missing from document data

See merge request fforesight/layout-parser!135
2024-04-19 09:21:50 +02:00
Corina Olariu
6b1b5eab84 RED-8701 - Move files to customer data repositories
- add syngenta submodule
2024-04-18 20:33:00 +03:00
Corina Olariu
cc9816c8cb RED-8701 - Move files to customer data repositories
- use git lfs to store customer files
2024-04-18 20:31:35 +03:00
Kilian Schuettler
f256f9b30f RED-8995: unclassified text might be missing from document data
* treat TablePageBlock.OTHER like PARAGRAPH (no special treatment)
2024-04-18 17:42:34 +02:00
Yannik Hampe
6167e3fb57 Merge branch 'RED-8402' into 'main'
RED-8402: Header and footer are not indexed / searched

See merge request fforesight/layout-parser!134
2024-04-18 15:08:00 +02:00
yhampe
a78fb0244a Merge remote-tracking branch 'origin/RED-8402' into RED-8402 2024-04-18 14:39:10 +02:00
yhampe
8099a00bb6 RED-8402: Header and footer are not indexed / searched
added unit test and file
2024-04-18 14:39:01 +02:00
yhampe
9bb0468b2b RED-8402: Header and footer are not indexed / searched
added unit test and file
2024-04-18 14:36:25 +02:00
Kilian Schüttler
c4d9c5df02 Merge branch 'RED-8747-fp' into 'main'
RED-8747 - Entities not merged properly - fp

See merge request fforesight/layout-parser!131
2024-04-09 16:30:02 +02:00
Corina Olariu
976f408237 RED-8747 - Entities not merged properly - fp
- rework the extraction of rulings from the table cells
2024-04-09 14:38:48 +03:00
Corina Olariu
319268c53d RED-8747 - Entities not merged properly - fp
- update test
2024-04-09 12:24:19 +03:00
Corina Olariu
014eba9fc3 RED-8747 - Entities not merged properly - fp
- fix typo
- add validate table test
2024-04-09 12:14:57 +03:00
Yannik Hampe
9bd8419770 Merge branch 'RED-8402' into 'main'
RED-8402: Header and footer are not indexed / searched

See merge request fforesight/layout-parser!128
2024-04-08 12:28:06 +02:00
yhampe
c13ff7fbf6 RED-8402: Header and footer are not indexed / searched
checkstyle
added review comments
2024-04-08 12:17:49 +02:00
yhampe
5d3826e9b9 Merge remote-tracking branch 'origin/RED-8402' into RED-8402 2024-04-08 12:02:47 +02:00
yhampe
0c3194276a RED-8402: Header and footer are not indexed / searched
added headers and footers to simplifiedtext
2024-04-08 12:02:36 +02:00
yhampe
e302d9784e RED-8402: Header and footer are not indexed / searched
added headers and footers to simplifiedtext
2024-04-08 11:59:35 +02:00
Corina Olariu
f185b13f2b RED-8747 - Entities not merged properly - fp
- use the rullings from the found tables instead of all rullings as splitting rullings in the blockification service
2024-04-08 09:42:32 +03:00
Dominique Eifländer
990c376ce6 Merge branch 'RED-8873' into 'main'
RED-8773 - Fix images not appearing on specific file

See merge request fforesight/layout-parser!123
2024-04-05 10:11:23 +02:00
Kilian Schüttler
bf6a0d770b Merge branch 'RED-8799' into 'main'
RED-8799: LayoutGrid is wrong draw for some tables

See merge request fforesight/layout-parser!126
2024-04-04 15:23:12 +02:00
Kilian Schuettler
f18bda1d4e RED-8799: LayoutGrid is wrong draw for some tables 2024-04-04 13:33:22 +02:00
Maverick Studer
0a11992361 Merge branch 'RED-8702' into 'main'
RED-8702: Explore document databases to store entityLog

See merge request fforesight/layout-parser!125
2024-04-03 10:00:38 +02:00
Andrei Isvoran
456b8fe4a1 RED-8773 - Fix images not appearing on specific file 2024-04-03 10:20:46 +03:00
maverickstuder
9778ece992 RED-8702: Explore document databases to store entityLog
* fix for duplicate images in document structure that are linked to multiple sections
2024-04-02 14:19:14 +02:00
Timo Bejan
8bd0de6263 Merge branch 'RED-8827' into 'main'
Red 8827

See merge request fforesight/layout-parser!122
2024-03-22 12:25:36 +01:00
Timo Bejan
5c1708f97f Issue with merging text blocks multiple times 2024-03-22 12:47:05 +02:00
Timo Bejan
a35d77be2e ignore mc files 2024-03-22 10:22:00 +02:00
Dominique Eifländer
631160eb22 Merge branch 'RED-8627' into 'main'
RED-8627: Fixed scrambled text after sorting

See merge request fforesight/layout-parser!120
2024-03-19 11:09:48 +01:00
Dominique Eifländer
8e7e588d26 RED-8627: Fixed scrambled text after sorting 2024-03-19 10:58:36 +01:00
Dominique Eifländer
ac850c2626 Merge branch 'RED-7141' into 'main'
RED-7141: Fixed more overlap problems

See merge request fforesight/layout-parser!119
2024-03-14 16:46:10 +01:00
Dominique Eifländer
1d765a6baa RED-7141: Fixed more overlap problems 2024-03-14 16:30:52 +01:00
Dominique Eifländer
c55984aa67 Merge branch 'RED-7141' into 'main'
RED-7141: Fixed overlapping blocks

See merge request fforesight/layout-parser!118
2024-03-14 09:09:52 +01:00
Dominique Eifländer
27aa418029 RED-7141: Fixed overlapping blocks 2024-03-13 16:14:55 +01:00
Dominique Eifländer
c4edff4696 Merge branch 'RED-7141' into 'main'
RED-7141: Readded lost mergeLinesInZones

See merge request fforesight/layout-parser!116
2024-03-12 13:49:09 +01:00
Dominique Eifländer
92fd1a72de RED-7141: Readded lost mergeLinesInZones 2024-03-12 13:42:40 +01:00
Dominique Eifländer
0d3d25e7d7 Merge branch 'RED-7141-hotfix' into 'main'
RED-7141: Align backend text sorting with Webviewer sorting

See merge request fforesight/layout-parser!115
2024-03-12 11:15:41 +01:00
maverickstuder
956fbff872 RED-7141: Align backend text sorting with Webviewer sorting
* hotfix for tables not being detected due to wrong x-y-sorting
2024-03-12 11:06:53 +01:00
Maverick Studer
2488009af1 Merge branch 'RED-8715' into 'main'
RED-8715: Improve NearestNeighbor Algorithm in LayoutParser

See merge request fforesight/layout-parser!114
2024-03-11 15:10:41 +01:00
maverickstuder
16be2467fd RED-8715: Improve NearestNeighbor Algorithm in LayoutParser
* replaced the old algorithm with an algorithm based on a kd-tree
2024-03-11 14:42:28 +01:00
Timo Bejan
f4cae8a7dc Merge branch 'Clarifynd' into 'main'
Clarifynd

See merge request fforesight/layout-parser!113
2024-03-11 10:37:05 +01:00
Timo Bejan
dfc23955d7 Linespacing claryfind 2024-03-11 11:30:51 +02:00
Dominique Eifländer
d6e3d6fe22 Clarifynd 2024-03-11 11:24:58 +02:00
Timo Bejan
bef23e38b5 Merge branch 'clari-30' into 'main'
CLARI-30 - forward analysis headers

See merge request fforesight/layout-parser!112
2024-03-08 15:51:53 +01:00
Timo Bejan
65ab7a1912 CLARI-30 - forward analysis headers 2024-03-08 16:47:27 +02:00
Timo Bejan
d80231e4a9 Merge branch 'clari-30' into 'main'
CLARI-30 - identifier fix for clarifynd

See merge request fforesight/layout-parser!111
2024-03-08 15:28:33 +01:00
Timo Bejan
56c07a4491 CLARI-30 - identifier fix for clarifynd 2024-03-08 16:23:27 +02:00
Dominique Eifländer
0b4ad29dcb Merge branch 'RED-7141' into 'main'
RED-7141: Implemented docstrum layout parsing

See merge request fforesight/layout-parser!108
2024-03-08 14:27:59 +01:00
Dominique Eifländer
0ad0cd45d6 RED-7141: Moved docstrum to root level of processor package 2024-03-08 14:20:28 +01:00
Dominique Eifländer
d659fe7234 RED-7141: Performance improvments 2024-03-08 10:00:52 +01:00
Dominique Eifländer
cb9127b4f3 RED-7141: Fixed pr finding and improved speed 2024-03-07 16:51:48 +01:00
Timo Bejan
05523585c0 orchestrator/persistence service should control queues 2024-03-06 16:55:44 +02:00
Timo Bejan
4ced572949 orchestrator/persistence service should control queues 2024-03-06 16:53:10 +02:00
Dominique Eifländer
79239b751d RED-7141: Implemented docstrum layout parsing 2024-03-06 11:18:40 +01:00
358 changed files with 15034 additions and 31479 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
*.pdf filter=lfs diff=lfs merge=lfs -text

2
.gitignore vendored
View File

@ -42,3 +42,5 @@ gradlew.bat
gradlew
gradle.properties
gradle/
.DS_Store
.DS_Store/

View File

@ -1,3 +1,7 @@
variables:
# SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include:
- project: 'gitlab/gitlab'
ref: 'main'
@ -17,5 +21,6 @@ deploy:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

8
.gitmodules vendored Normal file
View File

@ -0,0 +1,8 @@
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
update = merge
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
update = merge

View File

@ -8,6 +8,8 @@ plugins {
group = "com.knecon.fforesight"
val documentVersion by rootProject.extra { "4.433.0" }
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
@ -42,6 +44,19 @@ tasks.jacocoTestReport {
}
allprojects {
tasks.withType<Javadoc> {
options {
this as StandardJavadocDocletOptions
addBooleanOption("Xdoclint:none", true)
addStringOption("Xmaxwarns", "1")
}
}
pmd {
setConsoleOutput(true)
}
publishing {
publications {
create<MavenPublication>(name) {
@ -64,6 +79,7 @@ java {
withJavadocJar()
}
repositories {
mavenLocal()
mavenCentral()

View File

@ -1,28 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Top-level container for the result of layout parsing a document.
 *
 * <p>It bundles four parts: the page information, the text blocks, the text position blocks and the
 * semantic tree structure. All fields are made private and final by {@code @FieldDefaults}; the
 * all-args constructor and builder are generated by Lombok, so the field declaration order below
 * defines the constructor parameter order.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@Schema(description = "Object containing the complete document layout parsing information. It is split into 4 categories, structure, text, positions and pages: " + "The document tree structure of SemanticNodes such as Section, Paragraph, Headline, etc. " + "The text, which is stored as separate blocks of data. " + "The text positions, which are also stored as separate blocks. The Blocks are equal to the text blocks in length and order. " + "The page information.")
public class DocumentData implements Serializable {
// Page information of the document.
@Schema(description = "Contains information about the document's pages.")
DocumentPage[] documentPages;
// Text blocks of the document.
@Schema(description = "Contains information about the document's text.")
DocumentTextData[] documentTextData;
// Position blocks; per the class-level description these match the text blocks in length and order.
@Schema(description = "Contains information about the document's text positions.")
DocumentPositionData[] documentPositions;
// Semantic tree (sections, paragraphs, headlines, ...) of the document.
@Schema(description = "Contains information about the document's semantic structure.")
DocumentStructure documentStructure;
}

View File

@ -1,30 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable description of a single page of a parsed document: its number, size and rotation.
 *
 * <p>Accessors, builder and constructors are generated by Lombok; fields are made private by
 * {@code @FieldDefaults}.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing information about the document's pages.")
public class DocumentPage implements Serializable {
// 1-based page number.
@Schema(description = "The page number, starting with 1.")
int number;
// Page height in PDF user units (stored as an int).
@Schema(description = "The page height in PDF user units.", example = "792")
int height;
// Page width in PDF user units (stored as an int).
@Schema(description = "The page width in PDF user units.", example = "694")
int width;
// Rotation as given by the PDF; the schema documents 0, 90, 180 and 270 as allowable values.
@Schema(description = "The page rotation as specified by the PDF.", example = "90", allowableValues = {"0", "90", "180", "270"})
int rotation;
}

View File

@ -1,28 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable positional information for one text block.
 *
 * <p>String offsets of the block's search text are not glyph indices; {@link #stringIdxToPositionIdx}
 * maps the former to rows of {@link #positions}.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing text positional information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
public class DocumentPositionData implements Serializable {
// Identifier of the text block this position data belongs to.
@Schema(description = "Identifier of the text block.")
Long id;
// One entry per string coordinate of the search text, giving the corresponding position index.
@Schema(description = "For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate. This is required due to the text and position coordinates not being equal.")
int[] stringIdxToPositionIdx;
// Glyph bounding boxes as rows of {x, y, width, height}, with (x, y) the lower left corner.
@Schema(description = "The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block. The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner. In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.")
float[][] positions;
}

View File

@ -1,144 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.awt.geom.Rectangle2D;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable tree of {@link EntryData} nodes describing the semantic structure (sections,
 * paragraphs, headlines, tables, ...) of a parsed document.
 *
 * <p>The nested {@code *Properties} classes only hold the key names used in
 * {@link EntryData#getProperties()} for the respective node types.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing information about the parsed tree structure of the SemanticNodes, such as Section, Paragraph, Headline etc inside of the document.")
public class DocumentStructure implements Serializable {

    @Schema(description = "The root EntryData represents the Document.")
    EntryData root;

    /** Key names of the extra properties stored for a table node. */
    @Schema(description = "Object containing the extra field names, a table has in its properties field.")
    public static class TableProperties implements Serializable {

        public static final String NUMBER_OF_ROWS = "numberOfRows";
        public static final String NUMBER_OF_COLS = "numberOfCols";

    }

    /** Key names of the extra properties stored for an image node. */
    @Schema(description = "Object containing the extra field names, an Image has in its properties field.")
    public static class ImageProperties implements Serializable {

        public static final String TRANSPARENT = "transparent";
        public static final String IMAGE_TYPE = "imageType";
        public static final String POSITION = "position";
        public static final String ID = "id";

    }

    /** Key names of the extra properties stored for a table cell node. */
    @Schema(description = "Object containing the extra field names, a table cell has in its properties field.")
    public static class TableCellProperties implements Serializable {

        public static final String B_BOX = "bBox";
        public static final String ROW = "row";
        public static final String COL = "col";
        public static final String HEADER = "header";

    }

    /** Separator between the four numeric components of a serialized rectangle. */
    public static final String RECTANGLE_DELIMITER = ";";


    /**
     * Parses a rectangle serialized as four float values joined by {@link #RECTANGLE_DELIMITER}.
     * The values are passed to {@link Rectangle2D.Float} in the order x, y, width, height.
     *
     * @param bBox the serialized rectangle, e.g. {@code "1.0;2.0;30.0;40.0"}
     * @return the parsed rectangle
     * @throws NumberFormatException if a component is not a valid float
     * @throws IndexOutOfBoundsException if fewer than four components are present
     */
    public static Rectangle2D parseRectangle2D(String bBox) {

        List<Float> floats = Arrays.stream(bBox.split(RECTANGLE_DELIMITER)).map(Float::parseFloat).toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }


    /**
     * Resolves the entry addressed by the given tree id by descending from the root, using each
     * element as an index into the current entry's children.
     *
     * @param tocId the child indices to follow; an empty list addresses the root
     * @return the addressed entry
     * @throws IndexOutOfBoundsException if an index does not exist at its level
     */
    public EntryData get(List<Integer> tocId) {

        EntryData entry = root;
        for (int id : tocId) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /**
     * Streams the root and all of its descendants in depth-first pre-order.
     *
     * <p>Every entry is emitted exactly once. The previous implementation flattened the root and,
     * additionally, each of the root's children, which emitted every non-root entry twice.
     *
     * @return a stream over all entries, empty if no root is set
     */
    public Stream<EntryData> streamAllEntries() {

        if (root == null) {
            return Stream.empty();
        }
        return flatten(root);
    }


    /**
     * Returns one line per entry, in the order of {@link #streamAllEntries()}.
     */
    @Override
    public String toString() {

        return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
    }


    /** Recursively streams the entry followed by all of its descendants. */
    private static Stream<EntryData> flatten(EntryData entry) {

        // children may be unset when the entry was created via the no-args constructor or builder
        if (entry.children == null) {
            return Stream.of(entry);
        }
        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentStructure::flatten));
    }


    /**
     * A single node of the document tree together with its child nodes.
     */
    @Data
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE)
    @Schema(description = "Object containing information of a SemanticNode and also structuring the layout with children.")
    public static class EntryData implements Serializable {

        @Schema(description = "Type of the semantic node.", allowableValues = {"DOCUMENT", "SECTION", "PARAGRAPH", "HEADLINE", "TABLE", "TABLE_CELL", "HEADER", "FOOTER", "IMAGE"})
        NodeType type;
        @Schema(description = "Specifies the position in the parsed tree structure.", example = "[1, 0, 2]")
        int[] treeId;
        @Schema(description = "Specifies the text block IDs associated with this semantic node. The value should be joined with the DocumentTextData/DocumentPositionData. Is empty, if no text block is directly associated with this semantic node. Only Paragraph, Headline, Header or Footer is directly associated with a text block.", example = "[1]")
        Long[] atomicBlockIds;
        @Schema(description = "Specifies the pages this semantic node appears on. The value should be joined with the PageData.", example = "[1, 2, 3]")
        Long[] pageNumbers;
        @Schema(description = "Some semantic nodes have additional information, this information is stored in this Map. The extra fields are specified by the Properties subclasses.", example = "For a Table: {\"numberOfRows\": 3, \"numberOfCols\": 4}")
        Map<String, String> properties;
        @Schema(description = "All child Entries of this Entry.", example = "[1, 2, 3]")
        List<EntryData> children;
        @Schema(description = "Describes the origin of the semantic node", example = "[ALGORITHM]")
        Set<LayoutEngine> engines;


        /**
         * Renders the entry as {@code [treeId]: type atbs = n}, where the tree id components are
         * comma separated and {@code n} is the number of atomic block ids.
         *
         * <p>An empty or unset tree id is rendered as {@code []} (the previous implementation
         * removed the opening bracket in that case), and an unset block id array counts as 0.
         */
        @Override
        public String toString() {

            StringBuilder sb = new StringBuilder("[");
            if (treeId != null) {
                for (int i = 0; i < treeId.length; i++) {
                    // separator only between components, so nothing has to be trimmed afterwards
                    if (i > 0) {
                        sb.append(",");
                    }
                    sb.append(treeId[i]);
                }
            }
            sb.append("]: ");
            sb.append(type);
            sb.append(" atbs = ");
            sb.append(atomicBlockIds == null ? 0 : atomicBlockIds.length);
            return sb.toString();
        }

    }

}

View File

@ -1,36 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Serializable text information for one text block of a parsed document.
 *
 * <p>Accessors, builder and constructors are generated by Lombok; the field declaration order
 * defines the parameter order of the generated all-args constructor.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@Schema(description = "Object containing text information of a specific text block. A document is split into multiple text blocks, which are supposed to be read in order. Every text block can only occur on a single page.")
public class DocumentTextData implements Serializable {

    @Schema(description = "Identifier of the text block.")
    Long id;
    @Schema(description = "The page the text block occurs on.")
    Long page;
    // Description fixed: it previously read "The text the text block."
    @Schema(description = "The text of the text block.")
    String searchText;
    @Schema(description = "Each text block is assigned a number on a page, starting from 0.")
    int numberOnPage;
    @Schema(description = "The text blocks are ordered, this number represents the start of the text block as a string offset.")
    int start;
    @Schema(description = "The text blocks are ordered, this number represents the end of the text block as a string offset.")
    int end;
    @Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
    int[] lineBreaks;

}

View File

@ -1,6 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
/**
 * Origin of a semantic node; stored per node in {@code DocumentStructure.EntryData#engines}.
 * NOTE(review): presumably ALGORITHM marks rule-based detection and AI marks model-based
 * detection - confirm against the producers of these values.
 */
public enum LayoutEngine {
ALGORITHM,
AI
}

View File

@ -1,22 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.io.Serializable;
import java.util.Locale;
/**
 * Kinds of semantic nodes that can occur in the document tree.
 *
 * <p>The declaration order is kept stable because it determines the ordinals of the constants.
 */
public enum NodeType implements Serializable {
    DOCUMENT,
    SECTION,
    HEADLINE,
    PARAGRAPH,
    TABLE,
    TABLE_CELL,
    IMAGE,
    HEADER,
    FOOTER;


    /**
     * Returns the constant name with its first character unchanged and the remainder lower-cased
     * using {@link Locale#ROOT}, e.g. {@code TABLE_CELL} becomes {@code Table_cell}.
     */
    @Override
    public String toString() {

        String constantName = name();
        String loweredTail = constantName.substring(1).toLowerCase(Locale.ROOT);
        return constantName.charAt(0) + loweredTail;
    }
}

View File

@ -14,7 +14,7 @@ import lombok.NoArgsConstructor;
public class SimplifiedSectionText {
@Schema(description = "The number of this Section. This is used to map the simplified section text back to the original Section.")
private int sectionNumber;
private String sectionNumber;
@Schema(description = "The text in this Section.")
private String text;

View File

@ -19,6 +19,16 @@ public class SimplifiedText {
@Schema(description = "Number of pages in the entire document.")
private int numberOfPages;
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
@Builder.Default
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
@Schema(description = "A list of the main section numbers ")
@Builder.Default
private List<String> mainSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the header section numbers ")
@Builder.Default
private List<String> headerSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the footer section numbers ")
@Builder.Default
private List<String> footerSectionNumbers = new ArrayList<>();
}

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -13,6 +15,8 @@ public class StructureObject {
@Schema(description = "The ID of this StructureObject.")
Integer structureObjectNumber;
@Schema(description = "The Tree ID of this StructureObject.")
List<Integer> treeId;
@Schema(description = "This value indicates the start of the string offsets in this Object, with respect to the reading order.")
int page;
@Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.")

View File

@ -8,13 +8,20 @@ import lombok.Builder;
@Builder
@Schema(description = "Object containing information about the layout parsing.")
public record LayoutParsingFinishedEvent(
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.")
Map<String, String> identifier,//
@Schema(description = "The duration of a single layout parsing in ms.")
long duration,//
@Schema(description = "The number of pages of the parsed document.")
int numberOfPages,//
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.")
String message) {
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
Map<String, String> identifier,
@Schema(description = "The duration of a single layout parsing in ms.") //
long duration,
@Schema(description = "The number of pages of the parsed document.") //
int numberOfPages,
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
String message,
@Schema(description = "The app version of the layout parser.") //
String layoutParserVersion
) {
}

View File

@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public class LayoutParsingQueueNames {
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_dead_letter_queue";
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request";
public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange";
public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
}

View File

@ -19,7 +19,6 @@ public record LayoutParsingRequest(
@Schema(description = "Path to the original PDF file.")//
@NonNull String originFileStorageId,//
@Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,//
@Schema(description = "Optional Path to the image classification file.")//
@ -37,9 +36,12 @@ public record LayoutParsingRequest(
@NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,//
@Schema(description = "Path where the Document Markdown File will be stored.")//
Optional<String> documentMarkdownFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId) {
@NonNull String viewerDocumentStorageId
) {
}

View File

@ -2,6 +2,11 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public enum LayoutParsingType {
REDACT_MANAGER,
TAAS,
DOCUMINE
REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG,
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
DOCUMINE,
DOCUMINE_OLD,
CLARIFYND,
CLARIFYND_PARAGRAPH_DEBUG
}

View File

@ -8,20 +8,33 @@ description = "layoutparser-service-processor"
val jacksonVersion = "2.15.2"
val pdfBoxVersion = "3.0.0"
dependencies {
implementation(project(":layoutparser-service-internal-api"))
implementation(project(":viewer-doc-processor"))
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.144.0") {
implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
exclude("org.springframework.boot", "spring-boot-starter-security")
exclude("org.springframework.boot", "spring-boot-starter-validation")
}
implementation("com.knecon.fforesight:tenant-commons:0.21.0")
implementation("com.iqser.red.commons:storage-commons:2.45.0")
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
exclude("com.iqser.red.commons", "storage-commons")
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2")
implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
implementation("com.github.jai-imageio:jai-imageio-core:1.4.0")
implementation("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
implementation("org.tinspin:tinspin-indexes:2.1.3")
implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0")
implementation("org.apache.commons:commons-text:1.12.0")
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Spring configuration properties bound to the {@code layoutparser} prefix.
 *
 * <p>Accessors are generated by Lombok; fields are made private by {@code @FieldDefaults}.
 */
@Data
@Configuration
@ConfigurationProperties("layoutparser")
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParserSettings {
// NOTE(review): presumably toggles additional debug output of the parser - confirm at the usage sites.
boolean debug;
// NOTE(review): looks like an optional override for the parsing type of incoming requests; null when unset - confirm.
LayoutParsingType layoutParsingTypeOverride;
}

View File

@ -2,12 +2,15 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
@ -17,20 +20,35 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
@ -38,23 +56,24 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
@ -69,26 +88,32 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParsingPipeline {
ImageServiceResponseAdapter imageServiceResponseAdapter;
CvTableParsingAdapter cvTableParsingAdapter;
LayoutParsingStorageService layoutParsingStorageService;
SectionsBuilderService sectionsBuilderService;
TaasClassificationService taasClassificationService;
RedactManagerClassificationService redactManagerClassificationService;
DocuMineClassificationService docuMineClassificationService;
SimplifiedSectionTextService simplifiedSectionTextService;
BodyTextFrameService bodyTextFrameService;
RulingCleaningService rulingCleaningService;
TableExtractionService tableExtractionService;
TaasBlockificationService taasBlockificationService;
DocuMineBlockificationService docuMineBlockificationService;
RedactManagerBlockificationService redactManagerBlockificationService;
LayoutGridService layoutGridService;
ObservationRegistry observationRegistry;
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
final ImageServiceResponseAdapter imageServiceResponseAdapter;
final CvTableParsingAdapter cvTableParsingAdapter;
final LayoutParsingStorageService layoutParsingStorageService;
final SectionsBuilderService sectionsBuilderService;
final SimplifiedSectionTextService simplifiedSectionTextService;
final RulingCleaningService rulingCleaningService;
final TableExtractionService tableExtractionService;
final DocuMineBlockificationService docuMineBlockificationService;
final RedactManagerBlockificationService redactManagerBlockificationService;
final BlockificationPostprocessingService blockificationPostprocessingService;
final DocstrumBlockificationService docstrumBlockificationService;
final LayoutGridService layoutGridService;
final ObservationRegistry observationRegistry;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
final GraphicExtractorService graphicExtractorService;
final OutlineExtractorService outlineExtractorService;
final SectionTreeBuilderService sectionTreeBuilderService;
final SectionTreeEnhancementService sectionTreeEnhancementService;
final LayoutParserSettings settings;
final ClassificationService classificationService;
@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -100,62 +125,59 @@ public class LayoutParsingPipeline {
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
}
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
.orElse(new VisualLayoutParsingResponse());
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
.map(layoutParsingStorageService::getImagesFile)
.orElse(new ImageServiceResponse());
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
}
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
}
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
originFile,
imageServiceResponse,
tableServiceResponse,
visualLayoutParsingResponse,
layoutParsingRequest.identifier().toString());
layoutParsingRequest.identifier());
log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(classificationDocument);
DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
if (layoutParsingRequest.documentMarkdownFileStorageId()
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
if (layoutParsingRequest.researchDocumentStorageId() != null) {
log.info("Building research document data for {}", layoutParsingRequest.identifier());
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
}
if (!viewerDocumentFile.equals(originFile)) {
viewerDocumentFile.delete();
assert !viewerDocumentFile.exists() || viewerDocumentFile.delete();
}
originFile.delete();
assert !originFile.exists() || originFile.delete();
return LayoutParsingFinishedEvent.builder()
.identifier(layoutParsingRequest.identifier())
.numberOfPages(documentGraph.getNumberOfPages())
.numberOfPages(documentWithVisualization.document().getNumberOfPages())
.duration(System.currentTimeMillis() - start)
.message(format("""
Layout parsing has finished in %.02f s.
@ -170,25 +192,26 @@ public class LayoutParsingPipeline {
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()),
buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.layoutParserVersion(layoutParserVersion)
.build();
}
/**
 * Builds the document graph from the classification result inside a Micrometer observation
 * (name "LayoutParsingPipeline", contextual name "build-document-graph") and returns whatever
 * {@code DocumentGraphFactory.buildDocumentGraph} produced.
 *
 * @param classificationDocument the classified document handed to the graph factory
 * @return the value the factory stored in the reference holder during the observed call
 */
private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) {
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {
// The observed lambda cannot assign a local variable, so the result is passed out through a reference holder.
AtomicReference<Document> documentReference = new AtomicReference<>();
AtomicReference<DocumentWithVisualization> documentReference = new AtomicReference<>();
Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
.contextualName("build-document-graph")
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)));
.observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument)));
return documentReference.get();
}
@ -196,15 +219,15 @@ public class LayoutParsingPipeline {
/**
 * Formats a one-line summary of the parse result: the page count followed by the number of
 * sections, headlines, paragraphs, tables, table cells, headers and footers.
 *
 * @param numberOfPages      page count printed first in the message
 * @param semanticNodeCounts count per node type; a type without an entry is reported as 0
 * @return the formatted summary text
 */
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
// Every lookup is null-checked because a node type that never occurred has no entry in the map.
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
}
@ -215,16 +238,25 @@ public class LayoutParsingPipeline {
ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse,
VisualLayoutParsingResponse visualLayoutParsingResponse,
String identifier) {
Map<String, String> identifier) {
PDDocument originDocument = openDocument(originFile);
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument();
if (settings.isDebug() || identifier.containsKey("debug")) {
classificationDocument.getLayoutDebugLayer().setActive(true);
}
List<ClassificationPage> classificationPages = new ArrayList<>();
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
long pageCount = originDocument.getNumberOfPages();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
@ -247,49 +279,73 @@ public class LayoutParsingPipeline {
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
stripper.setSortByPosition(true);
}
stripper.getText(originDocument);
List<Word> words = stripper.getWords();
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
words = TextPositionOperations.sortWords(lines);
}
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox();
int rotation = pdPage.getRotation();
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
List<Ruling> rulings = stripper.getRulings();
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
ImageType.GRAPHIC,
false,
stripper.getPageNumber(),
""))
.toList());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
// Whether an image is OCR must be determined before text blocks are moved into tables; otherwise the findOcr algorithm would need to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
if (pdfImages.containsKey(pageNumber)) {
classificationPage.setImages(pdfImages.get(pageNumber));
imageServiceResponseAdapter.findOcr(classificationPage);
}
if (signatures.containsKey(pageNumber)) {
if (classificationPage.getImages() == null || classificationPage.getImages().size() == 0) {
if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
classificationPage.setImages(signatures.get(pageNumber));
} else {
classificationPage.getImages().addAll(signatures.get(pageNumber));
}
}
tableExtractionService.extractTables(cleanRulings, classificationPage);
tableExtractionService.extractTables(emptyTableCells, classificationPage);
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);
@ -299,22 +355,67 @@ public class LayoutParsingPipeline {
originDocument.close();
log.info("Calculating BodyTextFrame for {}", identifier);
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
}
classificationService.classify(classificationDocument, layoutParsingType, identifier);
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
classificationDocument.setSectionTree(sectionTree);
log.info("Building Sections for {}", identifier);
sectionsBuilderService.buildSections(classificationDocument);
sectionsBuilderService.addImagesToSections(classificationDocument);
switch (layoutParsingType) {
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
}
return classificationDocument;
}
/**
 * Copies the page-level properties onto the given classification page: its number, the clean
 * rulings, the rotation reported by the PDF page, the derived landscape flag and the page
 * dimensions taken from the page information.
 *
 * @param pdPage             PDF page the rotation is read from
 * @param pdr                rectangle whose width and height decide the orientation
 * @param classificationPage page object that receives all values
 * @param cleanRulings       rulings to attach to the page
 * @param pageNumber         page number to store on the page
 * @param pageInformation    source of the page width and height
 */
private static void updateClassificationPage(PDPage pdPage,
                                             PDRectangle pdr,
                                             ClassificationPage classificationPage,
                                             CleanRulings cleanRulings,
                                             int pageNumber,
                                             PageInformation pageInformation) {

    int rotation = pdPage.getRotation();

    // A page counts as landscape when it is wider than tall and not turned sideways,
    // or taller than wide and turned by a quarter or three quarters.
    boolean uprightRotation = rotation == 0 || rotation == 180;
    boolean sidewaysRotation = rotation == 90 || rotation == 270;
    boolean widerThanTall = pdr.getWidth() > pdr.getHeight();
    boolean tallerThanWide = pdr.getHeight() > pdr.getWidth();
    boolean landscape = (widerThanTall && uprightRotation) || (tallerThanWide && sidewaysRotation);

    classificationPage.setCleanRulings(cleanRulings);
    classificationPage.setRotation(rotation);
    classificationPage.setLandscape(landscape);
    classificationPage.setPageNumber(pageNumber);

    float pageWidth = (float) pageInformation.width();
    float pageHeight = (float) pageInformation.height();
    classificationPage.setPageWidth(pageWidth);
    classificationPage.setPageHeight(pageHeight);
}
/**
 * For every text direction, averages the exact direction of all characters that have that
 * direction and, if the average is non-zero, transforms every word of that direction with a
 * rotation by that average around the centre of the page's media box.
 * <p>
 * In the visible part of this file the only call site is commented out.
 *
 * @param words  words of one page; matching words are transformed in place
 * @param pdPage page whose media box defines the rotation anchor
 */
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {

    for (TextDirection dir : TextDirection.values()) {
        double averageRotation = words.stream()
                .flatMap(word -> word.getCharacters().stream())
                .map(Character::getTextPosition)
                .filter(pos -> pos.getDir().equals(dir))
                .mapToDouble(RedTextPosition::getExactDir)
                .average()
                .orElse(0);

        // A direction without characters also yields 0 here and is left untouched.
        if (averageRotation != 0) {
            PDRectangle mediaBox = pdPage.getMediaBox();
            double anchorX = mediaBox.getWidth() / 2;
            double anchorY = mediaBox.getHeight() / 2;
            // NOTE(review): getRotateInstance expects radians - confirm the unit of getExactDir.
            AffineTransform rotation = AffineTransform.getRotateInstance(averageRotation, anchorX, anchorY);

            for (Word word : words) {
                if (dir.equals(word.getDir())) {
                    word.transform(rotation);
                }
            }
        }
    }
}
private void addNumberOfPagesToTrace(int numberOfPages, long size) {
if (observationRegistry.getCurrentObservation() != null) {
@ -356,10 +457,10 @@ public class LayoutParsingPipeline {
// Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
if (((TextPageBlock) textBlock).getSequences() == null) {
if (((TextPageBlock) textBlock).getWords() == null) {
continue;
}
for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) {
for (Word word : ((TextPageBlock) textBlock).getWords()) {
classificationPage.getTextHeightCounter().add(word.getTextHeight());
classificationPage.getFontCounter().add(word.getFont());
classificationPage.getFontSizeCounter().add(word.getFontSize());

View File

@ -5,7 +5,7 @@ import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
@ -13,12 +13,11 @@ import io.micrometer.observation.ObservationRegistry;
@ComponentScan
public class LayoutParsingServiceProcessorConfiguration {
@Bean
@Autowired
public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
return new ViewerDocumentService(registry);
return new PDFTronViewerDocumentService(registry);
}
}

View File

@ -1,26 +1,31 @@
package com.knecon.fforesight.service.layoutparser.processor;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.springframework.core.task.TaskExecutor;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
import com.knecon.fforesight.tenantcommons.TenantContext;
import io.micrometer.observation.annotation.Observed;
@ -36,6 +41,9 @@ public class LayoutParsingStorageService {
private final StorageService storageService;
private final ObjectMapper objectMapper;
private final TaskExecutor taskExecutor;
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
public File getOriginFile(String storageId) throws IOException {
@ -53,11 +61,18 @@ public class LayoutParsingStorageService {
}
File tempFile = createTempFile("viewerDocument", ".pdf");
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
assert tempFile.delete();
return Optional.empty();
}
return Optional.of(tempFile);
}
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
@SneakyThrows
public ImageServiceResponse getImagesFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {
@ -68,7 +83,8 @@ public class LayoutParsingStorageService {
}
public TableServiceResponse getTablesFile(String storageId) throws IOException {
@SneakyThrows
public TableServiceResponse getTablesFile(String storageId) {
try (var tableClassificationStream = getObject(storageId)) {
@ -78,22 +94,45 @@ public class LayoutParsingStorageService {
}
}
/**
 * Opens the stream returned by {@code getObject(storageId)} and deserializes its content into
 * a {@link VisualLayoutParsingResponse} with the injected object mapper. The stream is closed
 * when the method returns.
 *
 * @param storageId id passed to {@code getObject} to locate the stored response
 * @return the deserialized visual layout parsing response
 */
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) throws IOException {
@SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {
VisualLayoutParsingResponse visualLayoutParsingResponse = objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
return visualLayoutParsingResponse;
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
}
}
/**
 * Stores the four parts of the document data (structure, text data, positions and pages)
 * under the storage ids carried by the layout parsing request.
 *
 * @param layoutParsingRequest request providing the target storage id for each part
 * @param documentData         document data whose parts are stored
 */
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getDocumentTextData());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getDocumentPositions());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getDocumentPages());
// Each part is uploaded as its own task on taskExecutor; the join() at the end waits for all four.
// NOTE(review): TenantContext.getTenantId() is evaluated inside the runnables, i.e. on the executor
// thread rather than the calling thread - confirm the tenant context is propagated to those threads.
Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.structureFileStorageId(),
documentData.getDocumentStructure());
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.textBlockFileStorageId(),
documentData.getDocumentTextData());
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
documentData.getDocumentPositionData());
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.pageFileStorageId(),
documentData.getDocumentPages());
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
// Blocks until every upload has completed; a failed upload surfaces here as an exception from join().
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
}
@ -154,4 +193,16 @@ public class LayoutParsingStorageService {
}
}
/**
 * Encodes the markdown text as UTF-8 and stores it as an object under the given storage id
 * for the tenant reported by {@code TenantContext}.
 *
 * @param markdownFileStorageId storage id the markdown content is stored under
 * @param markdownContent       markdown text to store
 */
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {

    byte[] markdownBytes = markdownContent.getBytes(StandardCharsets.UTF_8);
    try (InputStream markdownStream = new ByteArrayInputStream(markdownBytes)) {
        storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, markdownStream);
    }
}
}

View File

@ -0,0 +1,98 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.RequiredArgsConstructor;
@Service
@RequiredArgsConstructor
public class DocstrumSegmentationService {
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
private final ZoneBuilderService zoneBuilderService;
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
directionCounts.put(TextDirection.ZERO, newZones.size());
List<Zone> zones = new ArrayList<>(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
zones.addAll(newZones);
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
}
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
int total = directionCounts.values()
.stream()
.mapToInt(i -> i).sum();
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
}
return false;
}
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
List<Character> characters = textPositions.stream()
.filter(t -> t.getDir() == direction)
.map(Word::getCharacters)
.flatMap(List::stream)
.toList();
nearestNeighbourService.findNearestNeighbors(characters);
double characterSpacing = spacingService.computeCharacterSpacing(characters);
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
}
}

View File

@ -0,0 +1,31 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
/**
 * Half-open angular interval {@code [lowerAngle, upperAngle)} used to select neighbours by angle.
 * If the lower bound ends up larger than the upper bound, the interval is treated as wrapping around.
 */
public class AngleFilter {

    protected double lowerAngle;
    protected double upperAngle;


    /**
     * Creates the filter; a lower bound below -PI/2 is shifted up by PI and an upper bound of at
     * least PI/2 is shifted down by PI.
     *
     * @param lowerAngle inclusive lower bound in radians
     * @param upperAngle exclusive upper bound in radians
     */
    public AngleFilter(double lowerAngle, double upperAngle) {

        if (lowerAngle < -Math.PI / 2) {
            this.lowerAngle = lowerAngle + Math.PI;
        } else {
            this.lowerAngle = lowerAngle;
        }

        if (upperAngle >= Math.PI / 2) {
            this.upperAngle = upperAngle - Math.PI;
        } else {
            this.upperAngle = upperAngle;
        }
    }


    /**
     * @return {@code true} if the angle of the neighbor lies within this filter
     */
    public boolean matches(Neighbor neighbor) {

        return matches(neighbor.getAngle());
    }


    /**
     * @param angle angle in radians
     * @return {@code true} if the angle lies within the (possibly wrapping) interval
     */
    public boolean matches(double angle) {

        boolean atOrAboveLower = lowerAngle <= angle;
        boolean belowUpper = angle < upperAngle;

        // regular interval: both bounds must hold; wrapped interval: either bound suffices
        return lowerAngle <= upperAngle ? atOrAboveLower && belowUpper : atOrAboveLower || belowUpper;
    }

}

View File

@ -0,0 +1,279 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
/**
 * Base class for everything on a page that is described by a rectangle. Each instance carries the
 * rectangle twice: once in the Java coordinate system ({@link #bBox}) and once in the PDF
 * coordinate system ({@link #bBoxPdf}). Methods with a {@code Pdf} infix/suffix operate on
 * {@link #bBoxPdf}, all others on {@link #bBox}.
 */
@Data
@SuperBuilder
@NoArgsConstructor
public abstract class BoundingBox {

    // Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
    // should be used when determining reading order or other tasks which require coordinates in a harmonized system.
    protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.

    // PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
    // This rotates completely in 90 degree steps with page rotation.
    // Needs to be used when writing to a PDF.
    // Also, these are definitely correct and should be used whenever possible.
    protected Rectangle2D bBoxPdf;

    // Fraction of the average height two boxes must overlap vertically to be compared by x in ILL_DEFINED_ORDER.
    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;


    public double getX() {

        return bBox.getX();
    }


    public double getY() {

        return bBox.getY();
    }


    public double getMinX() {

        return bBox.getMinX();
    }


    public double getMinY() {

        return bBox.getMinY();
    }


    public double getPdfMinX() {

        return bBoxPdf.getMinX();
    }


    public double getPdfMaxX() {

        return bBoxPdf.getMaxX();
    }


    public double getPdfMinY() {

        return bBoxPdf.getMinY();
    }


    public double getPdfMaxY() {

        return bBoxPdf.getMaxY();
    }


    public double getWidth() {

        return bBox.getWidth();
    }


    public double getHeight() {

        return bBox.getHeight();
    }


    public double getMaxX() {

        return bBox.getMaxX();
    }


    public double getMaxY() {

        return bBox.getMaxY();
    }


    /**
     * @return area of the Java-space box (height times width)
     */
    public double getArea() {

        return (bBox.getHeight() * bBox.getWidth());
    }


    /**
     * @return {@code true} if the PDF-space box of {@code contained} lies completely inside this one
     */
    public boolean contains(BoundingBox contained) {

        return contains(contained, 0);
    }


    /**
     * Containment check in PDF space.
     *
     * @param contained box that should lie inside this box
     * @param tolerance amount by which {@code contained} may stick out on each side
     */
    public boolean contains(BoundingBox contained, double tolerance) {

        return getPdfMinX() <= contained.getPdfMinX() + tolerance
               && getPdfMinY() <= contained.getPdfMinY() + tolerance
               && getPdfMaxX() >= contained.getPdfMaxX() - tolerance
               && getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
    }


    /**
     * @return {@code true} if the Java-space boxes overlap or touch on both axes
     */
    public boolean intersects(BoundingBox other) {

        return this.intersectsX(other) && this.intersectsY(other);
    }


    /**
     * Like {@link #intersects(BoundingBox)}, but this box is enlarged by the thresholds on both sides of each axis.
     */
    public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {

        return this.intersectsX(other, xThreshold) && this.intersectsY(other, yThreshold);
    }


    public boolean intersectsX(BoundingBox other, float threshold) {

        return this.getX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getX();
    }


    /**
     * @return {@code true} if the PDF-space boxes overlap or touch on both axes
     */
    public boolean intersectsPdf(BoundingBox other) {

        return this.intersectsXPdf(other) && this.intersectsYPdf(other);
    }


    /**
     * Like {@link #intersectsPdf(BoundingBox)}, but this box is enlarged by the thresholds on both sides of each axis.
     */
    public boolean intersectsPdf(BoundingBox other, float yThreshold, float xThreshold) {

        return this.intersectsXPdf(other, xThreshold) && this.intersectsYPdf(other, yThreshold);
    }


    public boolean intersectsYPdf(BoundingBox other) {

        return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
    }


    public boolean intersectsY(BoundingBox other) {

        return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
    }


    public boolean intersectsY(BoundingBox other, float threshold) {

        return this.getY() - threshold <= other.getMaxY() && this.getMaxY() + threshold >= other.getY();
    }


    public boolean intersectsYPdf(BoundingBox other, float threshold) {

        return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
    }


    public boolean intersectsXPdf(BoundingBox other) {

        return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
    }


    public boolean intersectsX(BoundingBox other) {

        return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
    }


    /**
     * Horizontal overlap check in PDF space with this box enlarged by {@code threshold} on both sides.
     */
    public boolean intersectsXPdf(BoundingBox other, float threshold) {

        // Both sides of the comparison must come from the PDF-space box; the Java-space maxX must not be mixed in here.
        return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getPdfMaxX() + threshold >= other.getPdfMinX();
    }


    /**
     * Replaces both boxes of this element with the boxes collected from the respective boxes of the components.
     *
     * @param components elements this element should enclose
     */
    public void setToBBoxOfComponents(List<? extends BoundingBox> components) {

        this.bBox = components.stream()
                .map(BoundingBox::getBBox)
                .collect(RectangleTransformations.collectBBox());
        this.bBoxPdf = components.stream()
                .map(BoundingBox::getBBoxPdf)
                .collect(RectangleTransformations.collectBBox());
    }


    /**
     * @return length of the shared y range of both PDF-space boxes, 0 if they do not overlap
     */
    public double verticalOverlap(BoundingBox other) {

        return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
    }


    /**
     * Orders boxes by PDF minX if they overlap vertically by more than
     * {@link #VERTICAL_COMPARISON_THRESHOLD} of their average height, otherwise by PDF maxY.
     * NOTE(review): as the name warns, this is presumably not a consistent total order - confirm before using it for sorting.
     */
    public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {

        if (o1.equals(o2)) {
            return 0;
        }
        if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
            return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
        } else {
            return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
        }
    };


    /**
     * @return size of the horizontal gap between the Java-space boxes, 0 if their x ranges overlap or touch
     */
    public double horizontalDistance(BoundingBox other) {

        double rect1Right = getMaxX();
        double rect1Left = getMinX();
        double rect2Right = other.getMaxX();
        double rect2Left = other.getMinX();

        if (rect1Left > rect2Right || rect2Left > rect1Right) {
            return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
        } else {
            return 0;
        }
    }


    /**
     * @return size of the vertical gap between the Java-space boxes, 0 if their y ranges overlap or touch
     */
    public double verticalDistance(BoundingBox other) {

        double rect1Top = getMaxY();
        double rect1Bottom = getMinY();
        double rect2Top = other.getMaxY();
        double rect2Bottom = other.getMinY();

        if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
            return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
        } else {
            return 0;
        }
    }


    /**
     * @return {@code true} if the boxes share a y range and this box starts at or after the right edge of {@code other}
     */
    public boolean rightOf(BoundingBox other) {

        return this.intersectsY(other) && other.getMaxX() <= this.getMinX();
    }


    /**
     * @return {@code true} if the boxes share a y range and this box ends at or before the left edge of {@code other}
     */
    public boolean leftOf(BoundingBox other) {

        return this.intersectsY(other) && other.getMinX() >= this.getMaxX();
    }


    /**
     * @return {@code true} if the boxes share an x range and this box ends at or before the minY of {@code other} (Java space)
     */
    public boolean isAbove(BoundingBox other) {

        return this.intersectsX(other) && other.getMinY() >= this.getMaxY();
    }


    /**
     * @return {@code true} if the boxes share an x range and this box starts at or after the maxY of {@code other} (Java space)
     */
    public boolean isBelow(BoundingBox other) {

        return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
    }

}

View File

@ -0,0 +1,86 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
 * A single glyph of the page, represented by the center of its direction-adjusted bounding box.
 * Equality and hash code are based on the center coordinates only.
 */
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class Character {

    @EqualsAndHashCode.Include
    private final double x;

    @EqualsAndHashCode.Include
    private final double y;

    private final RedTextPosition textPosition;

    @Setter
    private List<Neighbor> neighbors = new ArrayList<>();


    /**
     * @param chunk text position whose direction-adjusted box center becomes the position of this character
     */
    public Character(RedTextPosition chunk) {

        this.textPosition = chunk;
        this.x = chunk.getBBoxDirAdj().getCenterX();
        this.y = chunk.getBBoxDirAdj().getCenterY();
    }


    /**
     * @return direction-adjusted height of the underlying text position
     */
    public double getHeight() {

        return textPosition.getHeightDirAdj();
    }


    /**
     * @return Euclidean distance between the centers of both characters
     */
    public double distance(Character character) {

        double deltaX = getX() - character.getX();
        double deltaY = getY() - character.getY();
        return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
    }


    /**
     * @return absolute difference of the x coordinates
     */
    public double horizontalDistance(Character character) {

        return Math.abs(getX() - character.getX());
    }


    /**
     * @return absolute difference of the y coordinates
     */
    public double verticalDistance(Character character) {

        return Math.abs(getY() - character.getY());
    }


    /**
     * Projects the extents of both characters onto an axis and measures the two middle values of
     * the four projected coordinates against each other. The projection angle is fixed at zero.
     *
     * @return the distance between the two middle coordinates, positive if the projected ranges overlap, negative otherwise
     */
    public double overlappingDistance(Character other) {

        double sine = Math.sin(-0);
        double cosine = Math.cos(-0);

        double ownStart = cosine * x - sine * y;
        double ownEnd = cosine * (x + textPosition.getWidthDirAdj()) - sine * (y + textPosition.getHeightDirAdj());
        double otherStart = cosine * other.x - sine * other.y;
        double otherEnd = cosine * (other.x + other.textPosition.getWidthDirAdj()) - sine * (other.y + other.textPosition.getHeightDirAdj());

        // must be decided before sorting, afterwards the coordinates lose their owner
        boolean overlapping = ownEnd >= otherStart && otherEnd >= ownStart;

        double[] sorted = {ownStart, ownEnd, otherStart, otherEnd};
        Arrays.sort(sorted);

        double middleSpan = Math.abs(sorted[2] - sorted[1]);
        return overlapping ? middleSpan : -middleSpan;
    }


    /**
     * Angle of the connection between both characters, always measured from the character with
     * the smaller (or equal) x towards the one with the larger x.
     */
    public double angle(Character character) {

        boolean thisIsFurtherRight = getX() > character.getX();
        Character to = thisIsFurtherRight ? this : character;
        Character from = thisIsFurtherRight ? character : this;

        return FastAtan2.fastAtan2(to.getY() - from.getY(), to.getX() - from.getX());
    }

}

View File

@ -0,0 +1,324 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
/**
 * Detects column dividers on a page from a weighted histogram over the x-axis.
 * Boxes are added with {@link #add(BoundingBox)}; the histogram may then be smoothed and its
 * derivative analysed with {@link #determineColumnsWithDerivative(double[])}.
 * <p>
 * WIP, mostly working, needs to be tested a bit more
 */
public class ColumnDetector {

    public static final double MAX_VALUE_THRESHOLD = 0.5;
    final static int bins_num = 512;
    // Search range for extrema of the derivative. Currently the full range of bins is searched.
    final static int globalStartIdx = 0;
    final static int globalEndIdx = bins_num;
    public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
    public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
    public static final double NEAR_GLOBAL_THRESHOLD = 0.5;

    double minY;
    double maxY;
    // NOTE(review): despite its name this holds maxY - minY; it is not read inside this class - confirm intent.
    double midY;
    double[] histogram;
    double min;
    double max;
    double resolution;
    // NOTE(review): accumulates the updated bin values in add(), not the weights; not read inside this class - confirm intent.
    double sum;
    int N;


    /**
     * @param min  lower bound of the histogram on the x-axis
     * @param max  upper bound of the histogram on the x-axis
     * @param minY lower bound of the page on the y-axis, used for weighting
     * @param maxY upper bound of the page on the y-axis, used for weighting
     */
    public ColumnDetector(double min, double max, double minY, double maxY) {

        this.min = min;
        this.max = max;
        this.minY = minY;
        this.maxY = maxY;
        this.midY = maxY - minY;
        this.resolution = (max - min) / bins_num;
        this.histogram = new double[bins_num];
    }


    /**
     * Adds the weight of the zone to all bins covered by its x range. Parts of the zone outside
     * of [min, max] are ignored instead of causing an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param zone box to add to the histogram
     */
    public void add(BoundingBox zone) {

        N++;
        double weight = computeWeight(zone);
        // clamp to the valid bin range, a zone may stick out of the range the detector was created for
        int start = Math.max(0, (int) ((zone.getMinX() - min) / resolution));
        int end = Math.min(bins_num, (int) ((zone.getMaxX() - min) / resolution));
        for (int i = start; i < end; i++) {
            histogram[i] += weight;
            sum += histogram[i];
        }
    }


    /**
     * Weight of a zone: its height, reduced the further its center is away from the vertical middle.
     */
    private double computeWeight(BoundingBox zone) {

        double areaWeight = zone.getBBox().getHeight();
        double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());

        double distanceWeight;
        if (relativeDistance < 0.6) {
            distanceWeight = 1;
        } else if (relativeDistance < 0.8) {
            distanceWeight = 0.8;
        } else {
            distanceWeight = 0.1;
        }

        return areaWeight * distanceWeight;
    }


    /**
     * @return distance of y to the middle of [minY, maxY], relative to half of that range
     */
    private double relativeDistanceToMiddle(double y) {

        double range = (maxY - minY) / 2;
        double mid = minY + range;

        return Math.abs(y - mid) / range;
    }


    /**
     * Numerical derivative of the histogram: central differences inside, one-sided differences at both ends.
     *
     * @return array of the same length as the histogram
     */
    public double[] computeDerivative() {

        int length = histogram.length;
        double[] derivative = new double[length];

        for (int i = 0; i < length; i++) {
            if (i == 0) {
                derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
            } else if (i == length - 1) {
                derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
            } else {
                derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
            }
        }

        return derivative;
    }


    /**
     * @param arr   values
     * @param start first index, inclusive
     * @param end   last index, exclusive
     * @return mean of the values in [start, end), 0 for an empty range
     */
    public double calcMean(double[] arr, int start, int end) {

        if (start == end) {
            return 0;
        }

        double sum = 0;
        for (int i = start; i < end; i++) {
            sum += arr[i];
        }

        return sum / (end - start);
    }


    /*
    Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values.
    For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider.
    Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative.
     */


    /**
     * @param derivative derivative of the histogram, see {@link #computeDerivative()}
     * @return x coordinates of the detected column dividers in ascending order; empty if the
     * derivative varies too little compared to the mean of the histogram
     */
    public List<Double> determineColumnsWithDerivative(double[] derivative) {

        assert derivative.length == histogram.length;

        Set<Integer> columnIndices = new HashSet<>();
        double mean = calcMean(histogram, 0, histogram.length);
        double maxDvValue = calcMax(derivative);
        double minDvValue = calcMin(derivative);

        if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
            // histogram is too flat to contain a column gap; the empty list must actually be returned here
            return Collections.emptyList();
        }

        Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);

        List<Integer> columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean);
        columnIndices.addAll(columnsRightOfMinima);

        List<Integer> columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean);
        columnIndices.addAll(columnsLeftOfMaxima);

        return columnIndices.stream()
                .sorted(Comparator.naturalOrder())
                .map(this::calculateXCoordinateFromIdx)
                .toList();
    }


    /**
     * For every maximum of the derivative, walks to the left looking for a run of indices whose
     * derivative is below {@link #DERIVATIVE_ZERO_THRESHOLD}. The middle of that run is reported
     * if the histogram there is below {@link #MINIMUM_THRESHOLD_FOR_COLUMNS} times the mean.
     */
    private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {

        List<Integer> columnsLeftOfMaxima = new ArrayList<>();
        for (int i = 0; i < derivativeMaxima.size(); i++) {
            List<Integer> consecutiveZeroes = new ArrayList<>();
            boolean maximumFound = false;
            int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
            int endIdx = (int) Math.max(globalStartIdx,
                                        Math.min(maximaIdx - 1,
                                                 maximaIdx - 0.1 * bins_num)); // search through 10% of array to the left, but at least one step and at most to the left edge;
            for (int j = maximaIdx; j >= endIdx; j--) {
                if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
                    maximumFound = true;
                    consecutiveZeroes.add(j);
                } else if (maximumFound) {
                    break;
                }
            }
            if (maximumFound) {
                int midIdx = consecutiveZeroes.size() / 2;
                int middleMinimumIdx = consecutiveZeroes.get(midIdx);
                if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
                    columnsLeftOfMaxima.add(middleMinimumIdx);
                }
            }
        }
        return columnsLeftOfMaxima;
    }


    /**
     * For every minimum of the derivative, walks to the right looking for a run of indices whose
     * derivative is below {@link #DERIVATIVE_ZERO_THRESHOLD}. The middle of that run is reported
     * if the histogram there is below {@link #MINIMUM_THRESHOLD_FOR_COLUMNS} times the mean.
     */
    private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {

        List<Integer> columnIndices = new ArrayList<>();
        for (int i = 0; i < derivativeMinima.size(); i++) {
            List<Integer> consecutiveZeroes = new ArrayList<>();
            boolean minimumFound = false;
            int minimaIdx = derivativeMinima.get(i) + 1; // the lowest derivative will always be at least one step earlier than the lowest value.
            int endIdx = (int) Math.min(globalEndIdx,
                                        Math.max(minimaIdx + 1,
                                                 minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge;
            for (int j = minimaIdx; j < endIdx; j++) {
                if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
                    minimumFound = true;
                    consecutiveZeroes.add(j);
                } else if (minimumFound) {
                    break;
                }
            }
            if (minimumFound) {
                int midIdx = consecutiveZeroes.size() / 2;
                int middleMinimumIdx = consecutiveZeroes.get(midIdx);
                if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
                    columnIndices.add(middleMinimumIdx);
                }
            }
        }
        return columnIndices;
    }


    private double calcMax(double[] array) {

        double max = Double.NEGATIVE_INFINITY;
        for (int i = 0; i < array.length; i++) {
            if (array[i] > max) {
                max = array[i];
            }
        }
        return max;
    }


    private double calcMin(double[] array) {

        double min = Double.POSITIVE_INFINITY;
        for (int i = 0; i < array.length; i++) {
            if (array[i] < min) {
                min = array[i];
            }
        }
        return min;
    }


    /**
     * Collects the indices whose derivative reaches at least {@link #NEAR_GLOBAL_THRESHOLD} times
     * the global maximum (or minimum) and keeps only the first index of every consecutive run.
     */
    private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {

        List<Integer> nearGlobalDvMaximaIdx = new ArrayList<>();
        List<Integer> nearGlobalDvMinimaIdx = new ArrayList<>();
        for (int i = globalStartIdx; i < globalEndIdx; i++) {
            if (derivative[i] <= minDvValue * NEAR_GLOBAL_THRESHOLD) {
                nearGlobalDvMinimaIdx.add(i);
            }
            if (derivative[i] >= maxDvValue * NEAR_GLOBAL_THRESHOLD) {
                nearGlobalDvMaximaIdx.add(i);
            }
        }

        nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx);
        nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx);

        return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx);
    }


    private record Extrema(List<Integer> maxima, List<Integer> minima) {

    }


    /**
     * @return x coordinate of the right edge of the bin with the given index
     */
    private Double calculateXCoordinateFromIdx(int globalMinIdx) {

        return min + ((globalMinIdx + 1) * resolution);
    }


    /**
     * Keeps only those numbers that do not directly follow their predecessor (predecessor + 1),
     * i.e. the first number of every run of consecutive integers.
     *
     * @param numbers numbers to filter, may be null
     * @return new list, empty for null or empty input
     */
    public static List<Integer> removeConsecutive(List<Integer> numbers) {

        List<Integer> result = new ArrayList<>();
        if (numbers == null || numbers.isEmpty()) {
            return result;
        }

        result.add(numbers.get(0)); // Add the first number

        for (int i = 1; i < numbers.size(); i++) {
            if (numbers.get(i) != numbers.get(i - 1) + 1) {
                result.add(numbers.get(i)); // Add non-consecutive numbers
            }
        }

        return result;
    }


    /**
     * Replaces the histogram with its convolution with the given kernel; values outside of the
     * histogram are treated as absent.
     */
    public void kernelSmooth(double[] kernel) {

        double[] newFrequencies = new double[histogram.length];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < kernel.length; i++) {
            int jStart = Math.max(0, i - shift);
            int jEnd = Math.min(histogram.length, histogram.length + i - shift);
            for (int j = jStart; j < jEnd; j++) {
                newFrequencies[j - i + shift] += kernel[i] * histogram[j];
            }
        }
        histogram = newFrequencies;
    }


    /**
     * @param length       desired kernel length in bins; the kernel has {@code 2 * (length / 2) + 1} entries
     * @param stdDeviation standard deviation in bins
     * @return gaussian kernel normalized to a sum of 1
     */
    public double[] createGaussianKernel(int length, double stdDeviation) {

        int r = length / 2;

        int size = 2 * r + 1;
        double[] kernel = new double[size];
        double sum = 0;
        double b = 2 * (stdDeviation) * (stdDeviation);
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
            sum += kernel[i];
        }
        for (int i = 0; i < size; i++) {
            kernel[i] /= sum;
        }
        return kernel;
    }

}

View File

@ -0,0 +1,90 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
/**
 * Histogram with equally sized bins over a value range, with support for kernel smoothing and
 * peak detection. The range is padded by a small epsilon on both sides so that the minimum and
 * maximum value fall inside the first and last bin.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;

    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * @param minValue   smallest value that will be added
     * @param maxValue   largest value that will be added
     * @param resolution requested bin width; the actual width is adjusted so that a whole number
     *                   of bins (at least one) covers the padded range
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        int binCount = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        double paddedRange = maxValue - minValue + 2 * EPSILON;

        this.min = minValue - EPSILON;
        this.resolution = paddedRange / binCount;
        this.frequencies = new double[binCount];
    }


    /**
     * Replaces the frequencies with their convolution with the given kernel. Bins outside of the
     * histogram do not contribute.
     *
     * @param kernel weights, centered at index {@code (kernel.length - 1) / 2}
     */
    public void kernelSmooth(double[] kernel) {

        int binCount = frequencies.length;
        int shift = (kernel.length - 1) / 2;
        double[] smoothed = new double[binCount];

        for (int target = 0; target < binCount; target++) {
            for (int k = 0; k < kernel.length; k++) {
                int source = target + k - shift;
                if (source >= 0 && source < binCount) {
                    smoothed[target] += kernel[k] * frequencies[source];
                }
            }
        }

        frequencies = smoothed;
    }


    /**
     * @param length       window length in value units, converted to bins via the resolution
     * @param stdDeviation standard deviation in value units
     * @return gaussian kernel with an odd number of entries, normalized to a sum of 1
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int radius = (int) Math.round(length / resolution) / 2;
        int size = 2 * radius + 1;

        double b = 2 * (stdDeviation / resolution) * (stdDeviation / resolution);
        double a = 1 / Math.sqrt(Math.PI * b);

        double[] kernel = new double[size];
        double total = 0;
        for (int i = 0; i < size; i++) {
            int offset = i - radius;
            kernel[i] = a * Math.exp(-offset * offset / b);
            total += kernel[i];
        }

        for (int i = 0; i < size; i++) {
            kernel[i] /= total;
        }
        return kernel;
    }


    /**
     * Smooths the histogram with a gaussian kernel, see {@link #createGaussianKernel(double, double)}.
     */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        double[] gaussian = createGaussianKernel(windowLength, stdDeviation);
        kernelSmooth(gaussian);
    }


    /**
     * Increments the bin the value falls into.
     */
    public void add(double value) {

        int bin = (int) ((value - min) / resolution);
        frequencies[bin] += 1.0;
    }


    /**
     * @return number of bins
     */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Finds the first bin with the highest frequency and extends it to the right over all
     * directly following bins of (nearly) the same frequency.
     *
     * @return the value in the middle of that plateau
     */
    public double getPeakValue() {

        int peakIndex = 0;
        double peakFrequency = frequencies[0];
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > peakFrequency) {
                peakIndex = i;
                peakFrequency = frequencies[i];
            }
        }

        final double plateauTolerance = 0.0001;
        int peakEndIndex = peakIndex + 1;
        while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - peakFrequency) < plateauTolerance) {
            peakEndIndex++;
        }

        return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
    }

}

View File

@ -0,0 +1,194 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.BOLD_ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.ITALIC;
import static com.knecon.fforesight.service.layoutparser.processor.model.text.Word.STANDARD;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.model.text.FontStyle;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class Line extends TextBoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.17;
@EqualsAndHashCode.Include
private final double x0;
@EqualsAndHashCode.Include
private final double y0;
@EqualsAndHashCode.Include
private final double x1;
@EqualsAndHashCode.Include
private final double y1;
private FontStyle fontStyle;
private final List<Word> words;
public Line(List<Character> characters, double wordSpacing) {
if (characters.size() >= 2) {
// linear regression
double sx = 0.0;
double sxx = 0.0;
double sxy = 0.0;
double sy = 0.0;
for (Character character : characters) {
sx += character.getX();
sxx += character.getX() * character.getX();
sxy += character.getX() * character.getY();
sy += character.getY();
}
double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx);
double a = (sy - b * sx) / characters.size();
this.x0 = characters.get(0).getX();
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else {
Character character = characters.get(0);
double dx = character.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
this.x0 = character.getX() - dx;
this.x1 = character.getX() + dx;
this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy;
}
this.words = new ArrayList<>();
computeWords(characters, wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBBox();
computeFontStyle();
}
public Line(List<Word> words) {
this.words = words;
buildBBox();
x0 = getMinX();
y0 = getMinY();
x1 = getMaxX();
y1 = getMaxY();
computeFontStyle();
}
private void computeFontStyle() {
EnumMap<FontStyle, AtomicInteger> fontStyleCounter = new EnumMap<>(FontStyle.class);
for (FontStyle fontStyle : FontStyle.values()) {
fontStyleCounter.put(fontStyle, new AtomicInteger(0));
}
for (Word word : words) {
switch (word.getFontStyle()) {
case STANDARD -> fontStyleCounter.get(FontStyle.REGULAR).getAndIncrement();
case BOLD -> fontStyleCounter.get(FontStyle.BOLD).getAndIncrement();
case ITALIC -> fontStyleCounter.get(FontStyle.ITALIC).getAndIncrement();
case BOLD_ITALIC -> fontStyleCounter.get(FontStyle.BOLD_ITALIC).getAndIncrement();
}
}
fontStyle = fontStyleCounter.entrySet()
.stream()
.max(Comparator.comparing(entry -> entry.getValue().get()))
.map(Map.Entry::getKey).orElse(FontStyle.REGULAR);
}
public double getAngle() {
return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
}
public double getLength() {
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
}
public double angularDifference(Line j) {
double diff = Math.abs(getAngle() - j.getAngle());
if (diff <= Math.PI / 2) {
return diff;
} else {
return Math.PI - diff;
}
}
public double horizontalDistance(Line other) {
double[] xs = new double[4];
xs[0] = x0;
xs[1] = x1;
xs[2] = other.x0;
xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
}
public double verticalDistance(Line other) {
double ym = (y0 + y1) / 2;
double yn = (other.y0 + other.y1) / 2;
return Math.abs(ym - yn);
}
private void computeWords(List<Character> characters, double wordSpacing) {
// Imo, the width of space should be scaled with the font size, but it only depends on the median distance between horizontal neighbours.
// If there are large differences in fontsize on a page, this might lead to missing spaces for the smaller fonts and too many for larger fonts.
// I've just now changed the scaling factor. If you come across this comment with missing whitespaces again, try scaling the fontsize instead of simply changing the factor again.
Word word = new Word();
Character previous = null;
for (Character current : characters) {
if (previous != null) {
double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj();
if (dist > wordSpacing) {
words.add(word);
word = new Word();
}
}
word.add(current);
previous = current;
}
words.add(word);
}
private void buildBBox() {
this.setToBBoxOfComponents(words);
}
public String toString() {
StringBuilder sb = new StringBuilder();
words.forEach(word -> sb.append(word.toString()).append(" "));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,43 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import lombok.Getter;
/**
 * Relation between a character and one of its neighbours: stores the neighbouring character,
 * the character it was found for and the distance between them.
 */
public class Neighbor {

    @Getter
    private final double distance;

    // Cache for getAngle(); null until the angle is requested for the first time.
    private Double angle;

    private final Character originCharacter;

    @Getter
    private final Character character;


    /**
     * @param neighbor the neighbouring character
     * @param origin   the character this neighbour belongs to
     */
    public Neighbor(Character neighbor, Character origin) {

        this.distance = neighbor.distance(origin);
        this.character = neighbor;
        this.originCharacter = origin;
    }


    /**
     * @return horizontal distance between neighbour and origin
     */
    public double getHorizontalDistance() {

        return character.horizontalDistance(originCharacter);
    }


    /**
     * @return vertical distance between neighbour and origin
     */
    public double getVerticalDistance() {

        return character.verticalDistance(originCharacter);
    }


    /**
     * Returns the angle between neighbour and origin. The angle is computed on first access and
     * cached afterwards; both characters are final, so the value does not change.
     * NOTE(review): the cache is written without synchronization; concurrent callers would at
     * worst compute the same value twice.
     *
     * @return angle as computed by {@link Character#angle(Character)}
     */
    public double getAngle() {

        if (angle == null) {
            angle = this.character.angle(this.originCharacter);
        }
        return angle;
    }

}

View File

@ -0,0 +1,180 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.experimental.SuperBuilder;
/**
 * {@link BoundingBox} for text elements. In addition to the boxes of the base class it carries
 * a direction-adjusted box ({@link #bBoxDirAdj}) and the text direction. All methods with the
 * {@code DirAdj} suffix operate on the direction-adjusted box.
 */
@Getter
@Setter
@SuperBuilder
@NoArgsConstructor
@EqualsAndHashCode(callSuper = false)
public abstract class TextBoundingBox extends BoundingBox {

    protected Rectangle2D bBoxDirAdj;

    protected TextDirection dir;


    /**
     * Sets all boxes to enclose the given components. The direction-adjusted box and the text
     * direction are taken from those components that are {@link TextBoundingBox}es; without any
     * such component the direction defaults to {@link TextDirection#ZERO}.
     *
     * @param components elements this element should enclose
     * @throws IllegalArgumentException if the text components have different directions
     */
    @Override
    public void setToBBoxOfComponents(List<? extends BoundingBox> components) {

        super.setToBBoxOfComponents(components);
        this.bBoxDirAdj = components.stream()
                .filter(c -> c instanceof TextBoundingBox)
                .map(c -> (TextBoundingBox) c)
                .map(TextBoundingBox::getBBoxDirAdj)
                .collect(RectangleTransformations.collectBBox());

        Set<TextDirection> textDirections = components.stream()
                .filter(c -> c instanceof TextBoundingBox)
                .map(c -> (TextBoundingBox) c)
                .map(TextBoundingBox::getDir)
                .collect(Collectors.toSet());

        if (textDirections.isEmpty()) {
            dir = TextDirection.ZERO;
        } else if (textDirections.size() > 1) {
            throw new IllegalArgumentException("More than one text direction found");
        } else {
            dir = textDirections.iterator().next();
        }
    }


    public double getXDirAdj() {

        return this.bBoxDirAdj.getX();
    }


    public double getYDirAdj() {

        return this.bBoxDirAdj.getY();
    }


    public double getWidthDirAdj() {

        return this.bBoxDirAdj.getWidth();
    }


    public double getHeightDirAdj() {

        return this.bBoxDirAdj.getHeight();
    }


    public double getMaxXDirAdj() {

        return this.bBoxDirAdj.getMaxX();
    }


    public double getMaxYDirAdj() {

        return this.bBoxDirAdj.getMaxY();
    }


    public double getCenterYDirAdj() {

        return this.bBoxDirAdj.getCenterY();
    }


    public double getCenterXDirAdj() {

        return this.bBoxDirAdj.getCenterX();
    }


    /**
     * @return size of the horizontal gap between the direction-adjusted boxes, 0 if their x ranges overlap or touch
     */
    public double horizontalDistanceDirAdj(TextBoundingBox other) {

        double rect1Right = getMaxXDirAdj();
        double rect1Left = getXDirAdj();
        double rect2Right = other.getMaxXDirAdj();
        double rect2Left = other.getXDirAdj();

        if (rect1Left > rect2Right || rect2Left > rect1Right) {
            return Math.max(rect2Left - rect1Right, rect1Left - rect2Right);
        } else {
            return 0;
        }
    }


    /**
     * @return size of the vertical gap between the direction-adjusted boxes, 0 if their y ranges overlap or touch
     */
    public double verticalDistanceDirAdj(TextBoundingBox other) {

        double rect1Top = getMaxYDirAdj();
        double rect1Bottom = getYDirAdj();
        double rect2Top = other.getMaxYDirAdj();
        double rect2Bottom = other.getYDirAdj();

        if (rect1Bottom > rect2Top || rect2Bottom > rect1Top) {
            return Math.max(rect2Bottom - rect1Top, rect1Bottom - rect2Top);
        } else {
            return 0;
        }
    }


    /**
     * @return {@code true} if the direction-adjusted boxes overlap or touch on both axes
     */
    public boolean intersectsDirAdj(TextBoundingBox other) {

        return this.intersectsXDirAdj(other) && this.intersectsYDirAdj(other);
    }


    /**
     * Like {@link #intersectsDirAdj(TextBoundingBox)}, but this box is enlarged by the thresholds on both sides of each axis.
     */
    public boolean intersectsDirAdj(TextBoundingBox other, float yThreshold, float xThreshold) {

        return this.intersectsXDirAdj(other, xThreshold) && this.intersectsYDirAdj(other, yThreshold);
    }


    public boolean intersectsXDirAdj(TextBoundingBox other, float threshold) {

        return this.getXDirAdj() - threshold <= other.getMaxXDirAdj() && this.getMaxXDirAdj() + threshold >= other.getXDirAdj();
    }


    public boolean intersectsXDirAdj(TextBoundingBox other) {

        return this.getXDirAdj() <= other.getMaxXDirAdj() && this.getMaxXDirAdj() >= other.getXDirAdj();
    }


    public boolean intersectsYDirAdj(TextBoundingBox other) {

        return this.getYDirAdj() <= other.getMaxYDirAdj() && this.getMaxYDirAdj() >= other.getYDirAdj();
    }


    public boolean intersectsYDirAdj(TextBoundingBox other, float threshold) {

        return this.getYDirAdj() - threshold <= other.getMaxYDirAdj() && this.getMaxYDirAdj() + threshold >= other.getYDirAdj();
    }


    /**
     * @return {@code true} if {@code other} is below this box in direction-adjusted coordinates, see {@link #isBelowDirAdj(TextBoundingBox)}
     */
    public boolean isAboveDirAdj(TextBoundingBox other) {

        // must use the direction-adjusted variant, isBelow() would compare the non-adjusted boxes
        return other.isBelowDirAdj(this);
    }


    /**
     * @return {@code true} if the direction-adjusted boxes share an x range and this box starts at or after the maxY of {@code other}
     */
    public boolean isBelowDirAdj(TextBoundingBox other) {

        return this.intersectsXDirAdj(other) && this.getYDirAdj() >= other.getMaxYDirAdj();
    }

}

View File

@ -0,0 +1,37 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
/**
 * Thin extension of the JGraphT union-find that can list its elements and report the current
 * disjoint sets as collections.
 */
public class UnionFind<T> extends org.jgrapht.alg.util.UnionFind<T> {

    public UnionFind(Set<T> elements) {

        super(elements);
    }


    /**
     * Buckets every element under the representative returned by {@code find}.
     *
     * @return one set per representative; sets and their members follow the iteration order of the parent map
     */
    public Collection<Set<T>> getGroups() {

        Map<T, Set<T>> membersByRoot = new LinkedHashMap<>();
        for (T element : getParentMap().keySet()) {
            membersByRoot.computeIfAbsent(find(element), root -> new LinkedHashSet<>()).add(element);
        }
        return membersByRoot.values();
    }


    /** @return the key set of the parent map, i.e. all elements known to this structure */
    public Collection<T> getElements() {

        return getParentMap().keySet();
    }

}

View File

@ -0,0 +1,33 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
import lombok.EqualsAndHashCode;
@Data
@EqualsAndHashCode(callSuper = false)
public class Zone extends TextBoundingBox {
private List<Line> lines;
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
public Zone(List<Line> lines) {
this.lines = lines;
setToBBoxOfComponents(lines);
}
public String toString() {
StringBuilder sb = new StringBuilder();
lines.forEach(line -> sb.append(line.toString()).append("\n"));
return sb.toString().trim();
}
}

View File

@ -0,0 +1,58 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
/**
 * Clusters characters into lines by uniting each character with those of its precomputed
 * neighbours that are close enough and lie at roughly the same angle.
 */
@Service
public class LineBuilderService {

    private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
    private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
    private static final double ANGLE_TOLERANCE = Math.toRadians(5);


    /**
     * Builds lines from characters whose neighbours have already been attached.
     *
     * @param characters       all characters to cluster
     * @param characterSpacing scaled by {@code CHARACTER_SPACING_DISTANCE_MULTIPLIER} to get the horizontal reach
     * @param lineSpacing      scaled by {@code LINE_SPACING_THRESHOLD_MULTIPLIER} to get the vertical reach
     * @param rulings          rulings; a pair separated by a ruling is never united
     * @return one line per cluster, with the characters of each line ordered by x
     */
    public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {

        double horizontalReach = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
        double verticalReach = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;

        UnionFind<Character> clusters = new UnionFind<>(new HashSet<>(characters));
        AngleFilter sameLineAngles = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (var neighbor : character.getNeighbors()) {

                double relativeHorizontal = neighbor.getHorizontalDistance() / horizontalReach;
                double relativeVertical = neighbor.getVerticalDistance() / verticalReach;

                if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir()) {
                    continue;
                }
                if (!sameLineAngles.matches(neighbor)) {
                    continue;
                }
                // the neighbour has to lie inside the ellipse spanned by the two reaches
                if (Math.pow(relativeHorizontal, 2) + Math.pow(relativeVertical, 2) > 1) {
                    continue;
                }
                if (rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
                    continue;
                }

                clusters.union(character, neighbor.getCharacter());
            }
        }

        Comparator<Character> leftToRight = Comparator.comparingDouble(Character::getX);
        return clusters.getGroups()
                .stream()
                .map(cluster -> new Line(cluster.stream()
                                                 .sorted(leftToRight)
                                                 .toList(), characterSpacing))
                .toList();
    }

}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import org.tinspin.index.Index;
import org.tinspin.index.kdtree.KDTree;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
/**
 * Attaches to every character its nearest neighbours, looked up in a 2-d tree keyed by the
 * characters' x/y values.
 */
@Service
public class NearestNeighbourService {

    private static final int NUMBER_OF_NEIGHBOURS = 8;


    /**
     * Adds up to {@code NUMBER_OF_NEIGHBOURS} {@link Neighbor}s to the neighbour collection of each
     * character. A character is never registered as its own neighbour.
     *
     * @param characters the characters to index and to enrich; their neighbour collections are mutated
     */
    public void findNearestNeighbors(List<Character> characters) {

        KDTree<Character> kdTree = KDTree.create(2);
        characters.forEach(c -> kdTree.insert(new double[]{c.getX(), c.getY()}, c));

        for (Character c : characters) {
            // ask for one more than needed because the query point itself is part of the tree
            Index.PointIteratorKnn<Character> iterator = kdTree.queryKnn(new double[]{c.getX(), c.getY()}, NUMBER_OF_NEIGHBOURS + 1);

            int added = 0;
            while (iterator.hasNext() && added < NUMBER_OF_NEIGHBOURS) {
                Character candidate = iterator.next().value();
                // Skip by identity rather than by position in the result: when several characters share
                // the same coordinates the first hit is not guaranteed to be the query character.
                if (candidate == c) {
                    continue;
                }
                c.getNeighbors().add(new Neighbor(candidate, c));
                added++;
            }
        }
    }

}

View File

@ -0,0 +1,192 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
/**
 * Orders zones for reading. Depending on the flags and on how strongly zones overlap vertically,
 * either a plain top-to-bottom/left-to-right sort or a simple two-column strategy is used.
 */
@Service
public class ReadingOrderService {

    /** Coordinates closer than this are treated as equal by the comparators. */
    private static final double THRESHOLD = 5;
    /** Average number of zones per y-unit at or above which a page is handled as multi column. */
    public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;

    private static final Comparator<TextBoundingBox> COMPARATOR = //
            Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
                    .thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));

    private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
            Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
                    .thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));


    /**
     * Brings the zones into reading order.
     *
     * @param zones           the zones of one page; returned unchanged if there are fewer than two
     * @param xyReadingOrder  if true, always use the single column (y, then x) order
     * @param useDirAdjCoords if true, work on the direction-adjusted boxes instead of the regular ones
     * @return the ordered zones; for the single column order without direction-adjusted coordinates this is the input
     * list itself, sorted in place
     */
    public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {

        if (zones.size() <= 1) {
            return zones;
        }

        if (xyReadingOrder) {
            return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
        }

        // count, for every integer y, how many zones cover it
        Map<Long, Integer> histogram = new HashMap<>();
        for (Zone zone : zones) {
            Rectangle2D bbox = bboxOf(zone, useDirAdjCoords);
            long minY = Math.round(bbox.getMinY());
            long maxY = Math.round(bbox.getMaxY());
            for (long i = minY; i <= maxY; i++) {
                histogram.merge(i, 1, Integer::sum);
            }
        }

        double averageZonesPerY = histogram.values()
                .stream()
                .mapToInt(Integer::intValue).average()
                .orElse(1);

        if (averageZonesPerY < MULTI_COLUMN_DETECTION_THRESHOLD) {
            return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
        } else {
            return resolveMultiColumnReadingOrder(zones, useDirAdjCoords);
        }
    }


    /**
     * Sorts by y, then x. With direction-adjusted coordinates the zones are first grouped by text
     * direction and each group is sorted on its own (a new list is returned); otherwise the given
     * list is sorted in place, so it has to be modifiable.
     */
    private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {

        if (useDirAdjCoords) {
            return zones.stream()
                    .collect(Collectors.groupingBy(TextBoundingBox::getDir)).values()
                    .stream()
                    .flatMap(words -> words.stream()
                            .sorted(COMPARATOR_DIR_ADJ))
                    .toList();
        }

        zones.sort(COMPARATOR);
        return zones;
    }


    /**
     * Simple two-column strategy: zones entirely left of the page mid line come first, then zones
     * entirely right of it; zones touching the mid line are inserted in front of the first zone that
     * starts further down.
     */
    private List<Zone> resolveMultiColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {

        // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
        // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order

        double minX = Double.POSITIVE_INFINITY;
        double maxX = Double.NEGATIVE_INFINITY;

        for (Zone zone : zones) {
            Rectangle2D bbox = bboxOf(zone, useDirAdjCoords);
            // take the extent from the same box that was compared, so the flag is honoured
            if (bbox.getX() < minX) {
                minX = bbox.getX();
            }
            if (bbox.getMaxX() > maxX) {
                maxX = bbox.getMaxX();
            }
        }

        double midLineXCoordinate = (minX + maxX) / 2;

        List<Zone> leftOf = new ArrayList<>();
        List<Zone> rightOf = new ArrayList<>();
        List<Zone> middle = new ArrayList<>();
        for (Zone zone : zones) {
            Rectangle2D bbox = bboxOf(zone, useDirAdjCoords);
            if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
                leftOf.add(zone);
            } else if (bbox.getX() > midLineXCoordinate && bbox.getX() + bbox.getWidth() > midLineXCoordinate) {
                rightOf.add(zone);
            } else {
                middle.add(zone);
            }
        }

        Comparator<TextBoundingBox> comparator = useDirAdjCoords ? COMPARATOR_DIR_ADJ : COMPARATOR;
        leftOf.sort(comparator);
        rightOf.sort(comparator);
        middle.sort(comparator);

        List<Zone> sortedZones = new ArrayList<>();
        sortedZones.addAll(leftOf);
        sortedZones.addAll(rightOf);

        ListIterator<Zone> middleIterator = middle.listIterator();
        while (middleIterator.hasNext()) {
            Zone current = middleIterator.next();
            double currentY = bboxOf(current, useDirAdjCoords).getY();
            for (int i = 0; i < sortedZones.size(); i++) {
                Zone placed = sortedZones.get(i);
                double placedY = useDirAdjCoords ? placed.getYDirAdj() : placed.getY();
                if (currentY < placedY) {
                    sortedZones.add(i, current);
                    middleIterator.remove();
                    break;
                }
            }
        }

        // whatever could not be inserted starts below everything else
        sortedZones.addAll(middle);

        return sortedZones;
    }


    /** Picks the box to work on according to the coordinate flag. */
    private static Rectangle2D bboxOf(Zone zone, boolean useDirAdjCoords) {

        return useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
    }

}

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
/**
 * Estimates typical spacings from the neighbour distances of characters: the distances of all
 * neighbours within an angular window are put into a histogram, which is smoothed and whose peak
 * value is returned.
 */
@Service
public class SpacingService {

    private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5;
    private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5;
    private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5;
    private static final double ANGLE_TOLERANCE = Math.PI / 6;


    /** Spacing measured around angle 0. */
    public double computeCharacterSpacing(List<Character> characters) {

        return computeSpacing(characters, 0);
    }


    /** Spacing measured around angle pi/2. */
    public double computeLineSpacing(List<Character> characters) {

        return computeSpacing(characters, Math.PI / 2);
    }


    /**
     * @param characters characters with their neighbours already attached
     * @param angle      centre of the angular window; neighbours within {@code ANGLE_TOLERANCE} of it are counted
     * @return the peak value of the smoothed distance histogram
     */
    private double computeSpacing(List<Character> characters, double angle) {

        Histogram distances = new Histogram(0, largestNeighborDistance(characters), SPACING_HISTOGRAM_RESOLUTION);
        AngleFilter window = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);

        for (Character character : characters) {
            for (Neighbor neighbor : character.getNeighbors()) {
                if (!window.matches(neighbor)) {
                    continue;
                }
                distances.add(neighbor.getDistance());
            }
        }

        distances.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION);
        return distances.getPeakValue();
    }


    /** Largest distance over all neighbours of all characters; negative infinity if there is none. */
    private static double largestNeighborDistance(List<Character> characters) {

        double largest = Double.NEGATIVE_INFINITY;
        for (Character character : characters) {
            for (Neighbor neighbor : character.getNeighbors()) {
                largest = Math.max(largest, neighbor.getDistance());
            }
        }
        return largest;
    }

}

View File

@ -0,0 +1,126 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
/**
 * Groups lines into zones: every pair of lines that is close enough, has a similar angle and is
 * not separated by a ruling ends up in the same zone.
 */
@Service
public class ZoneBuilderService {

    private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5;
    private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2;

    private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -7;
    private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5;

    private static final double MIN_LINE_SIZE_SCALE = 0.9;
    private static final double MAX_LINE_SIZE_SCALE = 2.5;

    private static final double ANGLE_TOLERANCE = Math.toRadians(5);

    private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;


    /**
     * @param lines            the lines to group
     * @param characterSpacing basis of the regular horizontal limit
     * @param lineSpacing      basis of both vertical limits and of the horizontal merge limit
     * @param rulings          rulings; lines with a ruling between them are never united
     * @return one zone per group of connected lines
     */
    public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {

        double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
        double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
        // NOTE(review): the horizontal merge limit is derived from lineSpacing, not characterSpacing - confirm this is intended
        double minHorizontalMergeDistance = lineSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER;
        double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER;

        UnionFind<Line> zonesInProgress = new UnionFind<>(new HashSet<>(lines));

        double meanHeight = calculateMeanHeight(lines);

        for (Line outerLine : lines) {
            for (Line innerLine : lines) {

                if (innerLine == outerLine //
                    || zonesInProgress.inSameSet(outerLine, innerLine) //
                    || outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
                    continue;
                }

                // distances are judged relative to the smaller line height, clamped to a sane range
                double relativeHeight = Math.min(outerLine.getHeightDirAdj(), innerLine.getHeightDirAdj()) / meanHeight;
                double scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(relativeHeight, MAX_LINE_SIZE_SCALE));

                double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
                double verticalDistance = outerLine.verticalDistance(innerLine) / scale;

                boolean withinZoneLimits = minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance;
                boolean withinMergeLimits = minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance;
                if (!withinZoneLimits && !withinMergeLimits) {
                    continue;
                }

                if (rulings.lineBetween(outerLine, innerLine)) {
                    continue;
                }

                zonesInProgress.union(outerLine, innerLine);
            }
        }

        List<Zone> zones = zonesInProgress.getGroups()
                .stream()
                .map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
                .toList();
        return zones;
    }


    /** Mean of the direction-adjusted line heights, weighted by line length. */
    private double calculateMeanHeight(List<Line> lines) {

        double weightedHeightSum = 0.0;
        double weightSum = 0.0;
        for (Line line : lines) {
            double weight = line.getLength();
            weightedHeightSum += line.getHeightDirAdj() * weight;
            weightSum += weight;
        }
        return weightedHeightSum / weightSum;
    }


    /**
     * Pools the words of all given lines, regroups them into lines and wraps the sorted result in a
     * zone. The spacing parameters are currently not used.
     */
    private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {

        Set<Word> pooledWords = lines.stream()
                .flatMap(line -> line.getWords().stream())
                .collect(Collectors.toSet());

        Collection<Set<Word>> wordsPerLine = TextPositionOperations.groupByLine(pooledWords);
        return new Zone(TextPositionOperations.sortLines(wordsPerLine));
    }

}

View File

@ -0,0 +1,18 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
/**
 * Helpers for comparing doubles with a tolerance.
 */
public class DoubleUtils {

    /**
     * Compares two doubles, treating values whose difference is smaller than {@code precision} as equal.
     * If either value is NaN the tolerance is not applied and the result is that of
     * {@link Double#compare(double, double)}.
     *
     * @param d1        first value
     * @param d2        second value
     * @param precision exclusive upper bound of the absolute difference that still counts as equal
     * @return 0 if the values are within the tolerance, otherwise the result of {@code Double.compare(d1, d2)}
     */
    public static int compareDouble(double d1, double d2, double precision) {

        boolean bothAreNumbers = !Double.isNaN(d1) && !Double.isNaN(d2);
        boolean withinTolerance = bothAreNumbers && Math.abs(d1 - d2) < precision;
        return withinTolerance ? 0 : Double.compare(d1, d2);
    }

}

View File

@ -0,0 +1,76 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.utils;
/**
 * Table based approximation of {@link Math#atan2(double, double)}.
 * <p>
 * The arc tangent is precomputed for {@code TABLE_RESOLUTION + 1} evenly spaced ratios in [0, 1]; one
 * table per octant pair holds the value already mirrored into the right quadrant, so a lookup is a
 * couple of comparisons, one division and one array access. The ratio is truncated to a table index
 * without interpolation.
 */
public class FastAtan2 {

    private static final int TABLE_RESOLUTION = 1000;
    private static final int TABLE_SIZE = TABLE_RESOLUTION + 1;
    // full double precision; a float cast here would shift most tables by roughly 1e-7
    private static final double PI = Math.PI;
    private static final double HALF_PI = Math.PI / 2;

    // |y| <= |x|: index is |y| / |x|; suffix = sign of x, sign of y (P = plus, M = minus)
    private static final double[] ATAN2 = new double[TABLE_SIZE];
    private static final double[] ATAN2_PM = new double[TABLE_SIZE];
    private static final double[] ATAN2_MP = new double[TABLE_SIZE];
    private static final double[] ATAN2_MM = new double[TABLE_SIZE];

    // |y| > |x|: index is the reciprocal ratio |x| / |y|
    private static final double[] ATAN2_R = new double[TABLE_SIZE];
    private static final double[] ATAN2_RPM = new double[TABLE_SIZE];
    private static final double[] ATAN2_RMP = new double[TABLE_SIZE];
    private static final double[] ATAN2_RMM = new double[TABLE_SIZE];

    static {
        for (int i = 0; i <= TABLE_RESOLUTION; i++) {
            double ratio = (double) i / TABLE_RESOLUTION;
            double v = Math.atan2(ratio, 1);
            ATAN2[i] = v;
            ATAN2_PM[i] = PI - v;
            ATAN2_MP[i] = -v;
            ATAN2_MM[i] = -PI + v;

            ATAN2_R[i] = HALF_PI - v;
            ATAN2_RPM[i] = HALF_PI + v;
            ATAN2_RMP[i] = -HALF_PI + v;
            ATAN2_RMM[i] = -HALF_PI - v;
        }
    }


    /**
     * Approximates {@code Math.atan2(y, x)}.
     *
     * @param y ordinate
     * @param x abscissa
     * @return the looked-up angle in radians, in [-pi, pi]; for {@code y == 0 && x == 0} the result is 0
     */
    public static double fastAtan2(double y, double x) {

        if (y < 0) {
            if (x < 0) {
                // both negative: y < x is the same as |y| > |x|, and the ratios are positive as they are
                return y < x ? ATAN2_RMM[tableIndex(x, y)] : ATAN2_MM[tableIndex(y, x)];
            }
            double absY = -y;
            return absY > x ? ATAN2_RMP[tableIndex(x, absY)] : ATAN2_MP[tableIndex(absY, x)];
        }

        if (x < 0) {
            double absX = -x;
            return y > absX ? ATAN2_RPM[tableIndex(absX, y)] : ATAN2_PM[tableIndex(y, absX)];
        }
        return y > x ? ATAN2_R[tableIndex(x, y)] : ATAN2[tableIndex(y, x)];
    }


    /** Maps a ratio in [0, 1] to its table slot; a NaN ratio (0 / 0) maps to slot 0. */
    private static int tableIndex(double numerator, double denominator) {

        return (int) (numerator / denominator * TABLE_RESOLUTION);
    }

}

View File

@ -1,7 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.HashSet;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
@ -13,18 +17,13 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
public abstract class AbstractPageBlock extends Rectangle {
public abstract class AbstractPageBlock extends BoundingBox {
@JsonIgnore
protected float minX;
@JsonIgnore
protected float maxX;
@JsonIgnore
protected float minY;
@JsonIgnore
protected float maxY;
@JsonIgnore
protected PageBlockType classification;
Set<LayoutEngine> engines = new HashSet<>();
@JsonIgnore
protected int page;
@ -41,63 +40,6 @@ public abstract class AbstractPageBlock extends Rectangle {
}
public boolean containsBlock(TextPageBlock other) {
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
public boolean contains(AbstractPageBlock other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
}
public boolean contains(Rectangle other) {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
@JsonIgnore
public float getHeight() {
return maxY - minY;
}
@JsonIgnore
public float getWidth() {
return maxX - minX;
}
public boolean intersectsY(AbstractPageBlock apb) {
return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY();
}
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {
return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold);
}
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {
return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY();
}
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
}
public abstract boolean isEmpty();
}

View File

@ -3,8 +3,11 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.Data;
import lombok.NoArgsConstructor;
@ -22,8 +25,12 @@ public class ClassificationDocument {
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private LayoutDebugLayer layoutDebugLayer = new LayoutDebugLayer();
private boolean headlines;
private long rulesVersion;
private OutlineObjectTree outlineObjectTree;
private SectionTree sectionTree;
}

View File

@ -8,21 +8,26 @@ import java.util.Map;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data
@RequiredArgsConstructor
public class ClassificationPage {
@NonNull
private List<AbstractPageBlock> textBlocks;
private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;
@ -40,7 +45,7 @@ public class ClassificationPage {
private float pageWidth;
private float pageHeight;
CleanRulings cleanRulings;
private CleanRulings cleanRulings;
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();

View File

@ -12,6 +12,7 @@ import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@Deprecated
public class ClassificationSection {
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();

View File

@ -0,0 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.Map;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
/**
 * Pairs a parsed {@link Document} with the debug layer that was produced alongside it.
 */
public record DocumentWithVisualization(Document document, LayoutDebugLayer layoutDebugLayer) {

    /**
     * Tallies the sub nodes streamed by the document per node type.
     *
     * @return number of nodes for every {@link NodeType} that occurs at least once
     */
    public Map<NodeType, Long> buildSemanticNodeCounts() {

        Map<NodeType, Long> countsByType = document().streamAllSubNodes()
                .collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
        return countsByType;
    }

}

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
@ -12,10 +11,14 @@ import lombok.Getter;
@Getter
public class FloatFrequencyCounter {
Map<Float, Integer> countPerValue = new HashMap<>();
Map<Double, Integer> countPerValue = new HashMap<>();
boolean changed;
Double mostPopularCache;
public void add(float value) {
public void add(double value) {
changed = true;
if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1);
@ -25,9 +28,11 @@ public class FloatFrequencyCounter {
}
public void addAll(Map<Float, Integer> otherCounter) {
public void addAll(Map<Double, Integer> otherCounter) {
for (Map.Entry<Float, Integer> entry : otherCounter.entrySet()) {
changed = true;
for (Map.Entry<Double, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
} else {
@ -37,36 +42,36 @@ public class FloatFrequencyCounter {
}
public Float getMostPopular() {
public Double getMostPopular() {
Map.Entry<Float, Integer> mostPopular = null;
for (Map.Entry<Float, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
if (changed || mostPopularCache == null) {
Map.Entry<Double, Integer> mostPopular = null;
for (Map.Entry<Double, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
}
}
mostPopularCache = mostPopular != null ? mostPopular.getKey() : 0;
changed = false;
}
return mostPopular != null ? mostPopular.getKey() : null;
return mostPopularCache;
}
public List<Float> getHighterThanMostPopular() {
public List<Double> getValuesInReverseOrder() {
Float mostPopular = getMostPopular();
List<Float> higher = new ArrayList<>();
for (Float value : countPerValue.keySet()) {
if (value > mostPopular) {
higher.add(value);
}
}
return higher.stream().sorted(Collections.reverseOrder()).collect(Collectors.toList());
return countPerValue.keySet()
.stream()
.sorted(Collections.reverseOrder())
.collect(Collectors.toList());
}
public Float getHighest() {
public Double getHighest() {
Float highest = null;
for (Float value : countPerValue.keySet()) {
Double highest = null;
for (Double value : countPerValue.keySet()) {
if (highest == null || value > highest) {
highest = value;
}

View File

@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -16,8 +16,8 @@ import lombok.experimental.FieldDefaults;
public class LineInformation {
List<Rectangle2D> lineBBox;
List<List<TextPositionSequence>> sequencesByLines;
List<List<Word>> sequencesByLines;
List<List<Rectangle2D>> bBoxWithGapsByLines;
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
List<List<List<Word>>> sequencesWithGapsByLines;
}

View File

@ -9,12 +9,14 @@ public enum PageBlockType {
H6,
HEADER,
FOOTER,
TITLE,
PARAGRAPH,
PARAGRAPH_BOLD,
PARAGRAPH_ITALIC,
PARAGRAPH_UNKNOWN,
OTHER,
TABLE_OF_CONTENTS_HEADLINE,
TABLE_OF_CONTENTS_ITEM,
LIST_ITEM,
TABLE;
@ -31,8 +33,21 @@ public enum PageBlockType {
}
public static int getHeadlineNumber(PageBlockType pageBlockType) {
return switch (pageBlockType) {
case H1, TABLE_OF_CONTENTS_HEADLINE -> 1;
case H2 -> 2;
case H3 -> 3;
case H4 -> 4;
case H5 -> 5;
default -> 6;
};
}
public boolean isHeadline() {
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6) || this.equals(TABLE_OF_CONTENTS_HEADLINE);
}
}

View File

@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -15,7 +15,7 @@ import lombok.Getter;
@AllArgsConstructor
public class PageContents {
List<TextPositionSequence> sortedTextPositionSequences;
List<Word> sortedWords;
Rectangle2D cropBox;
Rectangle2D mediaBox;
List<Ruling> rulings;

View File

@ -3,26 +3,32 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class SectionIdentifier {
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
public static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d{1,2})(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?(?:[\\s.,;](\\d{1,2}))?");
public static Pattern alphanumericIdentifierPattern = Pattern.compile("^[\\s]?[A-Za-z][\\s.,;]?(\\d{1,2})[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?(\\d{1,2})?[\\s.,;]?");
private enum Format {
public enum Format {
EMPTY,
NUMERICAL,
ALPHANUMERIC,
DOCUMENT
}
@Getter
Format format;
@Getter
String identifierString;
List<Integer> identifiers;
boolean asChild;
@ -38,6 +44,10 @@ public class SectionIdentifier {
if (numericalIdentifierMatcher.find()) {
return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
}
Matcher alphanumericIdentifierMatcher = alphanumericIdentifierPattern.matcher(headline);
if (alphanumericIdentifierMatcher.find()) {
return buildAlphanumericSectionIdentifier(headline, alphanumericIdentifierMatcher);
}
// more formats here
return SectionIdentifier.empty();
}
@ -72,7 +82,36 @@ public class SectionIdentifier {
}
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
}
return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false);
return new SectionIdentifier(Format.NUMERICAL,
identifierString,
identifiers.stream()
.toList(),
false);
}
/**
 * Builds an {@link Format#ALPHANUMERIC} identifier from a headline that matched {@code alphanumericIdentifierPattern}.
 * <p>
 * The leading letter is mapped case-insensitively to its position in the alphabet (A = 1, B = 2, ...) and becomes
 * the first identifier; the numeric capture groups that follow are appended until the first missing or "0" group.
 *
 * @param headline                      the headline text the matcher was created from
 * @param alphanumericIdentifierMatcher a matcher on which {@code find()} has already succeeded
 * @return the section identifier for the matched prefix, with {@code asChild} set to {@code false}
 */
private static SectionIdentifier buildAlphanumericSectionIdentifier(String headline, Matcher alphanumericIdentifierMatcher) {

    String identifierString = headline.substring(alphanumericIdentifierMatcher.start(), alphanumericIdentifierMatcher.end());

    // The pattern permits one optional leading whitespace character, so the section letter is not guaranteed to be
    // the first character of the match. Strip it first, otherwise the whitespace would be mapped to a negative value.
    String sectionLetter = alphanumericIdentifierMatcher.group(0).stripLeading().substring(0, 1).toUpperCase(Locale.ENGLISH);
    int mappedCharacterValue = sectionLetter.charAt(0) - 'A' + 1;

    List<Integer> identifiers = new LinkedList<>();
    identifiers.add(mappedCharacterValue);

    // NOTE(review): only capture groups 1-3 are read although the pattern declares four numeric groups;
    // presumably this caps the identifier at four levels including the letter - confirm.
    for (int i = 1; i <= 3; i++) {
        String numericalIdentifier = alphanumericIdentifierMatcher.group(i);
        if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isBlank()) {
            break;
        }
        identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
    }

    return new SectionIdentifier(Format.ALPHANUMERIC,
                                 identifierString,
                                 identifiers.stream()
                                         .toList(),
                                 false);
}
@ -120,4 +159,22 @@ public class SectionIdentifier {
return identifierString;
}
/**
 * @return {@code true} if the format of this identifier equals {@link Format#EMPTY}
 */
public boolean isEmpty() {
return this.format.equals(Format.EMPTY);
}
/**
 * @return the number of entries in the identifiers list (presumably the nesting depth of the section, e.g. 3 for "1.2.3")
 */
public int level() {
return identifiers.size();
}
/**
 * @return the identifiers list held by this instance; the list itself is returned, not a copy
 */
protected List<Integer> getIdentifiers() {
return identifiers;
}
}

View File

@ -1,145 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.EqualsAndHashCode;
import lombok.Setter;
/**
 * Integer interval with an inclusive start and an exclusive end, printed as {@code [start|end)}.
 * <p>
 * The constructor rejects {@code start > end}; the Lombok-generated setters do not re-validate.
 */
@Setter
@EqualsAndHashCode
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
public class Boundary implements Comparable<Boundary> {

    private int start;
    private int end;


    /**
     * @param start inclusive start index
     * @param end   exclusive end index
     * @throws IllegalArgumentException if {@code start > end}
     */
    public Boundary(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        this.start = start;
        this.end = end;
    }


    /**
     * Combines the given boundaries into the smallest boundary covering all of them.
     *
     * @throws IllegalArgumentException if the collection is empty
     */
    public static Boundary merge(Collection<Boundary> boundaries) {

        if (boundaries.isEmpty()) {
            throw new IllegalArgumentException();
        }
        int lowestStart = Integer.MAX_VALUE;
        int highestEnd = Integer.MIN_VALUE;
        for (Boundary candidate : boundaries) {
            lowestStart = Math.min(lowestStart, candidate.start());
            highestEnd = Math.max(highestEnd, candidate.end());
        }
        return new Boundary(lowestStart, highestEnd);
    }


    /** @return the inclusive start index */
    public int start() {

        return start;
    }


    /** @return the exclusive end index */
    public int end() {

        return end;
    }


    /** @return {@code end - start} */
    public int length() {

        return end - start;
    }


    /** @return a stream of every index from start (inclusive) to end (exclusive) */
    public IntStream intStream() {

        return IntStream.range(start, end);
    }


    /** @return {@code true} if the index lies within {@code [start, end)} */
    public boolean contains(int index) {

        return index >= start && index < end;
    }


    /** @return {@code true} if the given boundary lies completely inside this one (shared edges allowed) */
    public boolean contains(Boundary boundary) {

        return boundary.start() >= start && boundary.end() <= end;
    }


    /**
     * @return {@code true} if the range {@code start..end} lies completely inside this boundary
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean contains(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return start >= this.start && end <= this.end;
    }


    /** @return {@code true} if this boundary lies completely inside the given one */
    public boolean containedBy(Boundary boundary) {

        return boundary.contains(this);
    }


    /**
     * @return {@code true} if this boundary lies completely inside the range {@code start..end}
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean containedBy(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
        }
        return this.start >= start && this.end <= end;
    }


    /** @return {@code true} if both boundaries share at least one index; merely touching boundaries do not intersect */
    public boolean intersects(Boundary boundary) {

        return this.start < boundary.end() && boundary.start() < this.end;
    }


    /**
     * Cuts this boundary at the given indices, which are processed in list order.
     * An index equal to the previous cut position (initially the start) is skipped, so no empty boundary is produced by it.
     *
     * @param splitIndices indices at which to cut; each must satisfy {@link #contains(int)}
     * @return the resulting boundaries, the last one ending at this boundary's end
     * @throws IndexOutOfBoundsException if any index lies outside this boundary
     */
    public List<Boundary> split(List<Integer> splitIndices) {

        List<Integer> outOfRange = splitIndices.stream().filter(idx -> !this.contains(idx)).toList();
        if (!outOfRange.isEmpty()) {
            throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", outOfRange, this));
        }

        List<Boundary> parts = new LinkedList<>();
        int cursor = start;
        for (int splitIndex : splitIndices) {
            if (splitIndex != cursor) {
                parts.add(new Boundary(cursor, splitIndex));
                cursor = splitIndex;
            }
        }
        parts.add(new Boundary(cursor, end));
        return parts;
    }


    /**
     * Orders boundaries only when both start and end are strictly smaller (-1) or strictly larger (1);
     * every other constellation, including nested boundaries, compares as 0.
     */
    @Override
    public int compareTo(Boundary boundary) {

        boolean strictlyBefore = start < boundary.start() && end < boundary.end();
        if (strictlyBefore) {
            return -1;
        }
        boolean strictlyAfter = start > boundary.start() && end > boundary.end();
        return strictlyAfter ? 1 : 0;
    }


    @Override
    public String toString() {

        return format("Boundary [%d|%d)", start, end);
    }

}

View File

@ -1,217 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph;
import static java.lang.String.format;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Tree of {@link Entry} objects whose root entry wraps the {@link Document}.
 * <p>
 * Every entry is addressed by its tree id: the list of child indices to follow from the root.
 * The root has the empty id; an entry's id is its parent's id plus its index in the parent's children.
 */
@Data
@EqualsAndHashCode
public class DocumentTree {

    private final Entry root;


    /**
     * Creates a tree containing only the root entry for the given document.
     */
    public DocumentTree(Document document) {

        root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build();
    }


    /**
     * Collects the leaf text blocks of all leaf nodes, in tree order, into a single text block.
     */
    public TextBlock buildTextBlock() {

        return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
    }


    /**
     * Appends the node as a direct child of the root.
     *
     * @return the tree id of the new entry
     */
    public List<Integer> createNewMainEntryAndReturnId(GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node);
    }


    /**
     * Appends the node as the last child of {@code parentNode}.
     *
     * @return the tree id of the new entry
     * @throws IllegalArgumentException if the parent's tree id does not exist in this tree
     */
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    /**
     * Appends the table as the last child of {@code parentNode}.
     *
     * @return the tree id of the new entry
     * @throws IllegalArgumentException if the parent's tree id does not exist in this tree
     */
    public List<Integer> createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) {

        return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node);
    }


    /**
     * Appends the table cell as the last child of {@code parentTable}.
     *
     * @return the tree id of the new entry
     * @throws IllegalArgumentException if the table's tree id does not exist in this tree
     */
    public List<Integer> createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) {

        return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell);
    }


    @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
    private List<Integer> createNewChildEntryAndReturnIdImpl(List<Integer> parentId, SemanticNode node) {

        if (!entryExists(parentId)) {
            throw new IllegalArgumentException(format("parentId %s does not exist!", parentId));
        }

        Entry parent = getEntryById(parentId);
        // the new id is the parent's id extended by the index the new entry will occupy
        List<Integer> newId = new LinkedList<>(parentId);
        newId.add(parent.children.size());
        parent.children.add(Entry.builder().treeId(newId).node(node).build());

        return newId;
    }


    /**
     * Checks whether every component of the tree id addresses an existing child.
     * All components, including the first, are bounds-checked so that an unknown id yields {@code false}
     * instead of an {@link IndexOutOfBoundsException}.
     */
    private boolean entryExists(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root != null;
        }
        Entry entry = root;
        for (int id : treeId) {
            if (id < 0 || id >= entry.children.size()) {
                return false;
            }
            entry = entry.children.get(id);
        }
        return true;
    }


    /**
     * @throws UnsupportedOperationException if the tree id is empty, i.e. addresses the root
     */
    public Entry getParentEntryById(List<Integer> treeId) {

        return getEntryById(getParentId(treeId));
    }


    /**
     * @return {@code true} for every non-empty tree id; only the root (empty id) has no parent
     */
    public boolean hasParentById(List<Integer> treeId) {

        return !treeId.isEmpty();
    }


    /**
     * Streams the nodes of the direct children of the addressed entry.
     */
    public Stream<SemanticNode> childNodes(List<Integer> treeId) {

        return getEntryById(treeId).children.stream().map(Entry::getNode);
    }


    /**
     * Streams the nodes of the direct children of the addressed entry that have the given type.
     */
    public Stream<SemanticNode> childNodesOfType(List<Integer> treeId, NodeType nodeType) {

        return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode);
    }


    private static List<Integer> getParentId(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            throw new UnsupportedOperationException("Root has no parent!");
        }
        if (treeId.size() < 2) {
            return Collections.emptyList();
        }
        return treeId.subList(0, treeId.size() - 1);
    }


    /**
     * Resolves a tree id to its entry by following the child indices from the root.
     * No bounds checks are performed; an id that does not exist fails inside {@code List.get}.
     */
    public Entry getEntryById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root;
        }
        Entry entry = root.children.get(treeId.get(0));
        for (int id : treeId.subList(1, treeId.size())) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /**
     * Streams the direct children of the root.
     */
    public Stream<Entry> mainEntries() {

        return root.children.stream();
    }


    /**
     * Streams all entries depth-first, each parent before its children, starting with the root.
     */
    public Stream<Entry> allEntriesInOrder() {

        return Stream.of(root).flatMap(DocumentTree::flatten);
    }


    /**
     * Streams all descendants of the addressed entry depth-first; the addressed entry itself is not included.
     */
    public Stream<Entry> allSubEntriesInOrder(List<Integer> parentId) {

        return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten);
    }


    @Override
    public String toString() {

        return String.join("\n", allEntriesInOrder().map(Entry::toString).toList());
    }


    private static Stream<Entry> flatten(Entry entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten));
    }


    /**
     * @return the node of the main entry (direct child of the root) the id belongs to, or the root node for the empty id
     */
    public SemanticNode getHighestParentById(List<Integer> treeId) {

        if (treeId.isEmpty()) {
            return root.node;
        }
        return root.children.get(treeId.get(0)).node;
    }


    /**
     * One node of the tree together with its tree id and its ordered children.
     */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
    public static class Entry {

        List<Integer> treeId;
        SemanticNode node;
        @Builder.Default
        List<Entry> children = new LinkedList<>();


        @Override
        public String toString() {

            return node.toString();
        }


        public NodeType getType() {

            return node.getType();
        }

    }

}

View File

@ -1,8 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
/**
 * Kind of a redaction entity; stored as the entityType of a RedactionEntity and part of its equality.
 * NOTE(review): the exact semantics of the individual constants are not visible here - presumably
 * the FALSE_* constants mark overrides of ENTITY / RECOMMENDATION; confirm against the rule engine.
 */
public enum EntityType {
ENTITY,
RECOMMENDATION,
FALSE_POSITIVE,
FALSE_RECOMMENDATION
}

View File

@ -1,228 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * An entity found in the document text, located by a {@link Boundary} and linked to the
 * pages and semantic nodes it occurs in.
 * <p>
 * Equality and hash code are based on {@code boundary}, {@code type}, {@code entityType} and {@code value} only.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class RedactionEntity {

    // initial values
    @EqualsAndHashCode.Include
    final Boundary boundary;
    @EqualsAndHashCode.Include
    final String type;
    @EqualsAndHashCode.Include
    final EntityType entityType;

    // empty defaults
    boolean redaction;
    boolean removed;
    boolean ignored;
    boolean resized;
    boolean skipRemoveEntitiesContainedInLarger;
    boolean dictionaryEntry;
    boolean dossierDictionaryEntry;
    Set<Engine> engines;
    Set<RedactionEntity> references;
    @Builder.Default
    Deque<Integer> matchedRules = new LinkedList<>();
    String redactionReason;
    String legalBasis;

    // inferred on graph insertion
    @EqualsAndHashCode.Include
    String value;
    String textBefore;
    String textAfter;
    @Builder.Default
    Set<Page> pages = new HashSet<>();
    List<RedactionPosition> redactionPositionsPerPage;
    @Builder.Default
    List<SemanticNode> intersectingNodes = new LinkedList<>();
    SemanticNode deepestFullyContainingNode;


    /**
     * Creates an entity with the given initial values and empty, mutable engine and reference sets.
     */
    public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {

        return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
    }


    /**
     * @return {@code true} if any intersecting node is an instance of the given class
     */
    public boolean occursInNodeOfType(Class<? extends SemanticNode> clazz) {

        return intersectingNodes.stream().anyMatch(clazz::isInstance);
    }


    /**
     * @return {@code true} if any intersecting node equals the given node
     */
    public boolean occursInNode(SemanticNode semanticNode) {

        return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode));
    }


    public boolean isType(String type) {

        return this.type.equals(type);
    }


    public boolean isAnyType(List<String> types) {

        return types.contains(type);
    }


    public void addIntersectingNode(SemanticNode containingNode) {

        intersectingNodes.add(containingNode);
    }


    /**
     * Detaches this entity from all nodes and pages it is registered in, clears its own links
     * and marks it as removed and ignored.
     */
    public void removeFromGraph() {

        intersectingNodes.forEach(node -> node.getEntities().remove(this));
        pages.forEach(page -> page.getEntities().remove(this));
        intersectingNodes = new LinkedList<>();
        deepestFullyContainingNode = null;
        pages = new HashSet<>();
        removed = true;
        ignored = true;
    }


    public void addMatchedRule(int ruleNumber) {

        matchedRules.add(ruleNumber);
    }


    /**
     * @return the most recently added rule number, or 0 if no rule has matched
     */
    public int getMatchedRule() {

        if (matchedRules.isEmpty()) {
            return 0;
        }
        return matchedRules.getLast();
    }


    /**
     * Returns the positions of this entity, one per page, computing and caching them on first use.
     * The position on the first page carries the plain id; positions on other pages get the id suffixed
     * with "-" and the page number.
     *
     * @throws RuntimeException if the text block yields no positions on any page
     */
    public List<RedactionPosition> getRedactionPositionsPerPage() {

        if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
            Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);

            Page firstPage = rectanglesPerLinePerPage.keySet()
                    .stream()
                    .min(Comparator.comparingInt(Page::getNumber))
                    .orElseThrow(() -> new RuntimeException("No Positions found on any page!"));

            String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList());
            redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList();
        }
        return redactionPositionsPerPage;
    }


    private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry<Page, List<Rectangle2D>> entry) {

        if (entry.getKey().equals(firstPage)) {
            return new RedactionPosition(id, entry.getKey(), entry.getValue());
        } else {
            return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue());
        }
    }


    public boolean containedBy(RedactionEntity redactionEntity) {

        return this.boundary.containedBy(redactionEntity.getBoundary());
    }


    public boolean contains(RedactionEntity redactionEntity) {

        return this.boundary.contains(redactionEntity.getBoundary());
    }


    public boolean intersects(RedactionEntity redactionEntity) {

        return this.boundary.intersects(redactionEntity.getBoundary());
    }


    public void addEngine(Engine engine) {

        engines.add(engine);
    }


    public void addEngines(Set<Engine> engines) {

        this.engines.addAll(engines);
    }


    public void addReference(RedactionEntity reference) {

        references.add(reference);
    }


    public void addReferences(List<RedactionEntity> references) {

        this.references.addAll(references);
    }


    /**
     * @return {@code true} if any of this entity's per-page positions has the given id
     */
    public boolean matchesAnnotationId(String manualRedactionId) {

        return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId));
    }


    @Override
    public String toString() {

        // Join the page numbers instead of appending ", " after each and trimming the tail:
        // with no pages (e.g. after removeFromGraph) the trim would cut into the "pages[" label.
        String pageNumbers = String.join(", ", pages.stream().map(page -> String.valueOf(page.getNumber())).toList());

        StringBuilder sb = new StringBuilder();
        sb.append("Entity[\"");
        sb.append(value);
        sb.append("\", ");
        sb.append(boundary);
        sb.append(", pages[");
        sb.append(pageNumbers);
        sb.append("], type = \"");
        sb.append(type);
        sb.append("\", EntityType.");
        sb.append(entityType);
        sb.append("]");
        return sb.toString();
    }

}

View File

@ -1,24 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.entity;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * The rectangles of an entity on one page, together with the id under which they are reported.
 */
@Data
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedactionPosition {
// Id of this position; immutable once constructed
final String id;
// Page the rectangles belong to
Page page;
// Each entry in this list corresponds to an entry in the redaction log, this means:
// An entity might be represented by multiple redaction log entries
List<Rectangle2D> rectanglePerLine;
}

View File

@ -1,134 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Root node of the document tree. It holds the pages, the tree itself and a lazily built
 * text block covering all leaf nodes.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Document implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
    Set<Page> pages;
    DocumentTree documentTree;
    Integer numberOfPages;
    TextBlock textBlock;
    @Builder.Default
    Set<RedactionEntity> entities = new HashSet<>();


    @Override
    public NodeType getType() {

        return NodeType.DOCUMENT;
    }


    /**
     * Returns the text block of the whole document, building it from all leaf text blocks on first access.
     */
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /**
     * @return the direct children of type {@link NodeType#SECTION}, in a new list
     */
    public List<Section> getMainSections() {

        return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node)
                .collect(Collectors.toList());
    }


    /**
     * Streams the leaf text blocks of all leaf nodes in tree order.
     */
    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {

        return streamAllNodes().filter(SemanticNode::isLeaf)
                .map(SemanticNode::getLeafTextBlock);
    }


    /**
     * @return the empty list, as the document is the root of the tree
     */
    @Override
    public List<Integer> getTreeId() {

        return Collections.emptyList();
    }


    /**
     * Not supported: the document is always the root and keeps the empty tree id.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setTreeId(List<Integer> tocId) {

        throw new UnsupportedOperationException("Document is always the root of the Table of Contents");
    }


    /**
     * @return the first headline found among the sub nodes, or an empty {@link Headline} if there is none
     */
    @Override
    public Headline getHeadline() {

        // orElseGet: only build the empty fallback when no headline exists
        return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node)
                .findFirst()
                .orElseGet(() -> Headline.builder().build());
    }


    private Stream<SemanticNode> streamAllNodes() {

        return documentTree.allEntriesInOrder()
                .map(DocumentTree.Entry::getNode);
    }


    public Stream<Image> streamAllImages() {

        return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node);
    }


    /**
     * @return the number of sub nodes per node type
     */
    public Map<NodeType, Long> buildSemanticNodeCounts() {

        return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
    }


    @Override
    public String toString() {

        return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary();
    }


    /**
     * @return for every page a rectangle at the origin spanning the page's full width and height
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> bBox = new HashMap<>();
        for (Page page : pages) {
            bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()));
        }
        return bBox;
    }

}

View File

@ -1,84 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node of type {@link NodeType#FOOTER}; its text is held directly in {@code leafTextBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Footer implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.FOOTER;
    }


    /**
     * @return the leaf text block of this footer
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * Returns the bounding boxes per page, computed via the interface default on first call and cached afterwards.
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache != null) {
            return bBoxCache;
        }
        bBoxCache = GenericSemanticNode.super.getBBox();
        return bBoxCache;
    }


    @Override
    public String toString() {

        final String prefix = treeId + ": " + NodeType.FOOTER + ": ";
        return prefix + leafTextBlock.buildSummary();
    }

}

View File

@ -1,5 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
/**
 * Sub-interface of SemanticNode that declares no members of its own.
 * It is used as the parameter type of the generic entry-creation methods of DocumentTree.
 */
public interface GenericSemanticNode extends SemanticNode {
}

View File

@ -1,83 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node of type {@link NodeType#HEADER}; its text is held directly in {@code leafTextBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Header implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    @Override
    public NodeType getType() {

        return NodeType.HEADER;
    }


    @Override
    public boolean isLeaf() {

        return true;
    }


    /**
     * @return the leaf text block of this header
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * Returns the bounding boxes per page, computed via the interface default on first call and cached afterwards.
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache != null) {
            return bBoxCache;
        }
        bBoxCache = GenericSemanticNode.super.getBBox();
        return bBoxCache;
    }


    @Override
    public String toString() {

        final String prefix = treeId + ": " + NodeType.HEADER + ": ";
        return prefix + leafTextBlock.buildSummary();
    }

}

View File

@ -1,90 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node of type {@link NodeType#HEADLINE}; its text is held directly in {@code leafTextBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Headline implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.HEADLINE;
    }


    /**
     * @return this instance, as a headline is its own headline
     */
    @Override
    public Headline getHeadline() {

        return this;
    }


    /**
     * @return the leaf text block of this headline
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * Returns the bounding boxes per page, computed via the interface default on first call and cached afterwards.
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache != null) {
            return bBoxCache;
        }
        bBoxCache = GenericSemanticNode.super.getBBox();
        return bBoxCache;
    }


    @Override
    public String toString() {

        final String prefix = treeId + ": " + NodeType.HEADLINE + ": ";
        return prefix + leafTextBlock.buildSummary();
    }

}

View File

@ -1,101 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Node of type {@link NodeType#IMAGE}, located on exactly one page at {@code position}.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Image implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
    List<Integer> treeId;
    String id;

    ImageType imageType;
    boolean transparent;
    Rectangle2D position;

    boolean redaction;
    boolean ignored;
    @Builder.Default
    String redactionReason = "";
    @Builder.Default
    String legalBasis = "";
    @Builder.Default
    int matchedRule = -1;

    @EqualsAndHashCode.Exclude
    Page page;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    @Override
    public NodeType getType() {

        return NodeType.IMAGE;
    }


    /**
     * @return a singleton set holding the page of this image
     */
    @Override
    public Set<Page> getPages() {

        return Collections.singleton(page);
    }


    /**
     * @return a new map with a single entry: the image's page mapped to its position
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> positionByPage = new HashMap<>();
        positionByPage.put(page, position);
        return positionByPage;
    }


    /**
     * Collects the leaf text blocks of all leaf sub nodes into one text block; rebuilt on every call.
     */
    @Override
    public TextBlock getTextBlock() {

        return streamAllSubNodes().filter(SemanticNode::isLeaf)
                .map(SemanticNode::getLeafTextBlock)
                .collect(new TextBlockCollector());
    }


    @Override
    public String toString() {

        final String prefix = treeId + ": " + NodeType.IMAGE + ": ";
        return prefix + imageType.toString() + " " + position;
    }

}

View File

@ -1,25 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.Locale;
/**
 * Categories an image can be classified as.
 */
public enum ImageType {
    LOGO,
    FORMULA,
    SIGNATURE,
    SIGNATURE_VISUAL,
    OTHER,
    OCR;


    /**
     * Maps a label to its image type, ignoring case.
     * Only "logo", "formula", "signature" and "ocr" are recognised; every other label yields {@link #OTHER}.
     *
     * @param imageType the label to map; must not be {@code null}
     * @return the matching type, or {@link #OTHER} for an unrecognised label
     */
    public static ImageType fromString(String imageType) {

        String label = imageType.toLowerCase(Locale.ROOT);
        if ("logo".equals(label)) {
            return LOGO;
        }
        if ("formula".equals(label)) {
            return FORMULA;
        }
        if ("signature".equals(label)) {
            return SIGNATURE;
        }
        if ("ocr".equals(label)) {
            return OCR;
        }
        return OTHER;
    }
}

View File

@ -1,87 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * A page of the document with its dimensions, rotation and the nodes, entities and images located on it.
 * <p>
 * Identity is defined by the page number alone: {@link #hashCode()} returns the number and
 * {@link #equals(Object)} compares hash codes.
 */
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Page {

    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;
    @EqualsAndHashCode.Exclude
    Header header;
    @EqualsAndHashCode.Exclude
    Footer footer;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<Image> images = new HashSet<>();


    /**
     * Creates a page from a classification page, truncating height and width to {@code int}
     * and starting with an empty main body.
     */
    public static Page fromClassificationPage(ClassificationPage classificationPage) {

        int pageHeight = (int) classificationPage.getPageHeight();
        int pageWidth = (int) classificationPage.getPageWidth();
        return Page.builder()
                .height(pageHeight)
                .width(pageWidth)
                .number(classificationPage.getPageNumber())
                .rotation(classificationPage.getRotation())
                .mainBody(new LinkedList<>())
                .build();
    }


    /**
     * Collects the leaf text blocks of all leaf nodes in the main body into one text block.
     */
    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream()
                .filter(SemanticNode::isLeaf)
                .map(SemanticNode::getLeafTextBlock)
                .collect(new TextBlockCollector());
    }


    @Override
    public boolean equals(Object o) {

        if (!(o instanceof Page)) {
            return false;
        }
        return o.hashCode() == this.hashCode();
    }


    @Override
    public int hashCode() {

        return number;
    }


    @Override
    public String toString() {

        return String.valueOf(number);
    }

}

View File

@ -1,82 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * Leaf node of type {@link NodeType#PARAGRAPH}; its text is held directly in {@code leafTextBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Paragraph implements GenericSemanticNode {

    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
    List<Integer> treeId;
    TextBlock leafTextBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    @Override
    public boolean isLeaf() {

        return true;
    }


    @Override
    public NodeType getType() {

        return NodeType.PARAGRAPH;
    }


    /**
     * @return the leaf text block of this paragraph
     */
    @Override
    public TextBlock getTextBlock() {

        return leafTextBlock;
    }


    /**
     * Returns the bounding boxes per page, computed via the interface default on first call and cached afterwards.
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache != null) {
            return bBoxCache;
        }
        bBoxCache = GenericSemanticNode.super.getBBox();
        return bBoxCache;
    }


    @Override
    public String toString() {

        final String prefix = treeId + ": " + NodeType.PARAGRAPH + ": ";
        return prefix + leafTextBlock.buildSummary();
    }

}

View File

@ -1,98 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Inner node of the document tree. A section has no text of its own; its {@link TextBlock} is assembled from
 * the leaf nodes located underneath it and cached after the first access.
 */
@Slf4j
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Section implements GenericSemanticNode {

    /** Layout engines that contributed to this node; starts out with {@link LayoutEngine#ALGORITHM}. */
    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));

    /** Identifier of this node inside the {@link DocumentTree}. */
    List<Integer> treeId;

    /** Lazily built by {@link #getTextBlock()}; {@code null} until the first call unless set explicitly. */
    TextBlock textBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    /** Lazily built by {@link #getBBox()}; {@code null} until the first call. */
    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    /**
     * @return always {@link NodeType#SECTION}
     */
    @Override
    public NodeType getType() {

        return NodeType.SECTION;
    }


    /**
     * Checks whether any node underneath this section, at any depth, is a table.
     *
     * @return true, if at least one sub node has the type {@link NodeType#TABLE}
     */
    public boolean hasTables() {

        return streamAllSubNodesOfType(NodeType.TABLE).findAny()
                .isPresent();
    }


    /**
     * Collects the leaf text blocks of all leaf sub nodes into a single TextBlock.
     * The result is cached, so later structural changes are not reflected.
     *
     * @return the combined TextBlock of this section
     */
    @Override
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
                    .map(SemanticNode::getLeafTextBlock)
                    .collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /**
     * @return "&lt;treeId&gt;: SECTION: &lt;summary of the text block&gt;"
     */
    @Override
    public String toString() {

        // plain concatenation keeps this usable for logging while treeId is still unset
        return treeId + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
    }


    /**
     * Returns the first direct child of type {@link NodeType#HEADLINE}; when this section has none,
     * the lookup continues at the parent node.
     *
     * @return the headline governing this section
     */
    @Override
    public Headline getHeadline() {

        return streamChildrenOfType(NodeType.HEADLINE)//
                .map(node -> (Headline) node)//
                .findFirst()//
                .orElseGet(() -> getParent().getHeadline());
    }


    /**
     * Returns the bounding box per page, computing it through the interface default on the first call only.
     *
     * @return cached map from page to bounding box
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache == null) {
            bBoxCache = GenericSemanticNode.super.getBBox();
        }
        return bBoxCache;
    }

}

View File

@ -1,477 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
/**
 * Common contract of all nodes in the {@link DocumentTree}. Besides a handful of accessors that every
 * implementation has to provide, the interface offers default implementations for navigating the tree,
 * searching the node's text, and computing bounding boxes.
 */
public interface SemanticNode {

    /**
     * Returns the type of this node, such as Section, Paragraph, etc.
     *
     * @return NodeType of this node
     */
    NodeType getType();


    /**
     * Searches all Nodes located underneath this Node in the DocumentTree and concatenates their AtomicTextBlocks into a single TextBlock.
     * So, for a Section all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlock.
     * If the Node is a Leaf, the LeafTextBlock will be returned instead.
     *
     * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
     */
    TextBlock getTextBlock();


    /**
     * Any Node maintains its own Set of Entities.
     * This Set contains all Entities whose boundary intersects the boundary of this node.
     *
     * @return Set of all Entities associated with this Node
     */
    Set<RedactionEntity> getEntities();


    /**
     * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the Pages from each AtomicTextBlock belonging to this node's TextBlock.
     *
     * @return Set of Pages this node appears on.
     */
    default Set<Page> getPages() {

        return getTextBlock().getPages();
    }


    /**
     * Returns the page with the lowest page number among the pages of this node.
     *
     * @return the first Page this node appears on
     * @throws IllegalStateException if this node's TextBlock is not located on any page
     */
    default Page getFirstPage() {

        return getTextBlock().getPages()
                .stream()
                .min(Comparator.comparingInt(Page::getNumber))
                .orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
    }


    /**
     * Each AtomicTextBlock is assigned a page, so to get the pages for the provided boundary, it collects the Pages from each AtomicTextBlock of this node's TextBlock that belongs to the boundary.
     *
     * @param boundary the boundary to look up, must be contained in the boundary of this node
     * @return Set of Pages the provided boundary appears on.
     * @throws IllegalArgumentException if the boundary is not contained in the boundary of this node
     */
    default Set<Page> getPages(Boundary boundary) {

        if (!getBoundary().contains(boundary)) {
            throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
        }
        return getTextBlock().getPages(boundary);
    }


    /**
     * Checks whether this node appears on the page with the provided number.
     *
     * @param pageNumber the page number to look for
     * @return true, if any page of this node has the provided number
     */
    default boolean isOnPage(int pageNumber) {

        return getPages().stream()
                .anyMatch(page -> page.getNumber() == pageNumber);
    }


    /**
     * Returns the DocumentTree Object.
     *
     * @return the DocumentTree of the Document this node belongs to
     */
    DocumentTree getDocumentTree();


    /**
     * The id is a List of Integers uniquely identifying this node in the DocumentTree.
     *
     * @return the DocumentTree ID
     */
    List<Integer> getTreeId();


    /**
     * This should only be used during graph construction.
     *
     * @param tocId List of Integers
     */
    void setTreeId(List<Integer> tocId);


    /**
     * Traverses the Tree up by asking the parent for its Headline. Implementations such as Section override this
     * to return the first Headline from their children.
     * Presumably fails with the DocumentTree's not-found error if no parent exists; that behaviour is defined by
     * the DocumentTree and not by this interface.
     *
     * @return First Headline found
     */
    default Headline getHeadline() {

        return getParent().getHeadline();
    }


    /**
     * Asks the DocumentTree whether the tree id of this node has a parent.
     *
     * @return boolean indicating whether this Node has a Parent in the DocumentTree
     */
    default boolean hasParent() {

        return getDocumentTree().hasParentById(getTreeId());
    }


    /**
     * Looks up the parent entry of this node in the DocumentTree.
     * The behaviour for a node without a parent is defined by the DocumentTree.
     *
     * @return The SemanticNode representing the Parent in the DocumentTree
     */
    default SemanticNode getParent() {

        return getDocumentTree().getParentEntryById(getTreeId()).getNode();
    }


    /**
     * @return The SemanticNode which is directly underneath the document and also under which this node is.
     * If this is the highest child node or the document itself, it returns itself.
     */
    default SemanticNode getHighestParent() {

        return getDocumentTree().getHighestParentById(getTreeId());
    }


    /**
     * Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden.
     * Currently only Sections, Images, and Tables are not leaves.
     * A TableCell is a leaf only as long as it has no children.
     *
     * @return boolean, indicating if a Node has direct access to a TextBlock
     */
    default boolean isLeaf() {

        return false;
    }


    /**
     * Returns the TextBlock a leaf node holds directly. The default implementation always throws,
     * so every leaf implementation has to override it.
     *
     * @return the TextBlock held directly by this leaf node
     * @throws UnsupportedOperationException if the implementation does not provide a LeafTextBlock
     */
    default TextBlock getLeafTextBlock() {

        throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!");
    }


    /**
     * Should only be used during construction of the Graph. Sets the LeafTextBlock of this SemanticNode.
     *
     * @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode
     * @throws UnsupportedOperationException if the implementation does not provide a LeafTextBlock
     */
    default void setLeafTextBlock(TextBlock textBlock) {

        throw new UnsupportedOperationException();
    }


    /**
     * Checks whether this SemanticNode has any Entity with EntityType.ENTITY of the provided type.
     *
     * @param type string representing the type of entity to check for
     * @return true, if this SemanticNode has at least one Entity of the provided type
     */
    default boolean hasEntitiesOfType(String type) {

        return getEntities().stream()
                .filter(entity -> entity.getEntityType().equals(EntityType.ENTITY))
                .anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
    }


    /**
     * Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author".
     * Unlike {@link #hasEntitiesOfType(String)}, no filtering by EntityType takes place.
     *
     * @param type string representing the type of entities to return
     * @return List of RedactionEntities of the provided type
     */
    default List<RedactionEntity> getEntitiesOfType(String type) {

        return getEntities().stream()
                .filter(redactionEntity -> redactionEntity.getType().equals(type))
                .toList();
    }


    /**
     * Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author".
     *
     * @param types A list of strings representing the types of entities to return
     * @return List of RedactionEntities of any provided type
     */
    default List<RedactionEntity> getEntitiesOfType(List<String> types) {

        return getEntities().stream()
                .filter(redactionEntity -> redactionEntity.isAnyType(types))
                .toList();
    }


    /**
     * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
     * If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
     *
     * @return Integer representing the number on the page
     */
    default Integer getNumberOnPage() {

        // resolve the TextBlock once; some implementations assemble it on demand
        TextBlock textBlock = getTextBlock();
        if (textBlock.getAtomicTextBlocks().isEmpty()) {
            return -1;
        }
        return textBlock.getAtomicTextBlocks()
                .get(0).getNumberOnPage();
    }


    /**
     * Checks if the SemanticNode contains any text.
     *
     * @return true, if this node's TextBlock is not empty
     */
    default boolean hasText() {

        return !getTextBlock().isEmpty();
    }


    /**
     * Checks whether this SemanticNode contains the provided String.
     *
     * @param string A String which the TextBlock might contain
     * @return true, if this node's TextBlock contains the string
     */
    default boolean containsString(String string) {

        return getTextBlock().getSearchText().contains(string);
    }


    /**
     * Checks whether this SemanticNode contains all the provided Strings.
     *
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains all strings
     */
    default boolean containsStrings(List<String> strings) {

        return strings.stream()
                .allMatch(this::containsString);
    }


    /**
     * Checks whether this SemanticNode contains the provided String ignoring case.
     * Both sides are lower-cased with {@link Locale#ROOT} before comparing.
     *
     * @param string A String which the TextBlock might contain
     * @return true, if this node's TextBlock contains the string ignoring case
     */
    default boolean containsStringIgnoreCase(String string) {

        return getTextBlock().getSearchText().toLowerCase(Locale.ROOT).contains(string.toLowerCase(Locale.ROOT));
    }


    /**
     * Checks whether this SemanticNode contains any of the provided Strings.
     *
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the strings
     */
    default boolean containsAnyString(List<String> strings) {

        return strings.stream()
                .anyMatch(this::containsString);
    }


    /**
     * Checks whether this SemanticNode contains any of the provided Strings ignoring case.
     *
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the strings ignoring case
     */
    default boolean containsAnyStringIgnoreCase(List<String> strings) {

        return strings.stream()
                .anyMatch(this::containsStringIgnoreCase);
    }


    /**
     * This function is used during insertion of RedactionEntities into the graph. It checks if the boundary of this node's TextBlock
     * intersects or even fully contains the boundary of the RedactionEntity.
     * On intersection the node registers itself as intersecting node of the entity, on containment additionally as the deepest fully
     * containing node, and then recursively calls this function on all intersecting children.
     *
     * @param redactionEntity RedactionEntity, which is being inserted into the graph
     */
    default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {

        TextBlock textBlock = getTextBlock();
        if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
            if (textBlock.containsBoundary(redactionEntity.getBoundary())) {
                // children are visited afterwards, so a deeper containing node overwrites this one
                redactionEntity.setDeepestFullyContainingNode(this);
            }
            redactionEntity.addIntersectingNode(this);
            streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary()))
                    .forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
        }
    }


    /**
     * Returns the set of layout engines.
     *
     * @return set of layout engines.
     */
    Set<LayoutEngine> getEngines();


    /**
     * Adds a layout engine to the set returned by {@link #getEngines()}.
     *
     * @param engine the layout engine to add
     */
    default void addEngine(LayoutEngine engine) {

        getEngines().add(engine);
    }


    /**
     * Streams all children located directly underneath this node in the DocumentTree.
     *
     * @return Stream of all children
     */
    default Stream<SemanticNode> streamChildren() {

        return getDocumentTree().childNodes(getTreeId());
    }


    /**
     * Streams all children located directly underneath this node in the DocumentTree of the provided type.
     *
     * @param nodeType the type the children must have
     * @return Stream of all children of the provided type
     */
    default Stream<SemanticNode> streamChildrenOfType(NodeType nodeType) {

        return getDocumentTree().childNodesOfType(getTreeId(), nodeType);
    }


    /**
     * Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order.
     *
     * @return Stream of all SubNodes
     */
    default Stream<SemanticNode> streamAllSubNodes() {

        return getDocumentTree().allSubEntriesInOrder(getTreeId())
                .map(DocumentTree.Entry::getNode);
    }


    /**
     * Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in order.
     *
     * @param nodeType the type the sub nodes must have
     * @return Stream of all SubNodes of the provided type
     */
    default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {

        return getDocumentTree().allSubEntriesInOrder(getTreeId())
                .filter(entry -> entry.getType().equals(nodeType))
                .map(DocumentTree.Entry::getNode);
    }


    /**
     * The Boundary is the start and end string offsets in the reading order of the document.
     *
     * @return Boundary of this Node's TextBlock
     */
    default Boundary getBoundary() {

        return getTextBlock().getBoundary();
    }


    /**
     * If this Node is a Leaf it will calculate the boundingBox of its LeafTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children.
     * If called on the Document, it will return the cropbox of each page.
     *
     * @return Rectangle2D fully encapsulating this Node for each page.
     */
    default Map<Page, Rectangle2D> getBBox() {

        if (isLeaf()) {
            return getBBoxFromLeafTextBlock();
        }
        return getBBoxFromChildren();
    }


    /**
     * Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page.
     * Returns false when this node is not located on the page or has no bounding box there.
     *
     * @param rectangle2D The rectangle to check if it is contained
     * @param pageNumber  The Page number on which the rectangle should be checked
     * @return true, if the bounding box of this node on the provided page contains the rectangle
     */
    default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) {

        // a Page carrying only the number serves as lookup key for the page set and the bounding box map
        Page helperPage = Page.builder().number(pageNumber).build();
        if (!getPages().contains(helperPage)) {
            return false;
        }
        // the bounding box map is computed independently of getPages(), so an entry may be missing
        Rectangle2D bBoxOnPage = getBBox().get(helperPage);
        return bBoxOnPage != null && bBoxOnPage.contains(rectangle2D);
    }


    /**
     * TODO: this produces unwanted results for sections spanning multiple columns.
     * Computes the Union of the bounding boxes of all children recursively.
     *
     * @return The union of the BoundingBoxes of all children
     */
    private Map<Page, Rectangle2D> getBBoxFromChildren() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox)
                .toList();
        Set<Page> pages = childrenBBoxes.stream()
                .flatMap(map -> map.keySet().stream())
                .collect(Collectors.toSet());
        for (Page page : pages) {
            // union of the boxes of every child that is present on this page
            Rectangle2D bBoxOnPage = childrenBBoxes.stream()
                    .filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
                    .map(childBboxPerPage -> childBboxPerPage.get(page))
                    .collect(RectangleTransformations.collectBBox());
            bBoxPerPage.put(page, bBoxOnPage);
        }
        return bBoxPerPage;
    }


    /**
     * Groups the AtomicTextBlocks of this node's TextBlock by page and unions their bounding boxes per page.
     *
     * @return The union of all BoundingBoxes of the TextBlock of this node
     */
    private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {

        Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
        Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
                .stream()
                .collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
        return bBoxPerPage;
    }

}

View File

@ -1,359 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * Node representing a table. Its direct children in the {@link DocumentTree} are the {@link TableCell}s,
 * which {@link #getCell(int, int)} addresses row-wise through the index {@code row * numberOfCols + col}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class Table implements SemanticNode {

    /** Layout engines that contributed to this node; starts out with {@link LayoutEngine#ALGORITHM}. */
    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));

    /** Identifier of this node inside the {@link DocumentTree}. */
    List<Integer> treeId;

    // excluded from equals/hashCode like in the other node classes of this package
    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    int numberOfRows;
    int numberOfCols;

    /** Lazily built by {@link #getTextBlock()}; {@code null} until the first call unless set explicitly. */
    TextBlock textBlock;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();

    /** Lazily built by {@link #getBBox()}; {@code null} until the first call. */
    @EqualsAndHashCode.Exclude
    Map<Page, Rectangle2D> bBoxCache;


    /**
     * Streams all entities in this table, that appear in a row, which contains all of the provided strings ignoring case.
     *
     * @param strings Strings to check whether a row contains them
     * @return Stream of all entities in this table, that appear in a row, which contains all of the provided strings
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {

        return IntStream.range(0, numberOfRows).boxed()
                .filter(row -> rowContainsStringsIgnoreCase(row, strings))
                .flatMap(this::streamRow)
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Checks whether the specified row contains all the provided strings ignoring case.
     * Row text and search strings are both lower-cased with {@link Locale#ROOT}, so the result does not depend on the default locale.
     *
     * @param row     the row to check as an Integer, must be smaller than numberOfRows
     * @param strings a list of strings to check for
     * @return true, if all strings appear in the provided row
     */
    public boolean rowContainsStringsIgnoreCase(Integer row, List<String> strings) {

        String rowText = streamRow(row).map(TableCell::getTextBlock)
                .collect(new TextBlockCollector()).getSearchText().toLowerCase(Locale.ROOT);

        return strings.stream()
                .map(string -> string.toLowerCase(Locale.ROOT))
                .allMatch(rowText::contains);
    }


    /**
     * Streams all entities which appear in a row where at least one cell has the provided header and the provided value.
     *
     * @param header the header value to search for
     * @param value  the string which the table cell should contain
     * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {

        List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
                .map(TableCell::getCol)
                .toList();

        // a cell qualifies when, in its own row, any of the matching columns holds the value
        return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
                        .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsString(value)))
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams all entities which appear in a row where at least one cell has the provided header and any provided value.
     *
     * @param header the header value to search for
     * @param values the strings which the table cell should contain
     * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {

        List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
                .map(TableCell::getCol)
                .toList();

        return streamTableCells().filter(tableCellNode -> colsWithHeader.stream()
                        .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values)))
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
     *
     * @param types type strings to check whether a row contains an entity like them
     * @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {

        List<Integer> rowsWithEntityOfType = IntStream.range(0, numberOfRows).boxed()
                .filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream()
                        .anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
                .toList();

        return rowsWithEntityOfType.stream()
                .flatMap(this::streamRow)
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
     *
     * @param types type strings to check whether a row doesn't contain an entity like it
     * @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
     */
    public Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {

        List<Integer> rowsWithNoEntityOfType = IntStream.range(0, numberOfRows).boxed()
                .filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream()
                        .anyMatch(typeToCheck -> typeToCheck.equals(existingType))))
                .toList();

        return rowsWithNoEntityOfType.stream()
                .flatMap(this::streamRow)
                .map(TableCell::getEntities)
                .flatMap(Collection::stream);
    }


    /**
     * Streams the distinct type strings of all entities attached to the cells of the provided row.
     */
    private Stream<String> streamEntityTypesInRow(Integer rowNumber) {

        return streamRow(rowNumber).map(TableCell::getEntities)
                .flatMap(Collection::stream)
                .map(RedactionEntity::getType)
                .distinct();
    }


    /**
     * Returns a TableCell at the provided row and column location.
     *
     * @param row int representing the row, must be at least 0 and smaller than numberOfRows
     * @param col int representing the col, must be at least 0 and smaller than numberOfCols
     * @return TableCell at the provided location in the table
     * @throws IllegalArgumentException if row or col lies outside of the table
     */
    public TableCell getCell(int row, int col) {

        // Indices are zero-based. Without the upper bound being exclusive, col == numberOfCols would
        // silently resolve to the first cell of the following row.
        if (row < 0 || row >= numberOfRows || col < 0 || col >= numberOfCols) {
            throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols));
        }
        int idx = row * numberOfCols + col;
        return (TableCell) documentTree.getEntryById(treeId).getChildren()
                .get(idx).getNode();
    }


    /**
     * Streams all TableCells in this Table row-wise.
     *
     * @return Stream of all TableCells
     */
    public Stream<TableCell> streamTableCells() {

        return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node);
    }


    /**
     * Streams all non-header TableCells located in a column whose header cell contains the provided header string.
     *
     * @param header string the header cell has to contain
     * @return Stream of all TableCells which have the provided header
     */
    public Stream<TableCell> streamTableCellsWithHeader(String header) {

        return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header))
                .map(TableCell::getCol)
                .flatMap(this::streamCol)
                .filter(tableCellNode -> !tableCellNode.isHeader());
    }


    /**
     * Streams all TableCells belonging to the provided column from top down.
     *
     * @param col int representing the column
     * @return Stream of all TableCell in the provided column
     */
    public Stream<TableCell> streamCol(int col) {

        return IntStream.range(0, numberOfRows).boxed()
                .map(row -> getCell(row, col));
    }


    /**
     * Streams all TableCells belonging to the provided row from left to right.
     *
     * @param row int representing the row
     * @return Stream of all TableCell in the provided row
     */
    public Stream<TableCell> streamRow(int row) {

        return IntStream.range(0, numberOfCols).boxed()
                .map(col -> getCell(row, col));
    }


    /**
     * Streams all TableCells row-wise and filters them with header == true.
     *
     * @return Stream of all TableCells with header == true
     */
    public Stream<TableCell> streamHeaders() {

        return streamTableCells().filter(TableCell::isHeader);
    }


    /**
     * Streams all TableCells of the provided row and column and filters them with header == true.
     *
     * @param row int representing the row
     * @param col int representing the column
     * @return Stream of all TableCells with header == true in the provided row or col
     */
    public Stream<TableCell> streamHeadersForCell(int row, int col) {

        return Stream.concat(streamRow(row), streamCol(col))
                .filter(TableCell::isHeader);
    }


    /**
     * Streams all Headers and checks if any equal the provided string after stripping surrounding whitespace from the header text.
     *
     * @param header string to check the headers for
     * @return true, if at least one header equals the provided string
     */
    public boolean hasHeader(String header) {

        return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header));
    }


    /**
     * Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value.
     *
     * @param header string to find header cells
     * @param value  string to check cells with provided header
     * @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value
     */
    public boolean hasRowWithHeaderAndValue(String header, String value) {

        return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value));
    }


    /**
     * Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
     *
     * @param header string to find header cells
     * @param values List of strings to check cells with provided header
     * @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values.
     */
    public boolean hasRowWithHeaderAndAnyValue(String header, List<String> values) {

        return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values));
    }


    /**
     * Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
     * The rows are determined from the TableCells among the intersecting nodes of the provided entity.
     *
     * @param type            the type of entities to search for
     * @param redactionEntity the entity, which appears in the row to search
     * @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
     */
    public List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity) {

        return redactionEntity.getIntersectingNodes()
                .stream()
                .filter(node -> node instanceof TableCell)
                .map(node -> (TableCell) node)
                .flatMap(tableCellNode -> streamRow(tableCellNode.getRow()))
                .map(cell -> cell.getEntitiesOfType(type))
                .flatMap(Collection::stream)
                .toList();
    }


    /**
     * @return always {@link NodeType#TABLE}
     */
    @Override
    public NodeType getType() {

        return NodeType.TABLE;
    }


    /**
     * Collects the leaf text blocks of all leaf sub nodes into a single TextBlock.
     * The result is cached, so later structural changes are not reflected.
     *
     * @return the combined TextBlock of this table
     */
    @Override
    public TextBlock getTextBlock() {

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf)
                    .map(SemanticNode::getLeafTextBlock)
                    .collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /**
     * @return "&lt;treeId&gt;: TABLE: #cols: &lt;n&gt;, #rows: &lt;m&gt;, &lt;summary of the text block&gt;"
     */
    @Override
    public String toString() {

        // plain concatenation keeps this usable for logging while treeId is still unset
        return treeId + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary();
    }


    /**
     * Returns the bounding box per page, computing it through the interface default on the first call only.
     *
     * @return cached map from page to bounding box
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        if (bBoxCache == null) {
            bBoxCache = SemanticNode.super.getBBox();
        }
        return bBoxCache;
    }

}

View File

@ -1,102 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * Cell of a table. While the cell has no children in the {@link DocumentTree} it acts as a leaf and exposes its
 * own leaf text block; otherwise its text is assembled from the leaf nodes underneath it.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCell implements GenericSemanticNode {

    /** Layout engines that contributed to this node; starts out with {@link LayoutEngine#ALGORITHM}. */
    @Builder.Default
    Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));

    /** Identifier of this node inside the {@link DocumentTree}. */
    List<Integer> treeId;

    int row;
    int col;
    boolean header;

    /** The rectangle reported for every page by {@link #getBBox()}. */
    Rectangle2D bBox;

    /** Text returned while the cell is a leaf. */
    TextBlock leafTextBlock;

    /** Lazily built text used once the cell has children; {@code null} until first needed. */
    TextBlock textBlock;

    @EqualsAndHashCode.Exclude
    DocumentTree documentTree;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<RedactionEntity> entities = new HashSet<>();


    /**
     * Builds a fresh map that assigns the stored rectangle of this cell to each page the cell appears on.
     *
     * @return map from page to the bounding box of this cell
     */
    @Override
    public Map<Page, Rectangle2D> getBBox() {

        Map<Page, Rectangle2D> result = new HashMap<>();
        for (Page cellPage : getPages()) {
            result.put(cellPage, bBox);
        }
        return result;
    }


    /**
     * @return always {@link NodeType#TABLE_CELL}
     */
    @Override
    public NodeType getType() {

        return NodeType.TABLE_CELL;
    }


    /**
     * @return true, if the tree entry of this cell has no children
     */
    @Override
    public boolean isLeaf() {

        return getDocumentTree().getEntryById(getTreeId()).getChildren().isEmpty();
    }


    /**
     * Returns the leaf text block for a leaf cell; otherwise the text collected from the sub nodes,
     * which is built on first access and cached.
     *
     * @return the TextBlock of this cell
     */
    @Override
    public TextBlock getTextBlock() {

        if (!isLeaf()) {
            if (textBlock == null) {
                textBlock = buildTextBlock();
            }
            return textBlock;
        }
        return leafTextBlock;
    }


    /**
     * Concatenates the leaf text blocks of all leaf nodes underneath this cell.
     */
    private TextBlock buildTextBlock() {

        Stream<TextBlock> leafTexts = streamAllSubNodes().filter(SemanticNode::isLeaf)
                .map(SemanticNode::getLeafTextBlock);
        return leafTexts.collect(new TextBlockCollector());
    }


    /**
     * @return "&lt;treeId&gt;: TABLE_CELL: &lt;summary of the text block&gt;"
     */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", treeId, NodeType.TABLE_CELL, this.getTextBlock().buildSummary());
    }

}

View File

@ -1,232 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {
// Identifier of this text block; the factory methods below fill it from the supplied index or the stored id.
Long id;
// Index of this text block on its page.
Integer numberOnPage;
// The page this text block is assigned to.
Page page;
//string coordinates
// Start and end string offsets of searchText; the factories build it as [offset, offset + searchText.length()).
Boundary boundary;
String searchText;
// One entry per line break; numberOfLines() reports lineBreaks.size() + 1.
// NOTE(review): presumably string indices relative to searchText - confirm against the producers.
@Builder.Default
List<Integer> lineBreaks = new ArrayList<>();
@Builder.Default
List<Boundary> boldTextBoundaries = new ArrayList<>();
@Builder.Default
List<Boundary> italicTextBoundaries = new ArrayList<>();
String orientation;
int textDirection;
//position coordinates
// NOTE(review): judging by the name, maps each string index to an index into positions - confirm.
@Builder.Default
List<Integer> stringIdxToPositionIdx = new ArrayList<>();
@Builder.Default
List<Rectangle2D> positions = new ArrayList<>();
// The node owning this text block; left out of equals/hashCode.
@EqualsAndHashCode.Exclude
SemanticNode parent;
@Override
public int numberOfLines() {
return lineBreaks.size() + 1;
}
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
List<Integer> lineBreaks,
List<Boundary> boldTextBoundaries,
List<Boundary> italicTextBoundaries,
List<Rectangle2D> positions,
List<Integer> stringIdxToPositionIdx,
long idx,
SemanticNode parent,
int numberOnPage,
Page page,
int offset,
String orientation,
int textDirection) {
return AtomicTextBlock.builder()
.id(idx)
.parent(parent)
.searchText(searchText)
.numberOnPage(numberOnPage)
.page(page)
.lineBreaks(lineBreaks)
.boldTextBoundaries(boldTextBoundaries)
.italicTextBoundaries(italicTextBoundaries)
.positions(positions)
.stringIdxToPositionIdx(stringIdxToPositionIdx)
.boundary(new Boundary(offset, offset + searchText.length()))
.textDirection(textDirection)
.orientation(orientation)
.build();
}
public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) {
return AtomicTextBlock.builder()
.id(textBlockIdx)
.boundary(new Boundary(stringOffset, stringOffset))
.searchText("")
.page(page)
.numberOnPage(numberOnPage)
.parent(parent)
.build();
}
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
DocumentPositionData documentPositionData,
SemanticNode parent,
Page page) {
return AtomicTextBlock.builder()
.id(documentTextData.getId())
.numberOnPage(documentTextData.getNumberOnPage())
.page(page)
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
.searchText(documentTextData.getSearchText())
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
.positions(toRectangle2DList(documentPositionData.getPositions()))
.parent(parent)
.build();
}
private static List<Rectangle2D> toRectangle2DList(float[][] positions) {
return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
}
public CharSequence getLine(int lineNumber) {
if (lineNumber >= numberOfLines() || lineNumber < 0) {
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
}
if (lineNumber == 0) {
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
} else if (lineNumber == numberOfLines() - 1) {
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
}
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
}
@Override
public List<AtomicTextBlock> getAtomicTextBlocks() {
return List.of(this);
}
@Override
public int getNextLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
.findFirst() //
.orElse(searchText.length()) + boundary.start();
}
@Override
public int getPreviousLinebreak(int fromIndex) {
return lineBreaks.stream()//
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
.reduce((a, b) -> b)//
.orElse(0) + boundary.start();
}
@Override
public Rectangle2D getPosition(int stringIdx) {
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
}
@Override
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
if (!containsBoundary(stringBoundary)) {
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
}
if (stringBoundary.length() == 0) {
return Collections.emptyList();
}
int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
if (stringBoundary.end() == this.boundary.end()) {
return positions.subList(startPositionIdx, positions.size());
}
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
}
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
.stream()
.map(this::getPositions)
.map(RectangleTransformations::rectangleBBoxWithGaps)
.flatMap(Collection::stream)
.toList();
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
rectanglePerLinePerPage.put(page, rectanglesPerLine);
return rectanglePerLinePerPage;
}
private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
}
@Override
public String toString() {
return searchText;
}
}

View File

@ -1,220 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * A {@link TextBlock} made of consecutive {@link AtomicTextBlock}s. The boundary spans from the
 * start of the first to the end of the last atomic block; an empty instance has the boundary
 * {@code (-1, -1)}. Most queries are delegated to the atomic block(s) covering the requested
 * index or range.
 */
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    List<AtomicTextBlock> atomicTextBlocks;
    /** Concatenated search text, built on first access and reset to {@code null} by {@link #concat(TextBlock)}. */
    String searchText;
    Boundary boundary;


    public static ConcatenatedTextBlock empty() {

        return new ConcatenatedTextBlock(Collections.emptyList());
    }


    /**
     * @param atomicTextBlocks consecutive blocks to concatenate; may be empty
     * @throws UnsupportedOperationException if the blocks are not consecutive
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            boundary = new Boundary(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());

        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
    }


    /**
     * Appends the atomic blocks of {@code textBlock} to this block and extends the boundary.
     *
     * @param textBlock block that must start exactly where this block ends, unless this block is
     *                  still empty or {@code textBlock} contains no atomic blocks
     * @return this instance
     * @throws UnsupportedOperationException if the two blocks are not consecutive
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {

        if (this.atomicTextBlocks.isEmpty()) {
            boundary.setStart(textBlock.getBoundary().start());
            boundary.setEnd(textBlock.getBoundary().end());
        } else if (textBlock.getAtomicTextBlocks().isEmpty()) {
            // Appending a block without content (e.g. ConcatenatedTextBlock.empty()) changes nothing;
            // its boundary carries no position information that could be checked for adjacency.
            return this;
        } else if (boundary.end() != textBlock.getBoundary().start()) {
            throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
        boundary.setEnd(textBlock.getBoundary().end());
        // invalidate the cached concatenation
        this.searchText = null;
        return this;
    }


    /**
     * @throws IndexOutOfBoundsException if no atomic block contains {@code stringIdx}
     */
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {

        return atomicTextBlocks.stream()
                .filter(textBlock -> textBlock.getBoundary().contains(stringIdx))
                .findAny()
                .orElseThrow(() -> new IndexOutOfBoundsException(format("index %d is out of bounds for %s", stringIdx, this.boundary)));
    }


    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {

        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
    }


    @Override
    public String getSearchText() {

        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }


    // NOTE(review): this sums the line break counts, while AtomicTextBlock.numberOfLines() returns
    // line breaks + 1 - confirm which definition callers of the concatenated block expect.
    @Override
    public int numberOfLines() {

        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
    }


    @Override
    public int getNextLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }


    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }


    // NOTE(review): the returned offsets are the atomic blocks' own values, which AtomicTextBlock
    // treats as relative to each atomic block's start - they are not shifted to a common origin here.
    @Override
    public List<Integer> getLineBreaks() {

        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
    }


    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }


    /**
     * Collects the positions of all atomic blocks intersecting {@code stringBoundary}: the tail of
     * the first block, all positions of the blocks in between and the head of the last block.
     *
     * @throws IndexOutOfBoundsException if no atomic block intersects {@code stringBoundary}
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.isEmpty()) {
            throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }

        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return positions;
    }


    /**
     * Same traversal as {@link #getPositions(Boundary)}, but with the per-line rectangles of each
     * atomic block grouped by page; rectangles of blocks on the same page are appended in order.
     *
     * @throws IndexOutOfBoundsException if no atomic block intersects {@code stringBoundary}
     */
    @Override
    public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.isEmpty()) {
            throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositionsPerPage(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
        }

        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
                                                                        lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return rectanglesPerLinePerPage;
    }


    /** Returns a new map containing both inputs; lists for the same page are concatenated (map1 first). */
    private Map<Page, List<Rectangle2D>> mergeEntityPositionsWithSamePageNode(Map<Page, List<Rectangle2D>> map1, Map<Page, List<Rectangle2D>> map2) {

        Map<Page, List<Rectangle2D>> mergedMap = new HashMap<>(map1);
        map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList()));
        return mergedMap;
    }


    @Override
    public String toString() {

        return getSearchText();
    }


    @Override
    public List<Boundary> getBoldTextBoundaries() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
    }


    @Override
    public List<Boundary> getItalicTextBoundaries() {

        return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
    }


    /** @return the orientation of the first atomic block, or an empty string if there is none */
    @Override
    public String getOrientation() {

        if (atomicTextBlocks.isEmpty()) {
            return "";
        }
        return atomicTextBlocks.get(0).getOrientation();
    }


    /** @return the text direction of the first atomic block, or 0 if there is none */
    @Override
    public int getTextDirection() {

        if (atomicTextBlocks.isEmpty()) {
            return 0;
        }
        return atomicTextBlocks.get(0).getTextDirection();
    }

}

View File

@ -1,148 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
/**
 * Text with a string range ({@link #getBoundary()}) and character positions.
 * <p>
 * The index-based default methods ({@code indexOf}, {@code subSequence}, {@code charAt}) take and
 * return indices in the coordinate system of {@link #getBoundary()}: they subtract
 * {@code getBoundary().start()} before accessing {@link #getSearchText()}.
 */
public interface TextBlock extends CharSequence {

    String getSearchText();


    List<AtomicTextBlock> getAtomicTextBlocks();


    List<Boundary> getBoldTextBoundaries();


    List<Boundary> getItalicTextBoundaries();


    String getOrientation();


    int getTextDirection();


    Boundary getBoundary();


    int getNextLinebreak(int fromIndex);


    int getPreviousLinebreak(int fromIndex);


    List<Integer> getLineBreaks();


    Rectangle2D getPosition(int stringIdx);


    List<Rectangle2D> getPositions(Boundary stringBoundary);


    Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);


    int numberOfLines();


    /** Searches from the start of this block; see {@link #indexOf(String, int)}. */
    default int indexOf(String searchTerm) {

        int blockStart = getBoundary().start();
        return indexOf(searchTerm, blockStart);
    }


    /** @return the pages of all atomic text blocks, as an unmodifiable set */
    default Set<Page> getPages() {

        return getAtomicTextBlocks().stream()
                .map(atomicTextBlock -> atomicTextBlock.getPage())
                .collect(Collectors.toUnmodifiableSet());
    }


    /** @return the pages of the atomic text blocks intersecting {@code boundary}, as an unmodifiable set */
    default Set<Page> getPages(Boundary boundary) {

        return getAtomicTextBlocks().stream()
                .filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary))
                .map(atomicTextBlock -> atomicTextBlock.getPage())
                .collect(Collectors.toUnmodifiableSet());
    }


    /**
     * @param startOffset index (in boundary coordinates) to start searching from
     * @return the index of the first occurrence in boundary coordinates, or -1 if not found
     */
    default int indexOf(String searchTerm, int startOffset) {

        int relativeHit = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
        return relativeHit == -1 ? -1 : relativeHit + getBoundary().start();
    }


    /** @return the text from the block start up to the next line break after the start */
    default CharSequence getFirstLine() {

        int firstIndex = getBoundary().start();
        int firstLineEnd = getNextLinebreak(getBoundary().start());
        return subSequence(firstIndex, firstLineEnd);
    }


    /**
     * @throws IllegalArgumentException if the end of {@code boundary} lies before its start
     */
    default boolean containsBoundary(Boundary boundary) {

        boolean endBeforeStart = boundary.end() < boundary.start();
        if (endBeforeStart) {
            throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
        }
        return getBoundary().contains(boundary);
    }


    default boolean containsIndex(int stringIndex) {

        Boundary ownBoundary = getBoundary();
        return ownBoundary.contains(stringIndex);
    }


    default CharSequence subSequence(Boundary boundary) {

        int from = boundary.start();
        int to = boundary.end();
        return subSequence(from, to);
    }


    /** @return at most the first four space-separated tokens of the search text, joined by single spaces */
    default String buildSummary() {

        String[] tokens = getSearchText().split(" ");
        List<String> leadingTokens = Arrays.asList(tokens).subList(0, Math.min(tokens.length, 4));
        return String.join(" ", leadingTokens);
    }


    @Override
    default CharSequence subSequence(int start, int end) {

        String text = getSearchText();
        return text.substring(start - getBoundary().start(), end - getBoundary().start());
    }


    @Override
    default int length() {

        Boundary ownBoundary = getBoundary();
        return ownBoundary.length();
    }


    @Override
    default char charAt(int index) {

        String text = getSearchText();
        return text.charAt(index - getBoundary().start());
    }

}

View File

@ -1,49 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import lombok.NoArgsConstructor;
/**
 * Collector that concatenates a stream of consecutive {@link TextBlock}s into one
 * {@link ConcatenatedTextBlock}.
 * <p>
 * The result container is mutated by {@link ConcatenatedTextBlock#concat(TextBlock)}, which
 * requires the blocks to arrive in order and is not synchronized. The collector therefore must
 * not advertise {@link Characteristics#CONCURRENT}.
 */
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {

    @Override
    public Supplier<ConcatenatedTextBlock> supplier() {

        return ConcatenatedTextBlock::empty;
    }


    @Override
    public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {

        return ConcatenatedTextBlock::concat;
    }


    @Override
    public BinaryOperator<ConcatenatedTextBlock> combiner() {

        return ConcatenatedTextBlock::concat;
    }


    @Override
    public Function<ConcatenatedTextBlock, TextBlock> finisher() {

        return a -> a;
    }


    /**
     * Only {@code IDENTITY_FINISH}: the accumulation container is returned as the result.
     * {@code CONCURRENT} is deliberately absent because the container is neither thread-safe
     * nor independent of encounter order.
     */
    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }

}

View File

@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.image;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -10,8 +10,8 @@ import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
@AllArgsConstructor
@RequiredArgsConstructor
public class ClassifiedImage {
@NonNull
@ -20,9 +20,18 @@ public class ClassifiedImage {
private ImageType imageType;
private boolean sourceByAi;
private boolean isAppendedToSection;
@NonNull
private boolean hasTransparency;
@NonNull
private int page;
private String representation;
public ClassifiedImage(@NonNull Rectangle2D position, @NonNull ImageType imageType, boolean hasTransparency, int page, String representation) {
this.position = position;
this.imageType = imageType;
this.hasTransparency = hasTransparency;
this.page = page;
this.representation = representation;
}
}

View File

@ -0,0 +1,229 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Service
@Slf4j
public class OutlineExtractorService {
private static final String PDDESTINATION_TYPE_FIT = "Fit";
private static final String PDDESTINATION_TYPE_FIT_B = "FitB";
private static final String PDDESTINATION_TYPE_FIT_H = "FitH";
private static final String PDDESTINATION_TYPE_FIT_V = "FitV";
private static final String PDDESTINATION_TYPE_FIT_R = "FitR";
private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH";
private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV";
private static final String PDDESTINATION_TYPE_XYZ = "XYZ";
@SneakyThrows
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
}
}
return new OutlineObjectTree(rootNodes);
}
@SneakyThrows
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
if (outlineObject.isPresent()) {
for (var child : item.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
}
}
return outlineObject;
}
// if the structure elements are processed beforehand, another case can be handled here as well:
// outline objects can reference structure elements (see pdf documentation)
@SneakyThrows
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
String title = item.getTitle();
PDPage page;
try {
// Can throw: "Error: can't convert to Destination COSArray" for some OCR'd PDFs
page = item.findDestinationPage(document);
if (page == null) {
return Optional.empty();
}
} catch (IOException e) {
log.info(String.format("Error occurred during position resolution for outline item with title %s: " + e, title));
return Optional.empty();
}
int pageNumber = document.getPages().indexOf(page) + 1;
AffineTransform userSpaceToPageCoords = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(PageInformation.fromPDPage(pageNumber, page));
Optional<Point2D> outlinePosition = Optional.empty();
try {
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
PDDestinationNameTreeNode destinations = null;
if (names != null) {
destinations = names.getDests();
}
PDDestination destination = item.getDestination();
if (destination != null) {
outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject());
}
if (outlinePosition.isEmpty()) {
PDAction action = item.getAction();
if (action != null) {
outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject());
}
}
} catch (Exception e) {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
}
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title,
pageNumber,
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
}
private static Point2D transformPointToPageCoords(Optional<Point2D> outlinePosition, AffineTransform userSpaceToPageCoords) {
return outlinePosition.map(point -> userSpaceToPageCoords.transform(point, null)).orElse(null);
}
@SneakyThrows
private static Optional<Point2D> extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) {
if (isGoToAction(cosDictionary)) {
COSBase cosBase = cosDictionary.getItem(COSName.D);
return getLocationFromCOSBase(destinations, cosBase);
}
return Optional.empty();
}
private static Optional<Point2D> getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
if (cosBase != null) {
if (cosBase instanceof COSArray cosArray) {
return getLocationFromCosArray(cosArray);
}
if (cosBase instanceof COSString cosString) {
String destinationName = cosString.getString();
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
return getLocationFromCosArray(cosArray);
}
}
return Optional.empty();
}
private static Optional<Point2D> getLocationFromCosArray(COSArray cosArray) {
boolean located = false;
float x = 0;
float y = 0;
try {
PDDestination destination = PDDestination.create(cosArray);
COSName type = (COSName) cosArray.getObject(1);
String typeString = type.getName();
switch (typeString) {
case PDDESTINATION_TYPE_FIT_V:
case PDDESTINATION_TYPE_FIT_BV:
PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination;
x = fitHeightDestination.getLeft();
located = true;
break;
case PDDESTINATION_TYPE_FIT_R:
PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination;
x = fitRectangleDestination.getLeft();
y = fitRectangleDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_FIT_H:
case PDDESTINATION_TYPE_FIT_BH:
PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination;
y = fitWidthDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_XYZ:
PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination;
x = xyzDestination.getLeft();
y = xyzDestination.getTop();
located = true;
break;
case PDDESTINATION_TYPE_FIT:
case PDDESTINATION_TYPE_FIT_B:
default:
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty();
}
private static boolean isGoToAction(COSDictionary cosDictionary) {
return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto");
}
}

View File

@ -0,0 +1,77 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.Point2D;
import java.util.Optional;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import lombok.Getter;
import lombok.Setter;
/**
 * One entry of a PDF outline: its title, target page, depth in the outline tree and, if known,
 * the target point on the page.
 */
public class OutlineObject {

    @Getter
    private final String title;
    @Getter
    private final int pageNumber;
    @Getter
    private final int treeDepth;

    private Point2D point; // java coordinates, (0, 0) is always top left

    @Getter
    @Setter
    private boolean found;


    /**
     * @param point2D target point, may be {@code null} if unknown
     * @param depth   depth of the entry in the outline tree
     */
    public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {

        this.point = point2D;
        this.treeDepth = depth;
        this.pageNumber = pageNumber;
        this.title = title;
    }


    @Override
    public String toString() {

        return "OutlineObject{title='" + title + "'}";
    }


    /** @return the target point, empty if none is known or it has been reset */
    public Optional<Point2D> getPoint() {

        return Optional.ofNullable(point);
    }


    /** @return true if no point is known, or the point's y is not greater than the box's max y */
    public boolean isAbove(BoundingBox boundingBox) {

        return point == null || point.getY() <= boundingBox.getMaxY();
    }


    /**
     * @return 0 if no point is known or the point lies inside the box; otherwise the Euclidean
     * norm of the smaller horizontal edge distance and the smaller vertical edge distance
     */
    public double distance(BoundingBox boundingBox) {

        if (point == null || boundingBox.getBBox().contains(point)) {
            return 0;
        }

        double toLeftEdge = Math.abs(boundingBox.getMinX() - point.getX());
        double toRightEdge = Math.abs(boundingBox.getMaxX() - point.getX());
        double toMinYEdge = Math.abs(boundingBox.getMinY() - point.getY());
        double toMaxYEdge = Math.abs(boundingBox.getMaxY() - point.getY());

        double deltaX = Math.min(toLeftEdge, toRightEdge);
        double deltaY = Math.min(toMinYEdge, toMaxYEdge);

        return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
    }


    /** Forgets the target point; {@link #getPoint()} is empty afterwards. */
    public void resetPoint() {

        point = null;
    }

}

View File

@ -0,0 +1,66 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.Data;
import lombok.RequiredArgsConstructor;
/**
 * The outline of a document as a tree, plus a flat index of all outline objects grouped by page
 * number. The index is filled once, when the tree is constructed from its root nodes.
 */
@Data
@RequiredArgsConstructor
public class OutlineObjectTree {

    private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();

    private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();


    public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {

        this.rootNodes = rootNodes;
        flattenNodesAndGroupByPage(rootNodes);
    }


    /** Depth-first walk that files every node's outline object under its page number. */
    private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {

        for (OutlineObjectTreeNode treeNode : outlineObjectTreeNodes) {
            OutlineObject outlineObject = treeNode.getOutlineObject();
            outlineObjectsPerPage.computeIfAbsent(outlineObject.getPageNumber(), page -> new ArrayList<>()).add(outlineObject);
            // iterating an empty child list is a no-op, so no emptiness check is needed
            flattenNodesAndGroupByPage(treeNode.getChildren());
        }
    }


    /** Renders one title per line, prefixed by one space per tree level. */
    @Override
    public String toString() {

        StringBuilder out = new StringBuilder("OutlineObjectTree(\n");
        rootNodes.forEach(root -> buildString(root, out, 1));
        return out.append(")").toString();
    }


    private void buildString(OutlineObjectTreeNode node, StringBuilder sb, int depth) {

        sb.append(" ".repeat(depth)).append(node.getOutlineObject().getTitle()).append("\n");
        node.getChildren().forEach(child -> buildString(child, sb, depth + 1));
    }

}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
/**
 * Node of an {@link OutlineObjectTree}: one outline object and its child nodes in insertion order.
 */
@Data
public class OutlineObjectTreeNode {

    private OutlineObject outlineObject;

    private List<OutlineObjectTreeNode> children = new ArrayList<>();


    public OutlineObjectTreeNode(OutlineObject outlineObject) {

        this.outlineObject = outlineObject;
    }


    /** Appends a child node; despite its name, the parameter is a tree node, not an outline object. */
    public void addChild(OutlineObjectTreeNode outlineObject) {

        this.children.add(outlineObject);
    }


    @Override
    public String toString() {

        return "OutlineObjectTreeNode{outlineObject=" + outlineObject + "}";
    }

}

View File

@ -0,0 +1,136 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import org.springframework.lang.NonNull;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.RequiredArgsConstructor;
/**
 * Tree of section entries. Iterating the tree visits every entry depth-first, each parent before
 * its children.
 */
@Data
@RequiredArgsConstructor
public class SectionTree implements Iterable<SectionTreeEntry> {

    private List<SectionTreeEntry> mainSections = new ArrayList<>();


    public SectionTree(List<SectionTreeEntry> mainSections) {

        this.mainSections = mainSections;
    }


    /** @return the headlines of all entries, depth-first with parents before children */
    public List<TextPageBlock> getAllTextPageBlocks() {

        List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
        for (SectionTreeEntry item : mainSections) {
            collectTextPageBlocks(item, allTextPageBlocks);
        }
        return allTextPageBlocks;
    }


    private void collectTextPageBlocks(SectionTreeEntry item, List<TextPageBlock> textPageBlocks) {

        textPageBlocks.add(item.getHeadline());
        for (SectionTreeEntry child : item.getChildren()) {
            collectTextPageBlocks(child, textPageBlocks);
        }
    }


    /** @return all entries, depth-first with parents before children */
    public List<SectionTreeEntry> getAllTableOfContentItems() {

        List<SectionTreeEntry> allItems = new ArrayList<>();
        for (SectionTreeEntry item : mainSections) {
            collectTableOfContentItems(item, allItems);
        }
        return allItems;
    }


    private void collectTableOfContentItems(SectionTreeEntry item, List<SectionTreeEntry> allItems) {

        allItems.add(item);
        for (SectionTreeEntry child : item.getChildren()) {
            collectTableOfContentItems(child, allItems);
        }
    }


    private boolean containsBlock(TextPageBlock block) {

        for (SectionTreeEntry existingItem : this.getMainSections()) {
            if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
                return true;
            }
        }
        return false;
    }


    private boolean containsItem(SectionTreeEntry tocItem) {

        for (SectionTreeEntry existingItem : this.getMainSections()) {
            if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
                return true;
            }
        }
        return false;
    }


    @Override
    public @NonNull Iterator<SectionTreeEntry> iterator() {

        return new SectionTreeEntryIterator(mainSections);
    }


    /**
     * Depth-first iterator. The stack holds one iterator per tree level currently being
     * traversed; exhausted iterators are popped before the next element is looked up.
     */
    private static class SectionTreeEntryIterator implements Iterator<SectionTreeEntry> {

        private final Stack<Iterator<SectionTreeEntry>> stack = new Stack<>();


        SectionTreeEntryIterator(List<SectionTreeEntry> mainSections) {

            stack.push(mainSections.iterator());
        }


        @Override
        public boolean hasNext() {

            ensureStackTopIsCurrent();
            return !stack.isEmpty() && stack.peek().hasNext();
        }


        /**
         * @throws java.util.NoSuchElementException if all entries have been returned
         */
        @Override
        public SectionTreeEntry next() {

            // Fail as the Iterator contract requires instead of letting peek() on the emptied
            // stack throw an EmptyStackException.
            if (!hasNext()) {
                throw new java.util.NoSuchElementException("no further section tree entries");
            }
            SectionTreeEntry currentItem = stack.peek().next();
            // descend into the children before continuing with the siblings
            if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
                stack.push(currentItem.getChildren()
                                   .iterator());
            }
            return currentItem;
        }


        private void ensureStackTopIsCurrent() {

            while (!stack.isEmpty() && !stack.peek().hasNext()) {
                stack.pop();
            }
        }

    }

}

View File

@ -0,0 +1,82 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.TABLE_OF_CONTENTS_HEADLINE;
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import io.micrometer.observation.annotation.Observed;
import lombok.extern.slf4j.Slf4j;
/**
 * Builds the {@link SectionTree} of a document from the headline blocks found on its pages.
 */
@Service
@Slf4j
public class SectionTreeBuilderService {

    /**
     * Arranges all headlines of the document, in page order, into a tree according to their
     * headline numbers.
     * <p>
     * A headline becomes a main section when no headline with a smaller number has been seen
     * since the last main section started, or when no entry is recorded at the depth chosen for
     * its parent. Otherwise it is attached to the most recent entry at the chosen parent depth.
     *
     * @param classificationDocument the classified document to read the headlines from
     * @return the section tree; it has no main sections if the document has no headlines
     */
    @Observed(name = "OutlineValidationService", contextualName = "create-toc")
    public SectionTree createSectionTree(ClassificationDocument classificationDocument) {

        List<TextPageBlock> headlines = extractHeadlines(classificationDocument);

        List<SectionTreeEntry> mainSections = new ArrayList<>();
        // most recent entry per headline number within the current main section
        Map<Integer, SectionTreeEntry> lastItemsPerDepth = new HashMap<>();
        SectionTreeEntry last = null;
        // headline numbers seen within the current main section
        TreeSet<Integer> depths = new TreeSet<>();

        for (TextPageBlock current : headlines) {
            int currentDepth = getHeadlineNumber(current.getClassification());
            // largest headline number seen so far that is smaller than the current one
            Integer parentDepth = depths.floor(currentDepth - 1);

            var tocItem = new SectionTreeEntry(current);

            SectionTreeEntry parent = null;
            if (parentDepth != null) {
                assert last != null;
                int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());

                if (last.getHeadline().getClassification().equals(TABLE_OF_CONTENTS_HEADLINE) && !current.getClassification().equals(TABLE_OF_CONTENTS_HEADLINE)) {
                    // headline after toc should always start a main section
                    parentDepth = 1;
                } else if (lastDepth < parentDepth) {
                    parentDepth = lastDepth;
                } else if (lastDepth == currentDepth && last.getParent() != null) {
                    parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
                }

                parent = lastItemsPerDepth.get(parentDepth);
            }

            if (parent == null) {
                // Either nothing can serve as a parent, or no entry is recorded at the chosen
                // depth. Start a new main section rather than dereferencing a missing parent.
                mainSections.add(tocItem);
                lastItemsPerDepth = new HashMap<>();
                depths = new TreeSet<>();
            } else {
                parent.addChild(tocItem);
            }

            last = tocItem;
            lastItemsPerDepth.put(currentDepth, tocItem);
            depths.add(currentDepth);
        }

        return new SectionTree(mainSections);
    }


    /** @return all text page blocks classified as headline, in page order */
    private static List<TextPageBlock> extractHeadlines(ClassificationDocument classificationDocument) {

        return classificationDocument.getPages()
                .stream()
                .flatMap(classificationPage -> classificationPage.getTextBlocks()
                        .stream()
                        .filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
                        .map(tb -> (TextPageBlock) tb))
                .toList();
    }

}

View File

@ -0,0 +1,252 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.extern.slf4j.Slf4j;
/**
 * Fills the entries of a document's section tree with the page blocks and images that belong to them,
 * collects the header and footer blocks of every page, and copies table header information onto
 * follow-up tables that have none.
 */
@Slf4j
@Service
public class SectionTreeEnhancementService {
/**
 * Walks all pages of the document in order and assigns each classified block and each image to a section.
 * <p>
 * Blocks and images found before the first matched headline are gathered in an extra entry without
 * headline that is inserted as the first main section. Header and footer blocks are stored on the
 * document instead of a section.
 *
 * @param document the document to enrich; its section tree, headers and footers are modified
 */
public void assignSectionBlocksAndImages(ClassificationDocument document) {
SectionTree toc = document.getSectionTree();
Iterator<SectionTreeEntry> iterator = toc.iterator();
// The next entry whose headline is still to be found in the page blocks.
SectionTreeEntry currentTOCItem = null;
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
// Content that appears before the first matched headline.
List<AbstractPageBlock> startBlocks = new ArrayList<>();
List<ClassifiedImage> startImages = new ArrayList<>();
SectionTreeEntry currentSection = null;
boolean foundFirstHeadline = false;
List<ClassificationHeader> headers = new ArrayList<>();
List<ClassificationFooter> footers = new ArrayList<>();
TablePageBlock previousTable = null;
// Entries matched on the most recent page that contained at least one headline.
List<SectionTreeEntry> lastFoundTOCItems = new ArrayList<>();
for (ClassificationPage page : document.getPages()) {
List<SectionTreeEntry> currentPageTOCItems = new ArrayList<>();
List<TextPageBlock> header = new ArrayList<>();
List<TextPageBlock> footer = new ArrayList<>();
for (AbstractPageBlock current : page.getTextBlocks()) {
// Unclassified blocks are ignored entirely.
if (current.getClassification() == null) {
continue;
}
current.setPage(page.getPageNumber());
if (current.getClassification().equals(PageBlockType.HEADER)) {
header.add((TextPageBlock) current);
continue;
}
if (current.getClassification().equals(PageBlockType.FOOTER)) {
footer.add((TextPageBlock) current);
continue;
}
if (current instanceof TablePageBlock table) {
if (previousTable != null) {
mergeTableMetadata(table, previousTable);
}
previousTable = table;
}
// A headline is recognised by comparing its text with the text of the expected entry's headline.
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
if (!foundFirstHeadline) {
foundFirstHeadline = true;
}
currentSection = currentTOCItem;
currentTOCItem.getSectionBlocks().add(current);
currentPageTOCItems.add(currentTOCItem);
// When the iterator is exhausted, currentTOCItem keeps pointing at the last entry.
if (iterator.hasNext()) {
currentTOCItem = iterator.next();
}
} else if (!foundFirstHeadline) {
startBlocks.add(current);
} else {
currentSection.getSectionBlocks().add(current);
}
}
if (!currentPageTOCItems.isEmpty()) {
lastFoundTOCItems = currentPageTOCItems;
}
for (ClassifiedImage image : page.getImages()) {
// Bounds accumulated over the headlines on this page that were inspected so far for this image.
Double xMin = null;
Double yMin = null;
Double xMax = null;
Double yMax = null;
for (SectionTreeEntry tocItem : lastFoundTOCItems) {
var headline = tocItem.getHeadline();
if (headline.getPage() != page.getPageNumber()) {
continue;
}
// Both branches widen the bounds; the else branch covers headlines whose min/max values are swapped.
if (headline.getMinX() < headline.getMaxX()) {
if (xMin == null || headline.getMinX() < xMin) {
xMin = headline.getMinX();
}
if (xMax == null || headline.getMaxX() > xMax) {
xMax = headline.getMaxX();
}
} else {
if (xMin == null || headline.getMaxX() < xMin) {
xMin = headline.getMaxX();
}
if (xMax == null || headline.getMinX() > xMax) {
xMax = headline.getMinX();
}
}
if (headline.getMinY() < headline.getMaxY()) {
if (yMin == null || headline.getMinY() < yMin) {
yMin = headline.getMinY();
}
if (yMax == null || headline.getMaxY() > yMax) {
yMax = headline.getMaxY();
}
} else {
if (yMin == null || headline.getMaxY() < yMin) {
yMin = headline.getMaxY();
}
if (yMax == null || headline.getMinY() > yMax) {
yMax = headline.getMinY();
}
}
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
// The image goes to the first entry for which its position lies inside the accumulated bounds.
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
tocItem.getImages().add(image);
image.setAppendedToSection(true);
break;
}
}
if (!image.isAppendedToSection()) {
log.debug("Image uses last found section");
// Fallback: last entry of the most recent headline page, or the start images if no headline was seen yet.
if (!lastFoundTOCItems.isEmpty()) {
lastFoundTOCItems.get(lastFoundTOCItems.size() - 1).getImages().add(image);
} else {
startImages.add(image);
}
image.setAppendedToSection(true);
}
}
if (!header.isEmpty()) {
headers.add(new ClassificationHeader(header));
}
if (!footer.isEmpty()) {
footers.add(new ClassificationFooter(footer));
}
}
if (!startBlocks.isEmpty() || !startImages.isEmpty()) {
// Entry without headline that carries everything found before the first matched headline.
SectionTreeEntry unassigned = new SectionTreeEntry(null);
unassigned.setSectionBlocks(startBlocks);
unassigned.setImages(startImages);
document.getSectionTree().getMainSections().add(0, unassigned);
}
document.setHeaders(headers);
document.setFooters(footers);
}
/**
 * Copies header cell references from the previous table onto rows of the current table when the
 * current table has no header information at all and the previous table has some.
 *
 * @param currentTable  the table that may receive header information
 * @param previousTable the table that was encountered before the current one
 */
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
// Distribute header information for subsequent tables
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
// Build stand-in cells whose header is the corresponding cell of the single-row previous table.
previousTableNonHeaderRow = previousTable.getRows().get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})
.toList();
}
// Headers are only transferred when both rows have the same number of cells.
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
}
}
}
}
}
/**
 * Returns true if at least one cell of the table has header cells assigned.
 */
private boolean hasValidHeaderInformation(TablePageBlock table) {
return !hasInvalidHeaderInformation(table);
}
/**
 * Returns true if no cell of the table has header cells assigned; this is also the case for a table without cells.
 */
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows()
.stream()
.flatMap(Collection::stream)
.allMatch(cell -> cell.getHeaderCells().isEmpty());
}
/**
 * Searches from the bottom of the table for a row with more than one cell in which no cell is a header cell.
 *
 * @return the first such row found, or an empty list if there is none
 */
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
// Single-cell rows are skipped.
if (row.size() == 1) {
continue;
}
boolean allNonHeader = true;
for (Cell cell : row) {
if (cell.isHeaderCell()) {
allNonHeader = false;
break;
}
}
if (allNonHeader) {
return row;
}
}
return Collections.emptyList();
}
}

View File

@ -0,0 +1,129 @@
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Data;
import lombok.EqualsAndHashCode;
/**
 * Node of the section tree: a headline together with the blocks and images of its section and
 * links to its parent and child entries.
 * <p>
 * The headline may be {@code null}; {@link #getType()} and {@link #contains(TextPageBlock)} handle
 * that case. Equality and hash code are based on the headline only.
 */
@Data
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class SectionTreeEntry {

    /** Kind of section an entry represents, derived in {@link #getType()}. */
    public enum Type {
        SECTION,
        SUPER_SECTION,
        TOC_SECTION
    }

    @EqualsAndHashCode.Include
    private TextPageBlock headline;
    private List<SectionTreeEntry> children = new ArrayList<>();
    private SectionTreeEntry parent;
    private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
    private List<ClassifiedImage> images = new ArrayList<>();
    private GenericSemanticNode section;


    /**
     * @param headline the headline block of this entry, may be {@code null}
     */
    public SectionTreeEntry(TextPageBlock headline) {

        this.headline = headline;
    }


    /**
     * Derives the entry type: {@link Type#TOC_SECTION} for a table-of-contents headline, otherwise
     * {@link Type#SECTION} for an entry without children and {@link Type#SUPER_SECTION} for one with children.
     */
    public Type getType() {

        if (headline != null && headline.getClassification().equals(PageBlockType.TABLE_OF_CONTENTS_HEADLINE)) {
            return Type.TOC_SECTION;
        }
        if (children.isEmpty()) {
            return Type.SECTION;
        }
        return Type.SUPER_SECTION;
    }


    /**
     * Appends the given entry to the children of this entry and sets this entry as its parent.
     */
    public void addChild(SectionTreeEntry sectionTreeEntry) {

        children.add(sectionTreeEntry);
        sectionTreeEntry.setParent(this);
    }


    /**
     * @return the entry directly before this one in the parent's children, or {@code null} if there
     *         is no parent or no preceding sibling
     */
    public SectionTreeEntry getSiblingBefore() {

        if (parent != null) {
            int index = parent.getChildren().indexOf(this);
            if (index > 0) {
                return parent.getChildren().get(index - 1);
            }
        }
        return null;
    }


    /**
     * @return the entry directly after this one in the parent's children, or {@code null} if there
     *         is no parent or no following sibling
     */
    public SectionTreeEntry getSiblingAfter() {

        if (parent != null) {
            int index = parent.getChildren().indexOf(this);
            if (index >= 0 && index < parent.getChildren().size() - 1) {
                return parent.getChildren().get(index + 1);
            }
        }
        return null;
    }


    /**
     * Checks whether the given block is the headline of this entry or of any descendant.
     *
     * @param block the headline block to look for
     * @return {@code true} if the block equals a headline in this subtree
     */
    public boolean contains(TextPageBlock block) {

        // Null-safe comparison: entries may be created without a headline.
        if (Objects.equals(headline, block)) {
            return true;
        }
        for (SectionTreeEntry child : children) {
            if (child.contains(block)) {
                return true;
            }
        }
        return false;
    }


    /**
     * Checks whether the given entry equals this entry or any descendant.
     *
     * @param tocItem the entry to look for
     * @return {@code true} if an equal entry exists in this subtree
     */
    public boolean contains(SectionTreeEntry tocItem) {

        if (this.equals(tocItem)) {
            return true;
        }
        for (SectionTreeEntry child : children) {
            if (child.contains(tocItem)) {
                return true;
            }
        }
        return false;
    }


    /**
     * @return a new list with those section blocks for which {@code isEmpty()} is false
     */
    public List<AbstractPageBlock> getNonEmptySectionBlocks() {

        return sectionBlocks.stream()
                .filter(pageBlock -> !pageBlock.isEmpty())
                .collect(Collectors.toList());
    }


    @Override
    public String toString() {

        return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
    }

}

View File

@ -1,13 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data;
@ -18,7 +20,7 @@ import lombok.NoArgsConstructor;
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class Cell extends Rectangle {
public class Cell extends BoundingBox {
private List<TextPageBlock> textBlocks = new ArrayList<>();
@ -33,13 +35,24 @@ public class Cell extends Rectangle {
public Cell(Point2D topLeft, Point2D bottomRight) {
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxPdf;
}
public Cell(Rectangle2D r) {
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
this.bBoxPdf = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
}
/**
 * Creates a new cell that references the same {@code bBoxPdf} and {@code bBox} objects as the given cell.
 * Only these two fields are assigned here; all other state is whatever the no-arg constructor produces.
 *
 * @param cell the cell whose bounding boxes are taken over
 * @return the new cell
 */
public static Cell copy(Cell cell) {
Cell copy = new Cell();
copy.bBoxPdf = cell.bBoxPdf;
copy.bBox = cell.bBox;
return copy;
}
@ -55,12 +68,12 @@ public class Cell extends Rectangle {
StringBuilder sb = new StringBuilder();
Iterator<TextPageBlock> itty = textBlocks.iterator();
TextPositionSequence previous = null;
Word previous = null;
while (itty.hasNext()) {
TextPageBlock textBlock = itty.next();
for (TextPositionSequence word : textBlock.getSequences()) {
for (Word word : textBlock.getWords()) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
@ -74,7 +87,7 @@ public class Cell extends Rectangle {
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " ");
return TextNormalizationUtilities.cleanString(sb.toString());
}

View File

@ -1,15 +1,206 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import lombok.Builder;
import lombok.Data;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
@Data
@Builder
import lombok.Getter;
@Getter
public class CleanRulings {
List<Ruling> horizontal;
List<Ruling> vertical;
List<Ruling> horizontals; // unmodifiable sorted by Y list
List<Ruling> verticals; // unmodifiable sorted by X list
/**
 * Stores the given rulings as unmodifiable lists, horizontals sorted by {@code y1} and verticals
 * sorted by {@code x1}.
 * <p>
 * Every ruling is checked with {@code Ruling.assertHorizontal()} / {@code Ruling.assertVertical()}
 * first, so a ruling with the wrong orientation makes construction fail.
 *
 * @param horizontals the horizontal rulings, in any order
 * @param verticals   the vertical rulings, in any order
 */
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {
    // Validate explicitly rather than through Stream.peek, which is meant for debugging only.
    horizontals.forEach(Ruling::assertHorizontal);
    this.horizontals = horizontals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getY1))
            .toList();

    verticals.forEach(Ruling::assertVertical);
    this.verticals = verticals.stream()
            .sorted(Comparator.comparingDouble(Line2D.Float::getX1))
            .toList();
}
/**
 * @return a new instance holding only the rulings classified as {@code TABLE_LINE}
 */
public CleanRulings getTableLines() {
    List<Ruling> tableHorizontals = new ArrayList<>();
    for (Ruling candidate : horizontals) {
        if (candidate.getClassification().equals(Ruling.Classification.TABLE_LINE)) {
            tableHorizontals.add(candidate);
        }
    }
    List<Ruling> tableVerticals = new ArrayList<>();
    for (Ruling candidate : verticals) {
        if (candidate.getClassification().equals(Ruling.Classification.TABLE_LINE)) {
            tableVerticals.add(candidate);
        }
    }
    return new CleanRulings(tableHorizontals, tableVerticals);
}
/**
 * @return a new instance without the rulings classified as {@code UNDERLINE} or {@code STRIKETROUGH}
 */
public CleanRulings withoutTextRulings() {
    List<Ruling> keptHorizontals = horizontals.stream()
            .filter(ruling -> !isTextDecoration(ruling))
            .toList();
    List<Ruling> keptVerticals = verticals.stream()
            .filter(ruling -> !isTextDecoration(ruling))
            .toList();
    return new CleanRulings(keptHorizontals, keptVerticals);
}


/**
 * Tells whether the ruling is classified as an underline or a strike-through.
 */
private static boolean isTextDecoration(Ruling ruling) {
    Ruling.Classification kind = ruling.getClassification();
    return kind.equals(Ruling.Classification.UNDERLINE) || kind.equals(Ruling.Classification.STRIKETROUGH);
}
/**
 * @return a new mutable list with all horizontal rulings followed by all vertical rulings
 */
public List<Ruling> buildAll() {
    ArrayList<Ruling> combined = new ArrayList<>(horizontals);
    combined.addAll(verticals);
    return combined;
}
/**
 * Checks whether a ruling lies between the PDF bounding boxes of the two given elements.
 */
public boolean lineBetween(BoundingBox a, BoundingBox b) {
    Rectangle2D first = a.getBBoxPdf();
    Rectangle2D second = b.getBBoxPdf();
    return lineBetween(first, second);
}
/**
 * Checks whether a ruling crosses the straight connection between the centers of the two rectangles.
 */
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {
    Point2D centerOfA = new Point2D.Double(a.getCenterX(), a.getCenterY());
    Point2D centerOfB = new Point2D.Double(b.getCenterX(), b.getCenterY());
    return lineBetween(centerOfA, centerOfB);
}
/**
 * Checks whether a ruling crosses the straight connection between the two points.
 * <p>
 * A horizontal connection is tested against vertical rulings only, a vertical connection against
 * horizontal rulings only, and any other connection against both.
 */
public boolean lineBetween(Point2D p1, Point2D p2) {
    Ruling connection = new Ruling(p1, p2);

    if (connection.isHorizontal()) {
        return crossesAny(getVerticalsInXInterval(connection.x1, connection.x2), connection);
    }
    if (connection.isVertical()) {
        return crossesAny(getHorizontalsInYInterval(connection.y1, connection.y2), connection);
    }

    List<Ruling> verticalCandidates = getVerticalsInXInterval(connection.x1, connection.x2);
    List<Ruling> horizontalCandidates = getHorizontalsInYInterval(connection.y1, connection.y2);
    return crossesAny(verticalCandidates, connection) || crossesAny(horizontalCandidates, connection);
}


/**
 * Tells whether at least one of the candidates intersects the given connection.
 */
private static boolean crossesAny(List<Ruling> candidates, Ruling connection) {
    for (Ruling candidate : candidates) {
        if (candidate.intersectsLine(connection)) {
            return true;
        }
    }
    return false;
}
/**
 * Collects the horizontal rulings located between the two given y values; the order of the
 * arguments does not matter.
 *
 * @return the matching rulings in ascending {@code y1} order, empty if there are none or an argument is NaN
 */
public List<Ruling> getHorizontalsInYInterval(float y1, float y2) {
    float lower = Math.min(y1, y2);
    float upper = Math.max(y1, y2);

    if (horizontals.isEmpty() || Float.isNaN(lower) || Float.isNaN(upper)) {
        return Collections.emptyList();
    }

    int idx = findFirstHorizontalRulingIdxAbove(lower);
    if (idx == -1) {
        return Collections.emptyList();
    }

    List<Ruling> matches = new LinkedList<>();
    // The list is sorted by y1, so collection stops at the first ruling beyond the upper bound.
    while (idx < horizontals.size() && !(horizontals.get(idx).y1 > upper)) {
        matches.add(horizontals.get(idx));
        idx++;
    }
    return matches;
}
/**
 * Binary search for the first horizontal ruling whose {@code y1} is greater than or equal to {@code y}.
 * <p>
 * When several rulings share the same {@code y1}, the lowest of their indices is returned, so none
 * of them is skipped by a caller that scans forward from the result.
 *
 * @param y the lower bound to search for
 * @return the index of the first ruling with {@code y1 >= y}, or -1 if there is none
 */
private int findFirstHorizontalRulingIdxAbove(float y) {
    int low = 0;
    int high = horizontals.size();

    // Lower-bound search: on exit, all rulings before 'low' have y1 < y.
    while (low < high) {
        int mid = (low + high) >>> 1;
        if (horizontals.get(mid).y1 < y) {
            low = mid + 1;
        } else {
            high = mid;
        }
    }

    return low < horizontals.size() ? low : -1;
}
/**
 * Collects the vertical rulings located between the two given x values; the order of the
 * arguments does not matter.
 *
 * @return the matching rulings in ascending {@code x1} order, empty if there are none or an argument is NaN
 */
public List<Ruling> getVerticalsInXInterval(float x1, float x2) {
    float left = Math.min(x1, x2);
    float right = Math.max(x1, x2);

    if (verticals.isEmpty() || Float.isNaN(left) || Float.isNaN(right)) {
        return Collections.emptyList();
    }

    int idx = findFirstVerticalRulingIdxRightOf(left);
    if (idx == -1) {
        return Collections.emptyList();
    }

    List<Ruling> matches = new LinkedList<>();
    // The list is sorted by x1, so collection stops at the first ruling beyond the right bound.
    while (idx < verticals.size() && !(verticals.get(idx).x1 > right)) {
        matches.add(verticals.get(idx));
        idx++;
    }
    return matches;
}
/**
 * Binary search for the first vertical ruling whose {@code x1} is greater than or equal to {@code x}.
 * <p>
 * When several rulings share the same {@code x1}, the lowest of their indices is returned, so none
 * of them is skipped by a caller that scans forward from the result.
 *
 * @param x the lower bound to search for
 * @return the index of the first ruling with {@code x1 >= x}, or -1 if there is none
 */
private int findFirstVerticalRulingIdxRightOf(float x) {
    int low = 0;
    int high = verticals.size();

    // Lower-bound search: on exit, all rulings before 'low' have x1 < x.
    while (low < high) {
        int mid = (low + high) >>> 1;
        if (verticals.get(mid).x1 < x) {
            low = mid + 1;
        } else {
            high = mid;
        }
    }

    return low < verticals.size() ? low : -1;
}
}

View File

@ -1,218 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
/**
 * Float rectangle with top/left/bottom/right accessors and overlap helpers.
 * <p>
 * "Top" is the minimum y and "bottom" the maximum y of the rectangle.
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
        @Override
        public int compare(Rectangle o1, Rectangle o2) {

            if (o1.equals(o2)) {
                return 0;
            }
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };


    public Rectangle() {

        super();
    }


    /**
     * Note the argument order: top (y) comes before left (x).
     */
    public Rectangle(float top, float left, float width, float height) {

        super();
        this.setRect(left, top, width, height);
    }


    /**
     * @param rectangles the rectangles to enclose
     * @return minimum bounding box that contains all the rectangles; an empty rectangle at the
     *         origin if the list is empty
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {

        if (rectangles.isEmpty()) {
            return new Rectangle();
        }

        float minx = java.lang.Float.MAX_VALUE;
        float miny = java.lang.Float.MAX_VALUE;
        // Float.MIN_VALUE is the smallest POSITIVE float; the maxima must start at the most negative value.
        float maxx = -java.lang.Float.MAX_VALUE;
        float maxy = -java.lang.Float.MAX_VALUE;

        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }


    /**
     * Compares using the deprecated {@link #ILL_DEFINED_ORDER}.
     */
    public int compareTo(Rectangle other) {

        return ILL_DEFINED_ORDER.compare(this, other);
    }


    // I'm bad at Java and need this for fancy sorting in
    // technology.tabula.TextChunk.
    public int isLtrDominant() {

        return 0;
    }


    /**
     * @return width multiplied by height
     */
    public float getArea() {

        return this.width * this.height;
    }


    /**
     * @return the length of the shared vertical extent of both rectangles, 0 if they do not overlap vertically
     */
    public float verticalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }


    public boolean verticallyOverlaps(Rectangle other) {

        return verticalOverlap(other) > 0;
    }


    /**
     * @return the length of the shared horizontal extent of both rectangles, 0 if they do not overlap horizontally
     */
    public float horizontalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }


    public boolean horizontallyOverlaps(Rectangle other) {

        return horizontalOverlap(other) > 0;
    }


    /**
     * Vertical overlap relative to the height of the smaller of the two rectangles.
     *
     * @return the ratio, or 0 if none of the four overlap configurations applies
     */
    public float verticalOverlapRatio(Rectangle other) {

        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            rv = (this.getBottom() - this.getTop()) / delta;
        }

        return rv;
    }


    /**
     * @return intersection area divided by union area of both rectangles
     */
    public float overlapRatio(Rectangle other) {

        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;

        return (float) (intersectionArea / unionArea);
    }


    /**
     * Grows this rectangle in place to the union with the other rectangle.
     *
     * @return this rectangle
     */
    public Rectangle merge(Rectangle other) {

        this.setRect(this.createUnion(other));
        return this;
    }


    public float getTop() {

        return (float) this.getMinY();
    }


    /**
     * Moves the top edge while keeping the bottom edge in place.
     */
    public void setTop(float top) {

        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }


    public float getRight() {

        return (float) this.getMaxX();
    }


    /**
     * Moves the right edge while keeping the left edge in place.
     */
    public void setRight(float right) {

        this.setRect(this.x, this.y, right - this.x, this.height);
    }


    public float getLeft() {

        return (float) this.getMinX();
    }


    /**
     * Moves the left edge while keeping the right edge in place.
     */
    public void setLeft(float left) {

        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }


    public float getBottom() {

        return (float) this.getMaxY();
    }


    /**
     * Moves the bottom edge while keeping the top edge in place.
     */
    public void setBottom(float bottom) {

        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }


    /**
     * @return the four corners in the order top-left, top-right, bottom-right, bottom-left
     */
    public Point2D[] getPoints() {

        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                                                                                                                                                     this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }


    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }

}

View File

@ -4,16 +4,14 @@ import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -23,10 +21,24 @@ public class Ruling extends Line2D.Float {
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
/**
 * Category a ruling can be tagged with.
 * NOTE(review): the constant name STRIKETROUGH (sic) is referenced from other classes, so correcting
 * the spelling needs a coordinated rename.
 */
public enum Classification {
TABLE_LINE,
UNDERLINE,
STRIKETROUGH,
HEADER_SEPARATOR,
FOOTER_SEPARATOR,
OTHER
}
@Getter
@Setter
private Classification classification;
public Ruling(Point2D p1, Point2D p2) {
super(p1, p2);
this.classification = Classification.OTHER;
}
@ -60,126 +72,32 @@ public class Ruling extends Line2D.Float {
}
// log(n) implementation of find_intersections
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
class SortObject {
protected SOType type;
protected float position;
protected Ruling ruling;
public SortObject(SOType type, float position, Ruling ruling) {
this.type = type;
this.position = position;
this.ruling = ruling;
}
public void assertHorizontal() {
if (isHorizontal()) {
return;
}
List<SortObject> sos = new ArrayList<>();
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}
});
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) {
return 1;
}
if (o1.getY() < o2.getY()) {
return -1;
}
if (o1.getX() > o2.getX()) {
return 1;
}
if (o1.getX() < o2.getX()) {
return -1;
}
return 0;
}
});
for (Ruling h : horizontals) {
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
}
for (Ruling v : verticals) {
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
}
Collections.sort(sos, new Comparator<SortObject>() {
@Override
public int compare(SortObject a, SortObject b) {
int rv;
if (DoubleComparisons.feq(a.position, b.position)) {
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
rv = 1;
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
rv = -1;
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
rv = -1;
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
rv = 1;
} else {
rv = java.lang.Double.compare(a.position, b.position);
}
} else {
return java.lang.Double.compare(a.position, b.position);
}
return rv;
}
});
for (SortObject so : sos) {
switch (so.type) {
case VERTICAL:
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
try {
Point2D i = h.getKey().intersectionPoint(so.ruling);
if (i == null) {
continue;
}
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
} catch (UnsupportedOperationException e) {
log.info("Some line are oblique, ignoring...");
continue;
}
}
break;
case HRIGHT:
tree.remove(so.ruling);
break;
case HLEFT:
tree.put(so.ruling, true);
break;
}
}
return rv;
throw new IllegalArgumentException("Ruling " + this + " is not horizontal");
}
public boolean vertical() {
public void assertVertical() {
if (isVertical()) {
return;
}
throw new IllegalArgumentException("Ruling " + this + " is not vertical");
}
public boolean isVertical() {
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
public boolean horizontal() {
public boolean isHorizontal() {
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
}
@ -188,36 +106,36 @@ public class Ruling extends Line2D.Float {
// these are used to have a single collapse method (in page, currently)
public boolean oblique() {
public boolean isOblique() {
return !(this.vertical() || this.horizontal());
return !(this.isVertical() || this.isHorizontal());
}
public float getPosition() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getLeft() : this.getTop();
return this.isVertical() ? this.getLeft() : this.getTop();
}
public float getStart() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getTop() : this.getLeft();
return this.isVertical() ? this.getTop() : this.getLeft();
}
public void setStart(float v) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setTop(v);
} else {
this.setLeft(v);
@ -227,19 +145,19 @@ public class Ruling extends Line2D.Float {
public float getEnd() {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
return this.vertical() ? this.getBottom() : this.getRight();
return this.isVertical() ? this.getBottom() : this.getRight();
}
public void setEnd(float v) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setBottom(v);
} else {
this.setRight(v);
@ -249,10 +167,10 @@ public class Ruling extends Line2D.Float {
public void setStartEnd(float start, float end) {
if (this.oblique()) {
if (this.isOblique()) {
throw new UnsupportedOperationException();
}
if (this.vertical()) {
if (this.isVertical()) {
this.setTop(start);
this.setBottom(end);
} else {
@ -264,7 +182,7 @@ public class Ruling extends Line2D.Float {
public boolean perpendicularTo(Ruling other) {
return this.vertical() == other.horizontal();
return this.isVertical() == other.isHorizontal();
}
@ -318,30 +236,6 @@ public class Ruling extends Line2D.Float {
}
public Point2D intersectionPoint(Ruling other) {
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
Ruling horizontal, vertical;
if (!this_l.intersectsLine(other_l)) {
return null;
}
if (this_l.horizontal() && other_l.vertical()) {
horizontal = this_l;
vertical = other_l;
} else if (this_l.vertical() && other_l.horizontal()) {
vertical = this_l;
horizontal = other_l;
} else {
log.warn("lines must be orthogonal, vertical and horizontal");
return null;
}
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
}
@Override
public boolean equals(Object other) {
@ -451,16 +345,9 @@ public class Ruling extends Line2D.Float {
final float TOLERANCE = 1;
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
Math.abs(ruling.getY2() - y2) < TOLERANCE;
}
private enum SOType {
VERTICAL,
HRIGHT,
HLEFT
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
Math.abs(ruling.getY2() - y2) < TOLERANCE;
}
}

View File

@ -36,14 +36,11 @@ public class TablePageBlock extends AbstractPageBlock {
private List<Cell> cells;
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
public TablePageBlock(List<Cell> cells, int rotation) {
setToBBoxOfComponents(cells);
this.cells = cells;
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
maxX = area.getRight();
maxY = area.getTop();
classification = PageBlockType.TABLE;
this.rotation = rotation;
}
@ -230,15 +227,15 @@ public class TablePageBlock extends AbstractPageBlock {
return new ArrayList<>();
}
Set<Float> uniqueX = new HashSet<>();
Set<Float> uniqueY = new HashSet<>();
Set<Double> uniqueX = new HashSet<>();
Set<Double> uniqueY = new HashSet<>();
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
uniqueX.add(c.getPdfMinX());
uniqueX.add(c.getPdfMaxX());
uniqueY.add(c.getPdfMinY());
uniqueY.add(c.getPdfMaxY());
});
var sortedUniqueX = uniqueX.stream()
@ -250,22 +247,24 @@ public class TablePageBlock extends AbstractPageBlock {
List<List<Cell>> rowsOfCells = new ArrayList<>();
Float prevY = null;
Double prevY = null;
for (Float y : sortedUniqueY) {
for (Double y : sortedUniqueY) {
List<Cell> row = new ArrayList<>();
Float prevX = null;
for (Float x : sortedUniqueX) {
Double prevX = null;
for (Double x : sortedUniqueX) {
if (prevY != null && prevX != null) {
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
if (cellFromGridStructure.hasMinimumSize()) {
cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
.map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
originalCell.getBBoxPdf())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
/**
 * Immutable pair of an {@link AbstractPageBlock} and a {@link ClassificationPage}.
 * NOTE(review): presumably {@code page} is the page the block is located on - confirm against callers.
 *
 * @param block the page block
 * @param page  the classification page associated with the block
 */
public record AbstractBlockOnPage(AbstractPageBlock block, ClassificationPage page) {
}

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
/**
 * Font style categories: regular, bold, italic, or bold and italic combined.
 */
public enum FontStyle {
REGULAR,
BOLD,
ITALIC,
BOLD_ITALIC;
}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Bundle of frequency counters for line height, font size, space width, font name and font style.
 * Every counter is created eagerly; Lombok makes the fields {@code private final} and generates a getter for each.
 */
@Getter
@NoArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FrequencyCounters {
// numeric counters
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
// string counters
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
}

View File

@ -0,0 +1,107 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * A numeric list marker such as {@code "1."} or {@code "(1)"} detected at the beginning of a run of words.
 * <p>
 * An identifier stores the marker format, the first word of the inspected run, the page number it was
 * parsed for, and the parsed integer value of the marker.
 */
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ListIdentifier {

    /**
     * Matches a one to four digit number without leading zero followed by a dot and whitespace, e.g. {@code "10. "}.
     * Group 1 is the number. Digits after the first may be zero so that markers like 10, 20 or 100 are recognised.
     */
    public static final Pattern NUMBER_WITH_DOT = Pattern.compile("^\\s*([1-9][0-9]{0,3})\\.\\s+");

    /**
     * Matches a one to four digit number without leading zero in parentheses followed by whitespace, e.g. {@code "(10) "}.
     * Group 1 is the number.
     */
    public static final Pattern NUMBER_IN_PARENTHESES = Pattern.compile("^\\s*\\(([1-9][0-9]{0,3})\\)\\s+");

    /** Number of leading words of a text block that are inspected for a list marker. */
    private static final int MAX_LEADING_WORDS = 5;

    enum Format {
        NUMBER_WITH_DOT,
        NUMBER_IN_PARENTHESES
    }

    Format format;
    @Getter
    Word word;
    @Getter
    int page;
    int representation;


    /**
     * Tries to parse a list identifier from the first (at most five) words of the given text block.
     *
     * @param textPageBlock the block whose leading words are inspected
     * @param page          the page number stored in the resulting identifier
     * @return the identifier, or an empty Optional if the leading words do not start with a known marker
     */
    public static Optional<ListIdentifier> parse(TextPageBlock textPageBlock, int page) {

        List<Word> words = textPageBlock.getWords();
        return parse(words.subList(0, Math.min(MAX_LEADING_WORDS, words.size())), page);
    }


    /**
     * Tries to parse a list identifier from the given words, joined by single spaces.
     * The dotted format is tried first, then the parenthesised format.
     *
     * @param sequences the words to inspect; an empty list yields an empty Optional
     * @param page      the page number stored in the resulting identifier
     * @return the identifier, or an empty Optional if the text does not start with a known marker
     */
    public static Optional<ListIdentifier> parse(List<Word> sequences, int page) {

        // Without this guard the trailing-space removal below would fail on an empty builder.
        if (sequences.isEmpty()) {
            return Optional.empty();
        }

        StringBuilder sb = new StringBuilder();
        for (Word sequence : sequences) {
            sb.append(sequence.toString());
            sb.append(" ");
        }
        // drop the separator appended after the last word
        sb.setLength(sb.length() - 1);
        String text = sb.toString();

        Word firstWord = sequences.get(0);
        Optional<ListIdentifier> withDot = match(NUMBER_WITH_DOT, Format.NUMBER_WITH_DOT, text, firstWord, page);
        if (withDot.isPresent()) {
            return withDot;
        }
        return match(NUMBER_IN_PARENTHESES, Format.NUMBER_IN_PARENTHESES, text, firstWord, page);
    }


    /** Builds an identifier of the given format if the pattern is found in the text and its first group is an integer. */
    private static Optional<ListIdentifier> match(Pattern pattern, Format format, String text, Word firstWord, int page) {

        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            return Optional.empty();
        }
        return parseInteger(matcher.group(1)).map(number -> new ListIdentifier(format, firstWord, page, number));
    }


    /** Parses the text as an int, returning an empty Optional instead of throwing on malformed input. */
    private static Optional<Integer> parseInteger(String text) {

        try {
            return Optional.of(Integer.parseInt(text));
        } catch (NumberFormatException e) {
            return Optional.empty();
        }
    }


    /**
     * Checks whether the identifiers form a consistent sequence. Lists with zero or one element are always in order.
     * Each identifier must, compared with its predecessor,
     * have the same format, have a strictly greater number, satisfy {@code intersectsXDirAdj(previous, 2)},
     * be below the predecessor when both are on the same page, and not be on an earlier page.
     *
     * @param listIdentifiers the identifiers in the order in which they should be validated
     * @return {@code true} if every consecutive pair fulfils all conditions
     */
    public static boolean isInOrder(List<ListIdentifier> listIdentifiers) {

        if (listIdentifiers.size() <= 1) {
            return true;
        }

        for (int i = 1; i < listIdentifiers.size(); i++) {
            ListIdentifier current = listIdentifiers.get(i);
            ListIdentifier previous = listIdentifiers.get(i - 1);

            if (current.format != previous.format) {
                return false;
            }
            if (current.representation <= previous.representation) {
                return false;
            }
            if (!current.word.intersectsXDirAdj(previous.word, 2)) {
                return false;
            }
            if (current.page == previous.page && !current.word.isBelowDirAdj(previous.word)) {
                return false;
            }
            if (current.page < previous.page) {
                return false;
            }
        }

        return true;
    }

}

View File

@ -1,97 +1,94 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public class RedTextPosition {
@EqualsAndHashCode(callSuper = true)
@FieldDefaults(level = AccessLevel.PRIVATE)
public class RedTextPosition extends TextBoundingBox {
private float[] position;
public final static int HEIGHT_PADDING = 2;
@JsonIgnore
private int rotation;
String unicode;
@JsonIgnore
private float pageHeight;
// estimated using the TextMatrix in radians
float exactDir;
@JsonIgnore
private float pageWidth;
float widthOfSpace;
private String unicode;
float fontSizeInPt;
@JsonIgnore
private float dir;
// not used in reanalysis
@JsonIgnore
private float widthOfSpace;
// not used in reanalysis
@JsonIgnore
private float fontSizeInPt;
// not used in reanalysis
@JsonIgnore
private String fontName;
String fontName;
@SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
var pos = new RedTextPosition();
BeanUtils.copyProperties(textPosition, pos);
pos.setFontName(textPosition.getFont().getName());
pos.setUnicode(textPosition.getUnicode());
pos.setWidthOfSpace(textPosition.getWidthOfSpace());
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
pos.setExactDir((float) FastAtan2.fastAtan2(textPosition.getTextMatrix().getShearY(), textPosition.getTextMatrix().getScaleX()));
pos.setDir(TextDirection.fromDegrees(textPosition.getDir()));
var position = new float[4];
//TODO: There is a mismatch in the java coords of the text and the rulings,
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
position[0] = textPosition.getXDirAdj();
position[1] = textPosition.getYDirAdj();
position[2] = textPosition.getWidthDirAdj();
position[3] = textPosition.getHeightDir();
float textHeight = textPosition.getHeight() + 2 * HEIGHT_PADDING;
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct
pos.setPosition(position);
return pos;
}
@JsonIgnore
public float getXDirAdj() {
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
return position[0];
}
AffineTransform transform = new AffineTransform();
@JsonIgnore
public float getYDirAdj() {
return position[1];
}
@JsonIgnore
public float getWidthDirAdj() {
return position[2];
}
@JsonIgnore
public float getHeightDir() {
return position[3];
if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) {
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageHeight / 2f);
transform.translate(0f, pageHeight);
} else if (textDirection == TextDirection.QUARTER_CIRCLE) {
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageWidth / 2f);
transform.translate(0f, pageWidth);
} else {
transform.rotate(textDirection.getRadians(), pageHeight / 2f, pageHeight / 2f);
transform.translate(0f, pageWidth);
}
transform.scale(1., -1.);
return transform;
}
}

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Getter;
@ -9,18 +10,18 @@ import lombok.Getter;
@Getter
public class SearchableText {
private final List<TextPositionSequence> sequences = new ArrayList<>();
private final List<Word> sequences = new ArrayList<>();
public void add(TextPositionSequence textPositionSequence) {
public void add(Word word) {
sequences.add(textPositionSequence);
sequences.add(word);
}
public void addAll(List<TextPositionSequence> textPositionSequences) {
public void addAll(List<Word> words) {
sequences.addAll(textPositionSequences);
sequences.addAll(words);
}
@ -31,18 +32,14 @@ public class SearchableText {
}
public static String buildString(List<TextPositionSequence> sequences) {
public static String buildString(List<Word> sequences) {
StringBuilder sb = new StringBuilder();
for (TextPositionSequence word : sequences) {
for (Word word : sequences) {
sb.append(word);
sb.append(' ');
}
String text = sb.toString();
text = TextNormalizationUtilities.removeHyphenLineBreaks(text);
text = TextNormalizationUtilities.removeLineBreaks(text);
text = TextNormalizationUtilities.removeRepeatingWhitespaces(text);
return text;
return TextNormalizationUtilities.cleanString(sb.toString());
}
}

View File

@ -9,10 +9,14 @@ public class StringFrequencyCounter {
@Getter
private final Map<String, Integer> countPerValue = new HashMap<>();
boolean changed;
String mostPopularCache;
public void add(String value) {
changed = true;
if (!countPerValue.containsKey(value)) {
countPerValue.put(value, 1);
} else {
@ -23,6 +27,8 @@ public class StringFrequencyCounter {
public void addAll(Map<String, Integer> otherCounter) {
changed = true;
for (Map.Entry<String, Integer> entry : otherCounter.entrySet()) {
if (countPerValue.containsKey(entry.getKey())) {
countPerValue.put(entry.getKey(), countPerValue.get(entry.getKey()) + entry.getValue());
@ -35,13 +41,18 @@ public class StringFrequencyCounter {
public String getMostPopular() {
Map.Entry<String, Integer> mostPopular = null;
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() > mostPopular.getValue()) {
mostPopular = entry;
if (changed || mostPopularCache == null) {
Map.Entry<String, Integer> mostPopular = null;
for (Map.Entry<String, Integer> entry : countPerValue.entrySet()) {
if (mostPopular == null || entry.getValue() >= mostPopular.getValue()) {
mostPopular = entry;
}
}
mostPopularCache = mostPopular != null ? mostPopular.getKey() : null;
changed = false;
}
return mostPopular != null ? mostPopular.getKey() : null;
return mostPopularCache;
}
}

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
/**
 * Immutable pair of a {@link TextPageBlock} and a {@link ClassificationPage}.
 * NOTE(review): presumably {@code page} is the page the text block is located on - confirm against callers.
 *
 * @param textBlock the text block
 * @param page      the classification page associated with the text block
 */
public record TextBlockOnPage(TextPageBlock textBlock, ClassificationPage page) {
}

View File

@ -44,4 +44,15 @@ public enum TextDirection {
throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees));
}
/**
 * Maps this direction to an integer index: 0 for ZERO, 1 for QUARTER_CIRCLE, 2 for HALF_CIRCLE
 * and 3 for THREE_QUARTER_CIRCLE.
 * NOTE(review): presumably this is the number of quarter turns - confirm against callers.
 *
 * @return the index of this direction in the range 0 to 3
 */
public int getRotation() {
// exhaustive over all enum constants, therefore no default branch
return switch (this) {
case ZERO -> 0;
case QUARTER_CIRCLE -> 1;
case HALF_CIRCLE -> 2;
case THREE_QUARTER_CIRCLE -> 3;
};
}
}

View File

@ -1,16 +1,14 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Collections;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
@ -27,234 +25,112 @@ import lombok.NoArgsConstructor;
public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
private List<Word> words = new ArrayList<>();
@Builder.Default
private FrequencyCounters frequencyCounters = new FrequencyCounters();
@JsonIgnore
private int rotation;
private Rectangle2D bBoxDirAdj;
@JsonIgnore
private String mostPopularWordFont;
private boolean underlined;
@JsonIgnore
private String mostPopularWordStyle;
@JsonIgnore
private float mostPopularWordFontSize;
@JsonIgnore
private float mostPopularWordHeight;
@JsonIgnore
private float mostPopularWordSpaceWidth;
@JsonIgnore
private float highestFontSize;
@JsonIgnore
private PageBlockType classification;
private boolean toDuplicate;
private String text;
private boolean changed;
public TextPageBlock(List<Word> words) {
this.words = new ArrayList<>(words);
this.frequencyCounters = new FrequencyCounters();
if (!words.isEmpty()) {
addToFrequencyCounters(words);
}
calculateBBox();
}
public List<Word> getWords() {
return Collections.unmodifiableList(words);
}
@JsonIgnore
public TextDirection getDir() {
return sequences.get(0).getDir();
return words.get(0).getDir();
}
@JsonIgnore
private float getPageHeight() {
private void calculateBBox() {
return sequences.get(0).getPageHeight();
if (words == null) {
this.bBox = new Rectangle2D.Double();
this.bBoxPdf = new Rectangle2D.Double();
this.bBoxDirAdj = new Rectangle2D.Double();
return;
}
this.bBoxDirAdj = words.stream()
.map(Word::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(words);
}
@JsonIgnore
private float getPageWidth() {
public void recalculateBBox() {
return sequences.get(0).getPageWidth();
calculateBBox();
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
if (textBlocksToMerge.isEmpty()) {
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
}
if (textBlocksToMerge.stream()
.map(AbstractPageBlock::getPage)
.distinct()
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
}
List<Word> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getWords)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences);
return fromTextPositionSequences(sequences);
}
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
.stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet())
.size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
return new TextPageBlock(sequences);
}
private void addToFrequencyCounters(List<Word> sequences) {
/**
* Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the minX value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMinX() {
for (Word wordBlock : sequences) {
if (getDir().getDegrees() == 90) {
return minY;
} else if (getDir().getDegrees() == 180) {
return getPageWidth() - maxX;
} else if (getDir().getDegrees() == 270) {
return getPageWidth() - maxY;
} else {
return minX;
frequencyCounters.getLineHeightFrequencyCounter().add(wordBlock.getTextHeight());
frequencyCounters.getFontSizeFrequencyCounter().add(wordBlock.getFontSize());
frequencyCounters.getSpaceFrequencyCounter().add(wordBlock.getSpaceWidth());
frequencyCounters.getFontFrequencyCounter().add(wordBlock.getFont());
frequencyCounters.getStyleFrequencyCounter().add(wordBlock.getFontStyle());
}
setUnderlined(this.words.stream()
.allMatch(Word::isUnderline));
}
/**
* Returns the maxX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the maxX value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMaxX() {
if (getDir().getDegrees() == 90) {
return maxY;
} else if (getDir().getDegrees() == 180) {
return getPageWidth() - minX;
} else if (getDir().getDegrees() == 270) {
return getPageWidth() - minY;
} else {
return maxX;
}
}
/**
* Returns the minY value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the minY value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMinY() {
if (getDir().getDegrees() == 90) {
return minX;
} else if (getDir().getDegrees() == 180) {
return maxY;
} else if (getDir().getDegrees() == 270) {
return getPageHeight() - maxX;
} else {
return getPageHeight() - maxY;
}
}
/**
* Returns the maxY value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return the maxY value in pdf coordinate system
*/
@JsonIgnore
public float getPdfMaxY() {
if (getDir().getDegrees() == 90) {
return maxX;
} else if (getDir().getDegrees() == 180) {
return minY;
} else if (getDir().getDegrees() == 270) {
return getPageHeight() - minX;
} else {
return getPageHeight() - minY;
}
}
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;
this.minY = minY;
this.maxY = maxY;
this.sequences = sequences;
this.rotation = rotation;
}
public TextPageBlock union(TextPositionSequence r) {
public TextPageBlock union(Word r) {
TextPageBlock union = this.copy();
union.add(r);
addToFrequencyCounters(List.of(r));
calculateBBox();
return union;
}
@ -262,83 +138,50 @@ public class TextPageBlock extends AbstractPageBlock {
public TextPageBlock union(TextPageBlock r) {
TextPageBlock union = this.copy();
union.add(r);
union.addAll(r.getWords());
addToFrequencyCounters(r.getWords());
calculateBBox();
return union;
}
public void add(TextPageBlock r) {
public void add(TextPageBlock textPageBlock) {
if (r.getMinX() < minX) {
minX = r.getMinX();
}
if (r.getMaxX() > maxX) {
maxX = r.getMaxX();
}
if (r.getMinY() < minY) {
minY = r.getMinY();
}
if (r.getMaxY() > maxY) {
maxY = r.getMaxY();
}
sequences.addAll(r.getSequences());
changed = true;
words.addAll(textPageBlock.getWords());
addToFrequencyCounters(textPageBlock.getWords());
calculateBBox();
}
public void add(TextPositionSequence r) {
public void add(Word word) {
if (r.getMinXDirAdj() < minX) {
minX = r.getMinXDirAdj();
}
if (r.getMaxXDirAdj() > maxX) {
maxX = r.getMaxXDirAdj();
}
if (r.getMinYDirAdj() < minY) {
minY = r.getMinYDirAdj();
}
if (r.getMaxYDirAdj() > maxY) {
maxY = r.getMaxYDirAdj();
}
changed = true;
words.add(word);
addToFrequencyCounters(List.of(word));
calculateBBox();
}
public void addAll(List<Word> words) {
changed = true;
this.words.addAll(words);
addToFrequencyCounters(words);
calculateBBox();
}
public TextPageBlock copy() {
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}
public void resize(float x1, float y1, float width, float height) {
set(x1, y1, x1 + width, y1 + height);
}
public void set(float x1, float y1, float x2, float y2) {
this.minX = Math.min(x1, x2);
this.maxX = Math.max(x1, x2);
this.minY = Math.min(y1, y2);
this.maxY = Math.max(y1, y2);
return new TextPageBlock(new ArrayList<>(words));
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < sequences.size(); i++) {
String sequenceAsString = sequences.get(i).toString();
// Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
if (i != 0 && sequences.get(i - 1).charAt(sequences.get(i - 1).length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
builder.append(' ');
}
builder.append(sequenceAsString);
}
return builder.toString();
return getText();
}
@ -346,30 +189,88 @@ public class TextPageBlock extends AbstractPageBlock {
@JsonIgnore
public String getText() {
StringBuilder sb = new StringBuilder();
if (text == null || changed) {
TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
StringBuilder sb = new StringBuilder();
Word previous = null;
for (Word word : words) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
}
}
sb.append(word.toString());
previous = word;
}
sb.append(word.toString());
previous = word;
text = TextNormalizationUtilities.removeHyphenLinebreaks(sb.toString());
changed = false;
}
return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
return text;
}
/**
 * Counts the lines of this block. The count starts at one and is increased whenever the maxY (direction adjusted)
 * of a word exceeds the maxY of the word before it by more than the word's text height.
 *
 * @return the number of lines, at least 1
 */
public int getNumberOfLines() {

    int lineCount = 1;
    for (int i = 1; i < words.size(); i++) {
        Word preceding = words.get(i - 1);
        Word current = words.get(i);
        boolean startsNewLine = current.getMaxYDirAdj() - preceding.getMaxYDirAdj() > current.getTextHeight();
        if (startsNewLine) {
            lineCount++;
        }
    }
    return lineCount;
}
public String getMostPopularWordFont() {
return frequencyCounters.getFontFrequencyCounter().getMostPopular();
}
public String getMostPopularWordStyle() {
return frequencyCounters.getStyleFrequencyCounter().getMostPopular();
}
public double getMostPopularWordFontSize() {
return frequencyCounters.getFontSizeFrequencyCounter().getMostPopular();
}
public double getMostPopularWordHeight() {
return frequencyCounters.getLineHeightFrequencyCounter().getMostPopular();
}
public double getMostPopularWordSpaceWidth() {
return frequencyCounters.getSpaceFrequencyCounter().getMostPopular();
}
/**
 * Returns the highest value reported by the font size frequency counter.
 *
 * @return the highest font size, or 0 if the counter reports {@code null}
 */
public double getHighestFontSize() {

    Double maxFontSize = frequencyCounters.getFontSizeFrequencyCounter().getHighest();
    if (maxFontSize == null) {
        return 0;
    }
    return maxFontSize;
}
@Override
public boolean isEmpty() {
return sequences.isEmpty();
return words.isEmpty();
}
}

View File

@ -1,300 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TextPositionSequence implements CharSequence {
public static final int HEIGHT_PADDING = 2;
private int page;
private List<RedTextPosition> textPositions = new ArrayList<>();
private TextDirection dir;
private int rotation;
private float pageHeight;
private float pageWidth;
private boolean isParagraphStart;
public TextPositionSequence(int page) {
this.page = page;
}
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
}
@Override
public int length() {
return textPositions.size();
}
@Override
public char charAt(int index) {
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return text.charAt(0);
}
public char charAt(int index, boolean caseInSensitive) {
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return caseInSensitive ? text.toLowerCase(Locale.ROOT).charAt(0) : text.charAt(0);
}
@Override
public TextPositionSequence subSequence(int start, int end) {
var textPositionSequence = new TextPositionSequence();
textPositionSequence.textPositions = textPositions.subList(start, end);
textPositionSequence.page = page;
textPositionSequence.dir = dir;
textPositionSequence.rotation = rotation;
textPositionSequence.pageHeight = pageHeight;
textPositionSequence.pageWidth = pageWidth;
return textPositionSequence;
}
/**
 * Builds the string made of {@link #charAt(int)} for every index of this sequence.
 *
 * @return the characters of this sequence as a String
 */
@Override
public String toString() {

    int size = length();
    char[] characters = new char[size];
    for (int index = 0; index < size; index++) {
        characters[index] = charAt(index);
    }
    return new String(characters);
}
public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {
this.textPositions.add(textPosition);
this.page = textPositionSequence.getPage();
this.dir = textPositionSequence.getDir();
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
}
public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minX value
*/
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxX value
*/
public float getMaxXDirAdj() {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
*/
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
}
/**
 * Returns the direction-adjusted Y coordinate of the first text position.
 * The value is adjusted so that 0,0 is the upper left corner and is adjusted based on the text direction.
 * The page rotation is ignored, but the text rotation is taken into account and the coordinates are adjusted to awt.
 *
 * @return the text direction adjusted maxY value, i.e. the lower border of the bounding box of the word
 */
public float getMaxYDirAdj() {
    RedTextPosition first = textPositions.get(0);
    return first.getYDirAdj();
}
/**
 * Returns the height of the first text position ({@code getHeightDir()}) increased by {@code HEIGHT_PADDING}.
 *
 * @return the padded text height
 */
public float getTextHeight() {
    RedTextPosition first = textPositions.get(0);
    return first.getHeightDir() + HEIGHT_PADDING;
}
/**
 * Returns the vertical extent of the word, computed as {@link #getMaxYDirAdj()} minus {@link #getMinYDirAdj()}.
 *
 * @return the height of the direction-adjusted bounding box
 */
public float getHeight() {
    float lowerBorder = getMaxYDirAdj();
    float upperBorder = getMinYDirAdj();
    return lowerBorder - upperBorder;
}
/**
 * Returns the horizontal extent of the word, computed as {@link #getMaxXDirAdj()} minus {@link #getMinXDirAdj()}.
 *
 * @return the width of the direction-adjusted bounding box
 */
public float getWidth() {
    float rightBorder = getMaxXDirAdj();
    float leftBorder = getMinXDirAdj();
    return rightBorder - leftBorder;
}
/**
 * Returns the lower-cased font name of the first text position with the substrings
 * {@code ",bold"} and {@code ",italic"} removed.
 *
 * @return the normalized font name, or {@code "none"} if the first text position has no font name
 */
public String getFont() {
    String fontName = textPositions.get(0).getFontName();
    if (fontName == null) {
        return "none";
    }
    String normalized = fontName.toLowerCase(Locale.ROOT);
    // Both markers are plain literals, so a literal replace removes exactly the same occurrences.
    return normalized.replace(",bold", "").replace(",italic", "");
}
/**
 * Derives a font style label from the font name of the first text position by checking whether the
 * lower-cased name contains {@code "bold"} and/or {@code "italic"}.
 *
 * @return {@code "bold, italic"}, {@code "bold"}, {@code "italic"}, or {@code "standard"} when neither
 * marker is present or the font name is {@code null}
 */
public String getFontStyle() {
    String fontName = textPositions.get(0).getFontName();
    if (fontName == null) {
        return "standard";
    }

    String normalized = fontName.toLowerCase(Locale.ROOT);
    boolean bold = normalized.contains("bold");
    boolean italic = normalized.contains("italic");

    if (bold && italic) {
        return "bold, italic";
    }
    if (bold) {
        return "bold";
    }
    return italic ? "italic" : "standard";
}
/**
 * Returns the font size of the first text position as reported by {@code getFontSizeInPt()}.
 *
 * @return the font size of the first text position
 */
public float getFontSize() {
    RedTextPosition first = textPositions.get(0);
    return first.getFontSizeInPt();
}
/**
 * Returns the width of a space as reported by the first text position ({@code getWidthOfSpace()}).
 *
 * @return the space width of the first text position
 */
public float getSpaceWidth() {
    RedTextPosition first = textPositions.get(0);
    return first.getWidthOfSpace();
}
/**
 * Returns the bounding box of the word in the Pdf Coordinate System, where {0,0} is rotated with the page rotation:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * The two corner points are built from the direction-adjusted coordinates of the first and last text
 * position and then mapped through a rotate / translate / flip transform chosen by the text direction.
 *
 * @return bounding box of the word in Pdf Coordinate System
 */
@SneakyThrows
public Rectangle getRectangle() {
    log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);

    float textHeight = getTextHeight();
    RedTextPosition first = textPositions.get(0);
    RedTextPosition last = textPositions.get(textPositions.size() - 1);

    // Only the rotation pivot and the vertical shift depend on the text direction;
    // the sequence rotate -> translate -> flip is identical for every direction.
    double pivotX;
    double pivotY;
    double verticalShift;
    if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
        pivotX = pageWidth / 2f;
        pivotY = pageHeight / 2f;
        verticalShift = pageHeight + textHeight;
    } else if (dir == TextDirection.QUARTER_CIRCLE) {
        pivotX = pageWidth / 2f;
        pivotY = pageWidth / 2f;
        verticalShift = pageWidth + textHeight;
    } else {
        pivotX = pageHeight / 2f;
        pivotY = pageHeight / 2f;
        verticalShift = pageWidth + textHeight;
    }

    AffineTransform toPdfCoordinates = new AffineTransform();
    toPdfCoordinates.rotate(dir.getRadians(), pivotX, pivotY);
    toPdfCoordinates.translate(0f, verticalShift);
    // Mirror the Y axis.
    toPdfCoordinates.scale(1., -1.);

    Point2D bottomLeft = toPdfCoordinates.transform( //
            new Point2D.Double(first.getXDirAdj(), first.getYDirAdj() - HEIGHT_PADDING), null);
    Point2D topRight = toPdfCoordinates.transform( //
            new Point2D.Double(last.getXDirAdj() + last.getWidthDirAdj(), last.getYDirAdj() + textHeight + HEIGHT_PADDING), null);

    float width = (float) (topRight.getX() - bottomLeft.getX());
    float height = (float) (topRight.getY() - bottomLeft.getY());
    Point origin = new Point((float) bottomLeft.getX(), (float) bottomLeft.getY());
    return new Rectangle(origin, width, height, page);
}
}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
/**
 * Orders {@link NumberWord}s by three criteria, applied in sequence:
 * <ol>
 * <li>the page number of the {@link TextBlockOnPage} the number is mapped to in the lookup,</li>
 * <li>the Y value of the number's word,</li>
 * <li>the numeric value itself.</li>
 * </ol>
 * Every compared {@link NumberWord} must be present as a key in the lookup; a missing key results in a
 * {@link NullPointerException} during comparison.
 */
public class TocNumberComparator implements Comparator<NumberWord> {

    // Never reassigned after construction; any Map implementation is accepted.
    private final Map<NumberWord, TextBlockOnPage> lookup;


    /**
     * @param lookup maps each number to the text block (and thereby the page) it was found on
     */
    public TocNumberComparator(Map<NumberWord, TextBlockOnPage> lookup) {

        this.lookup = lookup;
    }


    /**
     * Compares two numbers by page number, then by the Y value of their words, then by their numeric value.
     *
     * @param number1 the first number to compare
     * @param number2 the second number to compare
     * @return a negative integer, zero, or a positive integer as the first argument sorts before, equal to,
     * or after the second
     */
    @Override
    public int compare(NumberWord number1, NumberWord number2) {

        int page1 = lookup.get(number1).page().getPageNumber();
        int page2 = lookup.get(number2).page().getPageNumber();
        if (page1 != page2) {
            return Integer.compare(page1, page2);
        }

        // Same page: order by the Y value of the words.
        if (number1.word().getY() != number2.word().getY()) {
            return Double.compare(number1.word().getY(), number2.word().getY());
        }

        // Same page and same Y: fall back to the numeric value.
        return Integer.compare(number1.number(), number2.number());
    }

}

Some files were not shown because too many files have changed in this diff Show More