Compare commits

...

536 Commits
0.1.14 ... main

Author SHA1 Message Date
Dominique Eifländer
ef23ee0ade Merge branch 'RED-10752-main' into 'main'
RED-10752: Enabled prometheus

See merge request fforesight/layout-parser!267
2025-01-29 13:34:01 +01:00
Dominique Eifländer
af31f52b47 RED-10752: Enabled prometheus 2025-01-29 11:09:29 +01:00
Kilian Schüttler
b5152112ee Merge branch 'RM-231' into 'main'
RM-231: missing whitespace in name

See merge request fforesight/layout-parser!264
2025-01-14 13:04:10 +01:00
Kilian Schuettler
85ea4ef455 RM-231: missing whitespace in name 2025-01-14 12:59:01 +01:00
Kilian Schüttler
01f8c01fff Merge branch 'RED-10714' into 'main'
RED-10714: fix IndexOutOfBoundsException

See merge request fforesight/layout-parser!262
2025-01-10 12:33:18 +01:00
Kilian Schuettler
0b6a292c75 RED-10714: fix IndexOutOfBoundsException 2025-01-10 12:12:14 +01:00
Maverick Studer
e24020589c Merge branch 'feature/RED-9998' into 'main'
RED-9998: App version history (for conditional re-analyzing the layout of a file)

See merge request fforesight/layout-parser!259
2024-12-12 09:58:46 +01:00
Maverick Studer
c619b845e8 RED-9998: App version history (for conditional re-analyzing the layout of a file) 2024-12-12 09:58:46 +01:00
Kilian Schüttler
ed0371ca11 Merge branch 'RED-10127' into 'main'
RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines

See merge request fforesight/layout-parser!257
2024-12-06 14:49:48 +01:00
Kilian Schuettler
89b5be8d67 RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines 2024-12-06 13:41:44 +01:00
Kilian Schuettler
077ce60c9d RED-9139: update document version 2024-11-15 16:48:56 +01:00
Kilian Schüttler
ab171be6e2 Merge branch 'feature/RED-9139' into 'main'
RED-9139: more robust TOC detection

See merge request fforesight/layout-parser!253
2024-11-14 16:50:52 +01:00
Kilian Schuettler
664b47b4c3 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:49 +01:00
Kilian Schuettler
8005c1f25f RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
42185a95a0 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
51b42efaf6 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6a50d45947 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
073ac12cf7 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
84b054a4cc RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
905b65a5fa RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
7617c1f308 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
2b3936c09b RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6e5b1f1978 RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
cf846d18bc RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
25c46f16ac RED-9139: move document to module in redaction-service
* add feature version
2024-11-14 16:39:48 +01:00
Kilian Schuettler
96acefed78 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
366241e6c6 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
7f472ccc52 RED-9139: move document to module in redaction-service
* add TableOfContents node
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6f807c7d94 RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree
* added protobuf compile script
2024-11-14 16:39:48 +01:00
Kilian Schuettler
6e04c15f3d RED-9139: add new TableOfContents Node
* rename previous TableOfContent to SectionTree
* added protobuf compile script
2024-11-14 16:39:48 +01:00
Kilian Schuettler
1384584e2f RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-14 16:39:46 +01:00
Kilian Schuettler
e58011e111 RED-9139: more robust TOC detection
* detect numbers in words, and not just whole words that are numbers
2024-11-14 16:39:21 +01:00
Kilian Schüttler
a821570065 Merge branch 'RED-9139-bp' into 'main'
RED-9139: more robust TOC detection

See merge request fforesight/layout-parser!254
2024-11-13 10:54:39 +01:00
Kilian Schüttler
7ee1f9e360 RED-9139: more robust TOC detection 2024-11-13 10:54:39 +01:00
Kilian Schüttler
f9b25c8157 Merge branch 'RED-10249' into 'main'
RED-10249: regex found incorrectly due to wrong text sorting

See merge request fforesight/layout-parser!252
2024-11-04 12:51:38 +01:00
Kilian Schüttler
c90874da7a RED-10249: regex found incorrectly due to wrong text sorting 2024-11-04 12:51:37 +01:00
Kilian Schüttler
4683c696a5 Merge branch 'RED-10247' into 'main'
RED-10247: dictionary entry not found in footer due to wrong text sorting

See merge request fforesight/layout-parser!251
2024-10-25 18:30:35 +02:00
Kilian Schuettler
95c02ce3cf RED-10247: dictionary entry not found in footer due to wrong text sorting 2024-10-25 17:18:14 +02:00
Kilian Schüttler
b2d62e32fe Merge branch 'RED-10270-fp' into 'main'
RED-10270: fix NumberFormatException

See merge request fforesight/layout-parser!248
2024-10-24 17:14:47 +02:00
Kilian Schuettler
65c1f03ea3 RED-10270: fix NumberFormatException 2024-10-24 10:59:05 +02:00
Kilian Schüttler
2219519a2b Merge branch 'RED-10127' into 'main'
RED-10127: rename TextPositionSequence to Word

See merge request fforesight/layout-parser!244
2024-10-18 12:20:15 +02:00
Kilian Schüttler
af05218e37 RED-10127: rename TextPositionSequence to Word 2024-10-18 12:20:15 +02:00
Kilian Schüttler
736f531df3 Merge branch 'hotfix' into 'main'
Hotfix

See merge request fforesight/layout-parser!243
2024-10-18 12:12:15 +02:00
Kilian Schüttler
c64445d54b Hotfix 2024-10-18 12:12:15 +02:00
Kilian Schüttler
af29233b10 Merge branch 'feature/RED-10127' into 'main'
RED-10127: add more units

See merge request fforesight/layout-parser!242
2024-10-15 09:57:21 +02:00
Kilian Schuettler
5f04b45554 RED-10127: add more units 2024-10-15 09:47:39 +02:00
Kilian Schüttler
6c41533f0b Merge branch 'feature/RED-10127' into 'main'
RED-10127: improve list classification

See merge request fforesight/layout-parser!240
2024-10-14 17:34:33 +02:00
Kilian Schuettler
9d2596e5ef RED-10127: improve list classification
* add one more format to list identification
* add 'ppb' to known units
* special case for headlines continuing with 14C after the identifier (quite often in some specific files)
2024-10-14 17:21:44 +02:00
Kilian Schüttler
e7b01161ac Merge branch 'feature/RED-10127' into 'main'
RED-10127: add list classification

See merge request fforesight/layout-parser!237
2024-10-10 10:50:10 +02:00
Kilian Schüttler
7b073eb4f3 RED-10127: add list classification 2024-10-10 10:50:10 +02:00
Dominique Eifländer
4b0c041d84 Merge branch 'feature/RED-10127' into 'main'
RED-10127: improve headline detection

See merge request fforesight/layout-parser!235
2024-10-09 08:48:48 +02:00
Kilian Schüttler
6c7442ac6d RED-10127: improve headline detection 2024-10-09 08:48:48 +02:00
Maverick Studer
23e23328ee Merge branch 'RED-10126' into 'main'
RM-187: Footers are recognized in the middle of the page

See merge request fforesight/layout-parser!233
2024-10-08 14:27:45 +02:00
Maverick Studer
9d1ffdd779 RM-187: Footers are recognized in the middle of the page 2024-10-08 14:27:44 +02:00
Maverick Studer
3109a30ae1 Merge branch 'RED-9123-proto' into 'main'
RED-9123: Improve performance of re-analysis (Spike)

See merge request fforesight/layout-parser!232
2024-10-07 12:28:10 +02:00
Maverick Studer
fe2ed1807e RED-9123: Improve performance of re-analysis (Spike) 2024-10-07 12:28:10 +02:00
Maverick Studer
31de229fa5 Merge branch 'feature/RED-9010' into 'main'
RED-9010: remove redaction log

See merge request fforesight/layout-parser!231
2024-09-19 11:34:32 +02:00
Maverick Studer
8a80abfff1 RED-9010: remove redaction log 2024-09-19 11:34:32 +02:00
Dominique Eifländer
7c08905eda Merge branch 'RED-9975-main' into 'main'
RED-9975: Fixed missing section numbers in layout grid

See merge request fforesight/layout-parser!230
2024-09-18 11:29:51 +02:00
Dominique Eifländer
4f40c9dbc9 RED-9975: Fixed missing section numbers in layout grid 2024-09-18 11:22:37 +02:00
Dominique Eifländer
32381b4472 Merge branch 'RED-9974' into 'main'
Red 9974: improce headline classification, fix font size calculation

See merge request fforesight/layout-parser!226
2024-09-16 14:06:48 +02:00
Kilian Schüttler
469da38952 Red 9974: improce headline classification, fix font size calculation 2024-09-16 14:06:48 +02:00
Dominique Eifländer
0f8c4674b3 Merge branch 'hotfix' into 'main'
hotfix: viewerDocService doesn't remove existing marked content

See merge request fforesight/layout-parser!225
2024-09-12 09:12:54 +02:00
Kilian Schuettler
8e165a41d7 hotfix: viewerDocService doesn't remove existing marked content 2024-09-11 16:34:21 +02:00
Kilian Schüttler
ed7a701ad9 Merge branch 'RED-9975' into 'main'
RED-9975: improve SuperSection handling

See merge request fforesight/layout-parser!223
2024-09-11 13:38:09 +02:00
Kilian Schüttler
393103e074 RED-9975: improve SuperSection handling 2024-09-11 13:38:09 +02:00
Dominique Eifländer
bd02066e2c Merge branch 'RED-9976-main' into 'main'
RED-9976: Removed sorting that scrambles text in PDFTextStripper

See merge request fforesight/layout-parser!222
2024-09-10 13:02:36 +02:00
Dominique Eifländer
fec19f4afb RED-9976: Removed sorting that scrambles text in PDFTextStripper 2024-09-10 12:50:37 +02:00
Kilian Schüttler
c726a643f0 Merge branch 'hotfix' into 'main'
Hotfix: unmerge super large tables

See merge request fforesight/layout-parser!220
2024-09-05 15:05:21 +02:00
Kilian Schüttler
519e95735c Hotfix: unmerge super large tables 2024-09-05 15:05:21 +02:00
Maverick Studer
b52af2637f Merge branch 'RED-9942-2' into 'main'
RED-9942: File only with images not recognised

See merge request fforesight/layout-parser!218
2024-09-05 10:49:12 +02:00
Maverick Studer
46ea7edc4c RED-9942: File only with images not recognised 2024-09-05 10:49:12 +02:00
Kilian Schüttler
9650195afd Merge branch 'hotfix-fp' into 'main'
hotfix: add Java advanced imaging

See merge request fforesight/layout-parser!217
2024-09-04 15:43:56 +02:00
Kilian Schuettler
ce628a99f7 hotfix: add Java advanced imaging 2024-09-04 15:18:12 +02:00
Maverick Studer
b66afe135c Merge branch 'RED-9524' into 'main'
RED-9524: File processing does not annotate images

See merge request fforesight/layout-parser!214
2024-09-04 13:27:06 +02:00
Maverick Studer
dc892d0fec RED-9524: File processing does not annotate images 2024-09-04 13:27:06 +02:00
Kilian Schüttler
af45f2cd8c Merge branch 'RED-9964' into 'main'
RED-9964: fix errors with images

See merge request fforesight/layout-parser!212
2024-09-04 09:16:59 +02:00
Kilian Schuettler
befb6b1df6 RED-9964: fix errors with images 2024-09-03 16:37:48 +02:00
Maverick Studer
61efb4cae9 Merge branch 'update-tc' into 'main'
Update tenant-commons for dlq fix

See merge request fforesight/layout-parser!211
2024-09-03 13:50:02 +02:00
maverickstuder
4a06059258 Update tenant-commons for dlq fix 2024-09-03 13:15:08 +02:00
Dominique Eifländer
292e5b215e Merge branch 'RED-9988-main' into 'main'
RED-9988: Fixed NPE when image representation is not present

See merge request fforesight/layout-parser!210
2024-09-02 09:56:53 +02:00
Dominique Eifländer
7c2db6c3c5 RED-9988: Fixed NPE when image representation is not present 2024-09-02 09:51:59 +02:00
Dominique Eifländer
4395074b21 Merge branch 'RED-9975' into 'main'
Red 9975: fix outline detection

See merge request fforesight/layout-parser!206
2024-09-02 09:02:36 +02:00
Kilian Schüttler
8e14b74da2 Red 9975: fix outline detection 2024-09-02 09:02:36 +02:00
Kilian Schüttler
3b91639ea9 Merge branch 'RED-9964-fp' into 'main'
RED-9964: don't merge tables on non-consecutive pages

See merge request fforesight/layout-parser!205
2024-08-30 14:00:48 +02:00
Kilian Schüttler
c5178ea5c2 RED-9964: don't merge tables on non-consecutive pages 2024-08-30 14:00:48 +02:00
Dominique Eifländer
cf39d4dfcc Merge branch 'RED-9974' into 'main'
RED-9974: Improved headline detection for documine old

See merge request fforesight/layout-parser!202
2024-08-30 10:57:20 +02:00
Dominique Eifländer
bb40345f79 RED-9974: Improved headline detection for documine old 2024-08-30 10:36:22 +02:00
Kilian Schüttler
e3e9d16145 Merge branch 'RED-9975' into 'main'
RED-9975: activate outline detection

See merge request fforesight/layout-parser!201
2024-08-29 14:27:00 +02:00
Kilian Schuettler
f6ca5a3c17 RED-9975: activate outline detection 2024-08-29 14:18:29 +02:00
Maverick Studer
15e3dced35 Merge branch 'tenants-retry' into 'main'
Tenants retry logic and queue renames

See merge request fforesight/layout-parser!197
2024-08-29 13:46:54 +02:00
Maverick Studer
933054b332 Tenants retry logic and queue renames 2024-08-29 13:46:54 +02:00
Kilian Schüttler
ab86714cb3 Merge branch 'RED-9975' into 'main'
RED-9975: activate outline detection

See merge request fforesight/layout-parser!198
2024-08-29 12:25:42 +02:00
Kilian Schuettler
8626b106d0 RED-9975: activate outline detection 2024-08-29 12:16:07 +02:00
Maverick Studer
52e948e66c Merge branch 'RED-9331' into 'main'
RED-9331: Explore possibilities for fair upload / analysis processing per tenant

See merge request fforesight/layout-parser!182
2024-08-27 09:27:37 +02:00
Maverick Studer
3b33405cbf RED-9331: Explore possibilities for fair upload / analysis processing per tenant 2024-08-27 09:27:37 +02:00
Maverick Studer
b2fa14dde2 Merge branch 'AZURE_NER' into 'main'
RED-9918: Azure entity recognition (Spike)

See merge request fforesight/layout-parser!196
2024-08-26 14:34:46 +02:00
Maverick Studer
62e07686d7 RED-9918: Azure entity recognition (Spike) 2024-08-26 14:34:46 +02:00
Dominique Eifländer
3eb97d614f Merge branch 'RED-9760-NPE' into 'main'
RED-9760: Fixed nullpointer in TextPageBlock

See merge request fforesight/layout-parser!194
2024-08-13 13:24:48 +02:00
Dominique Eifländer
81469413b0 RED-9760: Fixed nullpointer in TextPageBlock 2024-08-13 13:18:50 +02:00
Dominique Eifländer
2993676a6f Merge branch 'RED-9670' into 'main'
RED-9760: change compareDouble to something sensible

See merge request fforesight/layout-parser!193
2024-08-12 16:02:51 +02:00
Kilian Schüttler
8e115dcd8a RED-9760: change compareDouble to something sensible 2024-08-12 16:02:50 +02:00
Dominique Eifländer
173911b840 Merge branch 'hotfix-reading-order' into 'main'
hotfix: threshold adjustements

See merge request fforesight/layout-parser!192
2024-08-12 14:59:20 +02:00
Kilian Schuettler
b0ae00aa02 hotfix: threshold adjustements 2024-08-12 14:52:18 +02:00
Dominique Eifländer
00bf9f279e Merge branch 'hotfix-reading-order' into 'main'
hotfix: use center coordinates

See merge request fforesight/layout-parser!191
2024-08-09 15:51:42 +02:00
Kilian Schuettler
d16377a24a hotfix: line comparison with center coordinates 2024-08-09 15:45:23 +02:00
Dominique Eifländer
81179ee744 Merge branch 'RED-9760-dcold' into 'main'
RED-9760: Changed lineSeparation threshold for documine old

See merge request fforesight/layout-parser!190
2024-08-09 14:50:39 +02:00
Dominique Eifländer
1953b5924f RED-9760: Changed lineSeparation threshold for documine old 2024-08-09 14:42:14 +02:00
Kilian Schüttler
6f6e8d5d4e Merge branch 'hotfix-reading-order' into 'main'
hotfix reading order

See merge request fforesight/layout-parser!187
2024-08-09 11:49:12 +02:00
Kilian Schüttler
69bcd4f68d hotfix reading order 2024-08-09 11:49:12 +02:00
Timo Bejan
b900cfaf31 Merge branch 'CLARI-140' into 'main'
CLARI-140 - case issue

See merge request fforesight/layout-parser!189
2024-08-08 21:49:40 +02:00
Timo Bejan
cdc2081785 CLARI-140 - case issue 2024-08-08 22:40:11 +03:00
Timo Bejan
a9287ec406 Merge branch 'CLARI-139' into 'main'
CLAR-139 - fixed outline error for unparsable object

See merge request fforesight/layout-parser!188
2024-08-08 16:36:41 +02:00
Timo Bejan
5b6a706c28 CLAR-139 - fixed outline error for unparsable object 2024-08-08 16:20:14 +03:00
Timo Bejan
28d8ad0a3f Merge branch 'CLARI-128' into 'main'
Fixed Index out of bounds exception in blockificationpostprocessingservice -...

See merge request fforesight/layout-parser!186
2024-07-30 16:56:21 +02:00
Timo Bejan
0c1583c1be Fixed Index out of bounds exception in blockificationpostprocessingservice - this could should be documented btw, there are also probably other use-cases where the code doesnt work 2024-07-30 17:45:05 +03:00
Andrei Isvoran
7633566d9b Merge branch 'RED-9607-fp' into 'main'
RED-9607 - Correctly determine text position sequence based on file rotation

See merge request fforesight/layout-parser!184
2024-07-25 14:11:00 +02:00
Andrei Isvoran
cc4f09711e RED-9607 - Correctly determine text position sequence based on file rotation 2024-07-24 16:35:11 +03:00
Kilian Schüttler
370165dc59 Merge branch 'document-data-markdown' into 'main'
CLARI: document-data-markdown

See merge request fforesight/layout-parser!181
2024-07-18 17:19:44 +02:00
Maverick Studer
8c052c38d7 CLARI: document-data-markdown 2024-07-18 17:19:43 +02:00
Kilian Schüttler
ea18d3d307 Merge branch 'RED-8800' into 'main'
RED-8800: adjust coordinates in BE to ignore cropbox

See merge request fforesight/layout-parser!179
2024-07-15 17:45:13 +02:00
Kilian Schüttler
2726fc3fe1 RED-8800: adjust coordinates in BE to ignore cropbox 2024-07-15 17:45:13 +02:00
Kilian Schüttler
033279e261 Merge branch 'RED-9353' into 'main'
RED-9353: refactor PDFTronViewerDocumentService

See merge request fforesight/layout-parser!178
2024-07-15 12:54:17 +02:00
Kilian Schüttler
ec0dd032c9 RED-9353: refactor PDFTronViewerDocumentService 2024-07-15 12:54:17 +02:00
Andrei Isvoran
598fa7f1c7 Merge branch 'RED-9496-shutdown-fp' into 'main'
RED-9496 - Implement graceful shutdown

See merge request fforesight/layout-parser!176
2024-07-04 14:26:00 +02:00
Andrei Isvoran
65b1f7d179 RED-9496 - Implement graceful shutdown 2024-07-04 14:21:20 +03:00
Kilian Schüttler
3173610be5 Merge branch 'CLARI-003' into 'main'
CLARI-003: add treeId to StructureObject

See merge request fforesight/layout-parser!176
2024-07-02 11:37:22 +02:00
Kilian Schuettler
e920eb5a78 CLARI-003: add treeId to StructureObject 2024-07-01 13:56:16 +02:00
Kilian Schüttler
7e4baea7e5 Merge branch 'RED-9353' into 'main'
RED-9353: use azure ocr service

See merge request fforesight/layout-parser!175
2024-07-01 11:13:27 +02:00
Kilian Schüttler
66d3433e04 RED-9353: use azure ocr service 2024-07-01 11:13:26 +02:00
Yannik Hampe
a2f559af51 Merge branch 'RED-3813' into 'main'
RED-3813: Recategorize same image as experimental feature

See merge request fforesight/layout-parser!155
2024-06-26 13:42:42 +02:00
Yannik Hampe
39f527a57c Merge branch 'main' into 'RED-3813'
# Conflicts:
#   layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
2024-06-26 09:10:59 +02:00
yhampe
5c2844fe31 RED-3813: Recategorize same image as experimental feature
fixed failing test
2024-06-26 09:08:37 +02:00
Kilian Schüttler
b216f02e15 Merge branch 'RED-9194' into 'main'
RED-9194: roll back single digit headline change

See merge request fforesight/layout-parser!171
2024-06-21 15:13:40 +02:00
Kilian Schuettler
2e2f30ba35 RED-9194: roll back single digit headline change 2024-06-21 14:42:30 +02:00
Kilian Schuettler
9f7ed974ec RED-9194: roll back single digit headline change 2024-06-21 14:41:30 +02:00
Kilian Schuettler
570a348a77 RED-9194: roll back single digit headline change 2024-06-21 14:39:47 +02:00
Maverick Studer
859dba2ecf Merge branch 'RED-9374' into 'main'
hotfix for table/paragraph section creation on document start before first headline

See merge request fforesight/layout-parser!170
2024-06-18 17:36:04 +02:00
Maverick Studer
1c5d755111 hotfix for table/paragraph section creation on document start before first headline 2024-06-18 17:36:04 +02:00
Maverick Studer
133e06460f Merge branch 'RED-9374' into 'main'
RED-9374: Ner Entities are at wrong locations

See merge request fforesight/layout-parser!169
2024-06-18 16:31:24 +02:00
Maverick Studer
da91fcff97 RED-9374: Ner Entities are at wrong locations 2024-06-18 16:31:24 +02:00
Kilian Schüttler
79795e408a Merge branch 'RED-9194' into 'main'
RED-9194: allow single digit headline identifiers

See merge request fforesight/layout-parser!168
2024-06-07 09:09:25 +02:00
Kilian Schuettler
b719db86ab RED-9194: allow single digit headline identifiers 2024-06-06 16:32:05 +02:00
Maverick Studer
797602e373 Merge branch 'thread-safe-hcs-fields' into 'main'
fixed issue with thread-safety of local fields in the HeadlineClassificationService

See merge request fforesight/layout-parser!167
2024-06-06 14:51:24 +02:00
maverickstuder
3d2f66cf10 fixed issue with thread-safety of local fields in the HeadlineClassificationService:
* HeadlineClassificationService is no singleton anymore
* instead initialize it in the ClassificationService and pass it to the classifyMethods as required
2024-06-06 14:39:23 +02:00
Kilian Schüttler
e304a9f2d7 Merge branch 'RED-7074-le' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!166
2024-06-06 13:22:14 +02:00
Maverick Studer
c05f67cf44 RED-7074: Design Subsection section tree structure algorithm 2024-06-06 13:22:14 +02:00
yhampe
9ecf9ca19f RED-3813: Recategorize same image as experimental feature
now writing hash into structure
2024-06-05 14:20:33 +02:00
Corina Olariu
3a2ee903af Merge branch 'RED-9206-2' into 'main'
RED-9206 - Sections are no longer correctly separated from each other in the test file

See merge request fforesight/layout-parser!165
2024-06-05 13:39:15 +02:00
Corina Olariu
072a8aa3da RED-9206 - Sections are no longer correctly separated from each other in the test file
- add REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH case
2024-06-05 14:26:54 +03:00
Corina Olariu
b5cfa7b63d Merge branch 'RED-9206' into 'main'
RED-9206 - Sections are no longer correctly separated from each other in the test file

See merge request fforesight/layout-parser!163
2024-06-05 13:13:45 +02:00
Corina Olariu
5f5a6258c5 Merge branch 'main' into RED-9206 2024-06-05 13:34:14 +03:00
Maverick Studer
ac0e83725a Merge branch 'RED-7074-lgs' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!164
2024-06-05 12:28:00 +02:00
Maverick Studer
5d33ad570e RED-7074: Design Subsection section tree structure algorithm 2024-06-05 12:28:00 +02:00
Corina Olariu
fd698a78fc RED-9206 - Sections are no longer correctly separated from each other in the test file
- introduce new layout parsing type: REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH to include changes from REDACT_MANAGER apart from duplicate paragraph.
- updated junit tests
-
2024-06-04 20:55:37 +03:00
Maverick Studer
c3edeb3c7d Merge branch 'RED-7074-test' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!162
2024-06-04 15:07:40 +02:00
Maverick Studer
fc06dba2ce RED-7074: Design Subsection section tree structure algorithm 2024-06-04 15:07:40 +02:00
Maverick Studer
b6742c1e89 Merge branch 'RED-7074_2' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!160
2024-05-28 14:48:21 +02:00
Maverick Studer
efb1a748af RED-7074: Design Subsection section tree structure algorithm 2024-05-28 14:48:21 +02:00
yhampe
9be672c728 RED-3813: Recategorize same image as experimental feature
working on pushing properties to persistence service
2024-05-28 13:51:45 +02:00
Maverick Studer
23985b14be Merge branch 'RED-7074_2' into 'main'
RED-7074: Design Subsection section tree structure algorithm

See merge request fforesight/layout-parser!159
2024-05-24 13:30:25 +02:00
Maverick Studer
48b7a22e2b RED-7074: Design Subsection section tree structure algorithm 2024-05-24 13:30:25 +02:00
Corina Olariu
546341ee75 Merge branch 'RED-9177' into 'main'
RED-9177 - Layout parser fails to process file

See merge request fforesight/layout-parser!158
2024-05-22 13:26:10 +02:00
Corina Olariu
0ed1481517 RED-9177 - Layout parser fails to process file
- use originFile as viewerDocumentFile
- return layoutGridOCGName in case the name is found and not check further properties
2024-05-22 13:02:42 +03:00
Andrei Isvoran
b2a47f66ae Merge branch 'RED-9149-header' into 'main'
RED-9149 - Remove header detection

See merge request fforesight/layout-parser!157
2024-05-20 14:12:04 +02:00
Andrei Isvoran
3835d03036 RED-9149 - Remove header detection 2024-05-20 14:59:34 +03:00
yhampe
a5fcebce30 RED-3813: Recategorize same image as experimental feature
added representation to image and DocumentStructure
2024-05-17 07:34:05 +02:00
Dominique Eifländer
b867deb9f9 Merge branch 'CLARI-hotfix' into 'main'
hotifx for clarifynd

See merge request fforesight/layout-parser!154
2024-05-15 14:08:07 +02:00
Kilian Schuettler
8648ed0952 hotifx for clarifynd 2024-05-15 14:02:02 +02:00
Kilian Schüttler
53f786b539 Merge branch 'RED-9149' into 'main'
RED-9149 - Header and footer detection by page-association

See merge request fforesight/layout-parser!150
2024-05-13 14:57:33 +02:00
Andrei Isvoran
40465e8778 RED-9149 - Improvements 2024-05-13 15:13:37 +03:00
Andrei Isvoran
a76b2ace3f RED-9149 - Address comments 2024-05-13 13:18:33 +03:00
Andrei Isvoran
aeaca2f278 RED-9149 - Header and footer extraction by page-association 2024-05-10 16:04:06 +03:00
Andrei Isvoran
f1dbcc24a2 RED-9149 - Header and footer extraction by page-association 2024-05-10 15:49:08 +03:00
Andrei Isvoran
fda25852d1 RED-9149 - Header and footer extraction by page-association 2024-05-10 15:17:41 +03:00
Dominique Eifländer
471fadbcca Merge branch 'RED-8933-4.1' into 'main'
RED-8933: Fixed bugs in DocumineClassificationService

See merge request fforesight/layout-parser!148
2024-05-08 13:31:17 +02:00
Dominique Eifländer
87001090d5 RED-8933: Fixed bugs in DocumineClassificationService 2024-05-08 13:01:23 +02:00
Timo Bejan
ea355429c2 Merge branch 'RED-8825-fix' into 'main'
RED-8825: minor fixes

See merge request fforesight/layout-parser!146
2024-05-07 17:47:07 +02:00
Kilian Schuettler
6a65d7f9fc RED-8825: minor fixes
* also added overrides via env variables
2024-05-07 17:37:42 +02:00
Kilian Schuettler
e935cc7b14 RED-8825: some fixes, and experimental column detector 2024-05-06 14:24:39 +02:00
Kilian Schüttler
07733d0855 Merge branch 'RED-8825' into 'main'
RED-8825: improve layoutparsing

See merge request fforesight/layout-parser!132
2024-05-03 12:03:03 +02:00
Kilian Schuettler
abb249e966 RED-8825: general layoutparsing improvements
* fix checkstyle
2024-05-03 00:15:31 +02:00
Kilian Schuettler
bcd1eb9afa RED-8825: general layoutparsing improvements
* added test for table line classification
2024-05-03 00:13:48 +02:00
Kilian Schuettler
60acbac53f RED-8825: general layoutparsing improvements
* fixing a bunch of coordinates
2024-05-03 00:06:29 +02:00
Kilian Schuettler
a3decd292d RED-8825: general layoutparsing improvements
* fix RulingCleaningService
2024-05-02 23:00:22 +02:00
Kilian Schuettler
b6f0a21886 RED-8825: general layoutparsing improvements
* refactor all coordinates
2024-05-02 21:01:25 +02:00
Kilian Schuettler
d61cac8b4f RED-8825: general layoutparsing improvements
* fix tests
2024-04-30 16:06:22 +02:00
Kilian Schuettler
ae46c5f1ca RED-8825: general layoutparsing improvements
* fix tests
2024-04-30 11:55:18 +02:00
Kilian Schuettler
f0a70a5242 RED-8825: general improvements
* some more refactoring
 * fixed text ruling classification for vertical text
 * shrunk min graphics size
2024-04-30 11:09:23 +02:00
Kilian Schuettler
15ea385f4d RED-8825: general improvements
* some more refactoring
 * fixed text ruling classification for vertical text
 * shrunk min graphics size
2024-04-30 10:44:32 +02:00
Kilian Schuettler
08be18db2d RED-8825: general improvements
* some more refactoring
2024-04-29 20:09:53 +02:00
Kilian Schuettler
64209255cb RED-8825: general improvements
* classify rulings as underline/striketrough
* improve performance of CleanRulings.lineBetween
* use lineBetween where possible
* wip, still todo:
 - Header/Footer by Ruling for all rotations
 - actually the ticket, optimizing layoutparsing for documine
2024-04-29 17:24:15 +02:00
Kilian Schuettler
4761d2e1a2 RED-8825: general improvements
* classify rulings as underline/striketrough
* improve performance of CleanRulings.lineBetween
* use lineBetween where possible
* wip, still todo:
 - Header/Footer by Ruling for all rotations
 - actually the ticket, optimizing layoutparsing for documine
2024-04-29 17:22:33 +02:00
Kilian Schuettler
1916e626df RED-8825: general improvements
* classify rulings as underline/striketrough
* improve performance of CleanRulings.lineBetween
* use lineBetween where possible
* wip, still todo:
 - Header/Footer by Ruling for all rotations
 - actually the ticket, optimizing layoutparsing for documine
2024-04-29 17:15:19 +02:00
Kilian Schuettler
e4663ac8db RED-8825: added split by ruling into every step of docstrum 2024-04-29 15:54:56 +02:00
Kilian Schuettler
6a691183dc RED-8825: improve layoutparsing
* added improved debugging capabilities to viewer-doc
* refactored coordinates (wip)
* refactored line intersection algorithm
* removed cropbox correction from pdfbox text positions
2024-04-29 15:54:56 +02:00
Kilian Schuettler
3dd215288a RED-8825: improve layoutparsing
* added improved debugging capabilities to viewer-doc
* refactored coordinates (wip)
* refactored line intersection algorithm
* removed cropbox correction from pdfbox text positions
2024-04-29 15:54:53 +02:00
Kilian Schüttler
6fb1a0bef3 Merge branch 'RED-8992' into 'main'
RED-8992 - Enable to add annotation on header with line breaks

See merge request fforesight/layout-parser!143
2024-04-25 13:03:40 +02:00
Corina Olariu
4e7c3f584b RED-8992 - Enable to add annotation on header with line breaks
- don't reorder textblocks classified as headers and footers
- add unit test
2024-04-25 11:23:10 +03:00
Yannik Hampe
84bdb4d1ed Merge branch 'RED-8701' into 'main'
RED-8701 - Move files to customer data repositories

See merge request fforesight/layout-parser!137
2024-04-25 09:06:35 +02:00
Dominique Eifländer
75ab4df592 Merge branch 'RED-8932' into 'main'
RED-8932 Fixed not merged headline with identifier

See merge request fforesight/layout-parser!141
2024-04-24 11:55:01 +02:00
Dominique Eifländer
8442e60055 RED-8932 Fixed not merged headline with identifier 2024-04-24 11:45:38 +02:00
Corina Olariu
0ef67fc07b RED-8701 - Move files to customer data repositories
- update junit tests and syngenta submodule
2024-04-23 14:54:56 +03:00
Corina Olariu
ea02f31a84 Merge branch 'main' into RED-8701
# Conflicts:
#	layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
2024-04-23 14:20:00 +03:00
Dominique Eifländer
58acbab85f Merge branch 'RED-8826' into 'main'
Red 8826

See merge request fforesight/layout-parser!138
2024-04-23 13:12:51 +02:00
Kilian Schüttler
d38d023485 Merge branch 'RED-7384' into 'main'
Red 7384

See merge request fforesight/layout-parser!140
2024-04-23 12:13:21 +02:00
Kilian Schüttler
c1afe9b11f Red 7384 2024-04-23 12:13:19 +02:00
Corina Olariu
bdcb9aeda4 RED-8701 - Move files to customer data repositories
- update junit tests
2024-04-23 11:49:29 +03:00
Corina Olariu
6a86036a78 Merge branch 'main' into RED-8701 2024-04-23 11:46:59 +03:00
Corina Olariu
a358d7565e RED-8701 - Move files to customer data repositories
- update junit tests
2024-04-23 11:12:57 +03:00
Corina Olariu
069a6c0b49 RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 10:44:23 +03:00
Dominique Eifländer
683f7f1fb8 RED-8826: Do not classify textblocks in graphics as headlines 2024-04-23 09:28:28 +02:00
Corina Olariu
7eab3a4088 RED-8701 - Move files to customer data repositories
- remove customer files from project
2024-04-22 14:57:51 +03:00
Corina Olariu
970fc99ed1 RED-8701 - Move files to customer data repositories
- update junit test
2024-04-22 14:14:47 +03:00
Corina Olariu
48c54f63a0 RED-8701 - Move files to customer data repositories
- update submodules
2024-04-22 13:57:39 +03:00
Corina Olariu
20e4e5ddff RED-8701 - Move files to customer data repositories
- update unit tests with the new path to submodules for customer files
2024-04-22 13:37:27 +03:00
Dominique Eifländer
b53930328a RED-8826: Implemented graphics detection 2024-04-19 15:05:17 +02:00
Dominique Eifländer
c947d552d2 Merge branch 'RED-8995-fp' into 'main'
RED-8995: unclassified text might be missing from document data

See merge request fforesight/layout-parser!135
2024-04-19 09:21:50 +02:00
Corina Olariu
6b1b5eab84 RED-8701 - Move files to customer data repositories
- add syngenta submodule
2024-04-18 20:33:00 +03:00
Corina Olariu
cc9816c8cb RED-8701 - Move files to customer data repositories
- use git lfs to store customer files
2024-04-18 20:31:35 +03:00
Kilian Schuettler
f256f9b30f RED-8995: unclassified text might be missing from document data
* treat TablePageBlock.OTHER like PARAGRAPH (no special treatment)
2024-04-18 17:42:34 +02:00
Yannik Hampe
6167e3fb57 Merge branch 'RED-8402' into 'main'
RED-8402: Header and footer are not indexed / searched

See merge request fforesight/layout-parser!134
2024-04-18 15:08:00 +02:00
yhampe
a78fb0244a Merge remote-tracking branch 'origin/RED-8402' into RED-8402 2024-04-18 14:39:10 +02:00
yhampe
8099a00bb6 RED-8402: Header and footer are not indexed / searched
added unit test and file
2024-04-18 14:39:01 +02:00
yhampe
9bb0468b2b RED-8402: Header and footer are not indexed / searched
added unit test and file
2024-04-18 14:36:25 +02:00
Kilian Schüttler
c4d9c5df02 Merge branch 'RED-8747-fp' into 'main'
RED-8747 - Entities not merged properly - fp

See merge request fforesight/layout-parser!131
2024-04-09 16:30:02 +02:00
Corina Olariu
976f408237 RED-8747 - Entities not merged properly - fp
- rework the extraction of rulings from the table cells
2024-04-09 14:38:48 +03:00
Corina Olariu
319268c53d RED-8747 - Entities not merged properly - fp
- update test
2024-04-09 12:24:19 +03:00
Corina Olariu
014eba9fc3 RED-8747 - Entities not merged properly - fp
- fix typo
- add validate table test
2024-04-09 12:14:57 +03:00
Yannik Hampe
9bd8419770 Merge branch 'RED-8402' into 'main'
RED-8402: Header and footer are not indexed / searched

See merge request fforesight/layout-parser!128
2024-04-08 12:28:06 +02:00
yhampe
c13ff7fbf6 RED-8402: Header and footer are not indexed / searched
checkstyle
added review comments
2024-04-08 12:17:49 +02:00
yhampe
5d3826e9b9 Merge remote-tracking branch 'origin/RED-8402' into RED-8402 2024-04-08 12:02:47 +02:00
yhampe
0c3194276a RED-8402: Header and footer are not indexed / searched
added headers and footers to simplifiedtext
2024-04-08 12:02:36 +02:00
yhampe
e302d9784e RED-8402: Header and footer are not indexed / searched
added headers and footers to simplifiedtext
2024-04-08 11:59:35 +02:00
Corina Olariu
f185b13f2b RED-8747 - Entities not merged properly - fp
- use the rullings from the found tables instead of all rullings as splitting rullings in the blockification service
2024-04-08 09:42:32 +03:00
Dominique Eifländer
990c376ce6 Merge branch 'RED-8873' into 'main'
RED-8773 - Fix images not appearing on specific file

See merge request fforesight/layout-parser!123
2024-04-05 10:11:23 +02:00
Kilian Schüttler
bf6a0d770b Merge branch 'RED-8799' into 'main'
RED-8799: LayoutGrid is wrong draw for some tables

See merge request fforesight/layout-parser!126
2024-04-04 15:23:12 +02:00
Kilian Schuettler
f18bda1d4e RED-8799: LayoutGrid is wrong draw for some tables 2024-04-04 13:33:22 +02:00
Maverick Studer
0a11992361 Merge branch 'RED-8702' into 'main'
RED-8702: Explore document databases to store entityLog

See merge request fforesight/layout-parser!125
2024-04-03 10:00:38 +02:00
Andrei Isvoran
456b8fe4a1 RED-8773 - Fix images not appearing on specific file 2024-04-03 10:20:46 +03:00
maverickstuder
9778ece992 RED-8702: Explore document databases to store entityLog
* fix for duplicate images in document structure that are linked to multiple sections
2024-04-02 14:19:14 +02:00
Timo Bejan
8bd0de6263 Merge branch 'RED-8827' into 'main'
Red 8827

See merge request fforesight/layout-parser!122
2024-03-22 12:25:36 +01:00
Timo Bejan
5c1708f97f Issue with merging text blocks multiple times 2024-03-22 12:47:05 +02:00
Timo Bejan
a35d77be2e ignore mc files 2024-03-22 10:22:00 +02:00
Dominique Eifländer
631160eb22 Merge branch 'RED-8627' into 'main'
RED-8627: Fixed scrambled text after sorting

See merge request fforesight/layout-parser!120
2024-03-19 11:09:48 +01:00
Dominique Eifländer
8e7e588d26 RED-8627: Fixed scrambled text after sorting 2024-03-19 10:58:36 +01:00
Dominique Eifländer
ac850c2626 Merge branch 'RED-7141' into 'main'
RED-7141: Fixed more overlap problems

See merge request fforesight/layout-parser!119
2024-03-14 16:46:10 +01:00
Dominique Eifländer
1d765a6baa RED-7141: Fixed more overlap problems 2024-03-14 16:30:52 +01:00
Dominique Eifländer
c55984aa67 Merge branch 'RED-7141' into 'main'
RED-7141: Fixed overlapping blocks

See merge request fforesight/layout-parser!118
2024-03-14 09:09:52 +01:00
Dominique Eifländer
27aa418029 RED-7141: Fixed overlapping blocks 2024-03-13 16:14:55 +01:00
Dominique Eifländer
c4edff4696 Merge branch 'RED-7141' into 'main'
RED-7141: Readded lost mergeLinesInZones

See merge request fforesight/layout-parser!116
2024-03-12 13:49:09 +01:00
Dominique Eifländer
92fd1a72de RED-7141: Readded lost mergeLinesInZones 2024-03-12 13:42:40 +01:00
Dominique Eifländer
0d3d25e7d7 Merge branch 'RED-7141-hotfix' into 'main'
RED-7141: Align backend text sorting with Webviewer sorting

See merge request fforesight/layout-parser!115
2024-03-12 11:15:41 +01:00
maverickstuder
956fbff872 RED-7141: Align backend text sorting with Webviewer sorting
* hotfix for tables not being detected due to wrong x-y-sorting
2024-03-12 11:06:53 +01:00
Maverick Studer
2488009af1 Merge branch 'RED-8715' into 'main'
RED-8715: Improve NearestNeighbor Algorithm in LayoutParser

See merge request fforesight/layout-parser!114
2024-03-11 15:10:41 +01:00
maverickstuder
16be2467fd RED-8715: Improve NearestNeighbor Algorithm in LayoutParser
* replaced the old algorithm with an algorithm based on a kd-tree
2024-03-11 14:42:28 +01:00
Timo Bejan
f4cae8a7dc Merge branch 'Clarifynd' into 'main'
Clarifynd

See merge request fforesight/layout-parser!113
2024-03-11 10:37:05 +01:00
Timo Bejan
dfc23955d7 Linespacing claryfind 2024-03-11 11:30:51 +02:00
Dominique Eifländer
d6e3d6fe22 Clarifynd 2024-03-11 11:24:58 +02:00
Timo Bejan
bef23e38b5 Merge branch 'clari-30' into 'main'
CLARI-30 - forward analysis headers

See merge request fforesight/layout-parser!112
2024-03-08 15:51:53 +01:00
Timo Bejan
65ab7a1912 CLARI-30 - forward analysis headers 2024-03-08 16:47:27 +02:00
Timo Bejan
d80231e4a9 Merge branch 'clari-30' into 'main'
CLARI-30 - identifier fix for clarifynd

See merge request fforesight/layout-parser!111
2024-03-08 15:28:33 +01:00
Timo Bejan
56c07a4491 CLARI-30 - identifier fix for clarifynd 2024-03-08 16:23:27 +02:00
Dominique Eifländer
0b4ad29dcb Merge branch 'RED-7141' into 'main'
RED-7141: Implemented docstrum layout parsing

See merge request fforesight/layout-parser!108
2024-03-08 14:27:59 +01:00
Dominique Eifländer
0ad0cd45d6 RED-7141: Moved docstrum to root level of processor package 2024-03-08 14:20:28 +01:00
Dominique Eifländer
d659fe7234 RED-7141: Performance improvments 2024-03-08 10:00:52 +01:00
Dominique Eifländer
cb9127b4f3 RED-7141: Fixed pr finding and improved speed 2024-03-07 16:51:48 +01:00
Timo Bejan
05523585c0 orchestrator/persistence service should control queues 2024-03-06 16:55:44 +02:00
Timo Bejan
4ced572949 orchestrator/persistence service should control queues 2024-03-06 16:53:10 +02:00
Dominique Eifländer
79239b751d RED-7141: Implemented docstrum layout parsing 2024-03-06 11:18:40 +01:00
Yannik Hampe
f146beeb44 Merge branch 'RED-8481-hotfix' into 'main'
RED-8481: Use visual layout parsing to detect signatures

See merge request fforesight/layout-parser!106
2024-02-29 09:39:17 +01:00
yhampe
f8a4ccfff0 Merge remote-tracking branch 'origin/RED-8481-hotfix' into RED-8481-hotfix 2024-02-29 09:22:41 +01:00
yhampe
a6ba501fa8 RED-8481: Use visual layout parsing to detect signatures
fixed some nullpointer errors
2024-02-29 09:22:27 +01:00
yhampe
7dfb3b2b52 RED-8481: Use visual layout parsing to detect signatures
fixed some nullpointer errors
2024-02-29 09:21:47 +01:00
Kilian Schüttler
c324d3815e Merge branch 'RED-8550-pre-backup' into 'main'
RED-8550: Faulty table recognition and text duplication leads to huge sections

See merge request fforesight/layout-parser!105
2024-02-28 16:13:56 +01:00
Maverick Studer
74f55a5cbf RED-8550: Faulty table recognition and text duplication leads to huge sections 2024-02-28 16:13:56 +01:00
Kilian Schüttler
e7bf607663 Merge branch 'hotfix' into 'main'
hotfix: double viewerdoc writes in rare cases lead to some contentstreams not being written

See merge request fforesight/layout-parser!104
2024-02-26 12:32:14 +01:00
Kilian Schuettler
f4d789311c hotfix: double viewerdoc writes in rare cases lead to some contentstreams not being written 2024-02-26 12:24:15 +01:00
Yannik Hampe
9817eae897 Merge branch 'RED-8481-fix' into 'main'
Red 8481 fix

See merge request fforesight/layout-parser!103
2024-02-23 14:08:57 +01:00
yhampe
477f6af886 RED-8481: Use visual layout parsing to detect signatures
added a new layer for visual parsing results

checkstyle
2024-02-23 14:02:53 +01:00
yhampe
2c171b6a9e RED-8481: Use visual layout parsing to detect signatures
added a new layer for visual parsing results

codestyle
2024-02-23 13:55:11 +01:00
yhampe
71477dabde RED-8481: Use visual layout parsing to detect signatures
added a new layer for visual parsing results

codestyle
2024-02-23 12:46:51 +01:00
yhampe
a927cbd9dc RED-8481: Use visual layout parsing to detect signatures
added a new layer for visual parsing results

fixed tests
2024-02-23 12:38:05 +01:00
yhampe
a1521877d7 RED-8481: Use visual layout parsing to detect signatures
added a new layer for visual parsing results

added a source label to image properties to enable rules
2024-02-23 12:20:11 +01:00
Maverick Studer
f4b6386e1c Merge branch 'RED-8550-testing' into 'main'
RED-8550: Faulty table recognition and text duplication leads to huge sections

See merge request fforesight/layout-parser!102
2024-02-21 13:54:30 +01:00
Maverick Studer
1d64028158 RED-8550: Faulty table recognition and text duplication leads to huge sections 2024-02-21 13:54:30 +01:00
Yannik Hampe
0979a267d4 Merge branch 'RED-8481' into 'main'
RED-8481: Use visual layout parsing to detect signatures

See merge request fforesight/layout-parser!101
2024-02-15 13:07:06 +01:00
yhampe
cc77d19500 RED-8481: Use visual layout parsing to detect signatures
addressed review comments
2024-02-15 13:01:30 +01:00
yhampe
fa048b2fe0 RED-8481: Use visual layout parsing to detect signatures
addressed review comments
2024-02-15 12:19:26 +01:00
yhampe
bdf1161c91 RED-8481: Use visual layout parsing to detect signatures
addressed review comments
2024-02-15 12:12:23 +01:00
yhampe
b4a225144d RED-8481: Use visual layout parsing to detect signatures
working on failing tests
2024-02-15 10:16:07 +01:00
yhampe
903b1c1fd4 RED-8481: Use visual layout parsing to detect signatures
fixed failing tests because of null pointer
2024-02-15 09:27:07 +01:00
yhampe
c3e7582ee3 RED-8481: Use visual layout parsing to detect signatures
fixed failing tests because of null pointer
2024-02-14 12:33:36 +01:00
yhampe
cfc5db45cd RED-8481: Use visual layout parsing to detect signatures
fixed failing tests because of null pointer
2024-02-14 12:24:32 +01:00
yhampe
fbd0196719 RED-8481: Use visual layout parsing to detect signatures
implemented visuallayoutparsingresult
2024-02-14 12:16:37 +01:00
Kilian Schüttler
3c9049dc8a Merge branch 'RED-8156' into 'main'
RED-8156: refactor ViewerDocumentService as a dependency for ocr-service

See merge request fforesight/layout-parser!99
2024-02-07 10:42:21 +01:00
Kilian Schuettler
015984891f RED-8156: refactor ViewerDocumentService as a dependency for ocr-service
* fix pmd
2024-02-06 17:17:26 +01:00
Kilian Schuettler
66fcb62833 RED-8156: refactor ViewerDocumentService as a dependency for ocr-service
* fix pmd
2024-02-06 17:09:21 +01:00
Kilian Schuettler
48824f56a8 RED-8156: refactor ViewerDocumentService as a dependency for ocr-service
* fix pmd
2024-02-06 17:06:53 +01:00
Kilian Schuettler
785628537f RED-8156: refactor ViewerDocumentService as a dependency for ocr-service
* various improvements to experimental parsing steps
* added embed fonts functionality to viewer doc
* fix checkstyle
2024-02-06 17:03:38 +01:00
Kilian Schuettler
23eb0c40a3 RED-8156: refactor ViewerDocumentService as a dependency for ocr-service
* various improvements to experimental parsing steps
* added embed fonts functionality to viewer doc
2024-02-06 16:59:51 +01:00
Dominique Eifländer
1b4aaf4454 Merge branch 'RED-8171' into 'main'
RED-8171: Traces do not stop at @Async

See merge request fforesight/layout-parser!98
2024-02-02 13:34:44 +01:00
Dominique Eifländer
e4f3557b36 RED-8171: Traces do not stop at @Async 2024-02-02 13:22:57 +01:00
Timo Bejan
9be3c86297 Merge branch 'RED-8085' into 'main'
Red 8085

See merge request fforesight/layout-parser!96
2024-01-29 10:31:36 +01:00
Timo Bejan
88855de2da Red 8085 2024-01-29 10:31:36 +01:00
Dominique Eifländer
368a75e985 Merge branch 'RED-8106' into 'main'
RED-8106: Make documentdata serializable

See merge request fforesight/layout-parser!95
2023-12-22 13:33:02 +01:00
Dominique Eifländer
12344d57b2 RED-8106: Make documentdata serializable 2023-12-21 13:42:25 +01:00
Dominique Eifländer
9e854379e7 Merge branch 'RED-1137' into 'main'
RED-1137: Do not observe actuator endpoints

See merge request fforesight/layout-parser!92
2023-12-20 14:11:31 +01:00
Dominique Eifländer
b779c72041 RED-1137: Do not observe actuator endpoints 2023-12-20 14:05:00 +01:00
Dominique Eifländer
760a809900 Merge branch 'RED-7384' into 'main'
RED-7384: fixes for migration

See merge request fforesight/layout-parser!91
2023-12-20 12:40:00 +01:00
Kilian Schüttler
ba1c7c07ab RED-7384: fixes for migration 2023-12-20 12:40:00 +01:00
Dominique Eifländer
ca0cbbcb49 Merge branch 'RED-5223' into 'main'
RED-5223: Use tracing-commons from fforesight

See merge request fforesight/layout-parser!90
2023-12-13 16:05:01 +01:00
Dominique Eifländer
da2cdc288e RED-5223: Use tracing-commons from fforesight 2023-12-13 15:31:26 +01:00
Dominique Eifländer
68da328889 Merge branch 'queueHotfix' into 'main'
hotfix: removed dlq from response queue to be equal to persistence-service

See merge request fforesight/layout-parser!89
2023-12-13 09:54:43 +01:00
Dominique Eifländer
711548d1a7 hotfix: removed dlq from response queue to be equal to persistence-service 2023-12-13 09:47:27 +01:00
Dominique Eifländer
2bddcdafee Merge branch 'RED-5223' into 'main'
RED-5223: Enabled tracing, upgrade spring, use logstash-logback-encoder for json logs

See merge request fforesight/layout-parser!88
2023-12-11 15:13:22 +01:00
Dominique Eifländer
750ccf4ce2 RED-5223: Enabled tracing, upgrade spring, use logstash-logback-encoder for json logs 2023-12-11 15:06:23 +01:00
Ali Oezyetimoglu
57b5d3f48e Merge branch 'RED-7714' into 'main'
RED-7715 - Add log4j config to enable switching between json/line logs

See merge request fforesight/layout-parser!87
2023-12-06 13:00:28 +01:00
Andrei Isvoran
d8c9659469 RED-7715 - Add log4j config to enable switching between json/line logs 2023-12-06 11:59:42 +02:00
Kilian Schüttler
30f060e36c Merge branch 'enable-caching' into 'main'
enable caching in build

See merge request fforesight/layout-parser!84
2023-11-24 10:36:55 +01:00
Kilian Schuettler
53a5824e6c enable caching in build 2023-11-24 10:24:50 +01:00
Dominique Eifländer
e2bcf971c9 Merge branch 'DM-589' into 'main'
DM-589: Filter wrong detected cells that borders from rotation at scanning

See merge request fforesight/layout-parser!83
2023-11-20 16:07:27 +01:00
Dominique Eifländer
dacc2f7f43 DM-589: Filter wrong detected cells that borders from rotation at scanning 2023-11-20 15:54:02 +01:00
Dominique Eifländer
144a9591a2 Merge branch 'TAAS-103-hotfix' into 'main'
* added back in if statement

See merge request fforesight/layout-parser!82
2023-11-16 12:48:48 +01:00
yhampe
207d9dec97 * added back in if statement
* removed not needed commentar
2023-11-16 12:40:49 +01:00
Yannik Hampe
09ee90222e Merge branch 'TAAS-103' into 'main'
TAAS-103: Table Detection and rotated text

See merge request fforesight/layout-parser!81
2023-11-16 09:13:41 +01:00
yhampe
1316a067fe * removed double chechking for height of cell 2023-11-16 08:51:12 +01:00
yhampe
e203210ade * removed not needed properties 2023-11-16 08:23:58 +01:00
yhampe
b25d46291a * checkstyle 2023-11-16 08:12:47 +01:00
yhampe
84148d3b6e * fixed tests 2023-11-16 07:51:08 +01:00
Dominique Eifländer
a6ba66b1aa TAAS-103: Fixed values in wrong cells 2023-11-15 13:36:46 +01:00
yhampe
c3e69b2cdf * fixed bug with incorrect empty cell count by adding threshhold to cell.contains 2023-11-15 10:44:47 +01:00
yhampe
f69331e7d8 *renamed page to firstPage in DocumentStructure and Table 2023-11-07 10:21:19 +01:00
yhampe
01493dc033 TAAS-103: Table Detection and rotated text
* added page property to DocumentStructure to be able to get page of found tables

* added a method to TableExtractionService to get the table area

* added calculateMinCharWidthAndMaxCharHeightInsideTable to LayoutParsingPipeline to calculate the values based upon table area

* refactored PDFLinesTextStripper for better readability

*removed textMatrix from RedTextPosition as it is no longer needed
2023-11-07 08:47:28 +01:00
yhampe
459e0c8be7 TAAS-103: 2023-11-07 08:39:15 +01:00
Kilian Schüttler
1b1f777706 Merge branch 'RED-7806' into 'main'
RED-7806 - Specific customer document cannot be processed

See merge request fforesight/layout-parser!79
2023-10-25 10:50:34 +02:00
Corina Olariu
0e0a811f9d RED-7806 - Specific customer document cannot be processed
- add brackets
2023-10-25 11:36:54 +03:00
Corina Olariu
efa3d75479 RED-7806 - Specific customer document cannot be processed
- check for font name null before using to avoid the NPE
2023-10-25 09:16:47 +03:00
Kilian Schüttler
9abdc6d44d Merge branch 'RED-7434' into 'main'
RED-7434 - Remove Section Grid entirely

See merge request fforesight/layout-parser!78
2023-10-20 10:07:01 +02:00
Corina Olariu
3bab61c446 RED-7434 - Remove Section Grid entirely
- remove sectionGrid relation (including SectionGridCreatorService)
- update junit tests
2023-10-20 09:09:22 +03:00
Dominique Eifländer
d17517d3c3 Merge branch 'hotfix-bdr-doc' into 'main'
hotfix: Fixed parsing for specific taas document

See merge request fforesight/layout-parser!77
2023-10-18 16:12:15 +02:00
Dominique Eifländer
567cbc178b hotfix: Fixed parsing for specific taas document 2023-10-17 15:52:19 +02:00
Kilian Schüttler
3c53772765 Merge branch 'RED-7759' into 'main'
RED-7759: Upgraded storage-commons to newest windwos compatible version

See merge request fforesight/layout-parser!76
2023-10-13 12:21:04 +02:00
Dominique Eifländer
8647cf5a18 RED-7759: Upgraded storage-commons to newest windwos compatible version 2023-10-13 12:15:22 +02:00
Kilian Schüttler
310c07b200 Merge branch 'RED-7607-WIP' into 'main'
RED-7607 - Rotating pages leads to lost annotations (RM & DM)

See merge request fforesight/layout-parser!75
2023-10-05 13:34:12 +02:00
Corina Olariu
daba0bf8a6 RED-7607 - Rotating pages leads to lost annotations (RM & DM)
- remove finally clause
2023-10-04 17:46:46 +03:00
Corina Olariu
3839de215c RED-7607 - Rotating pages leads to lost annotations (RM & DM)
- rollback to getDir().getDegrees()
2023-10-04 15:27:13 +03:00
Corina Olariu
b4d68594f1 RED-7607 - Rotating pages leads to lost annotations (RM & DM)
- use rotation instead of getDir().getDegrees()
2023-10-04 14:22:15 +03:00
Corina Olariu
99ed331a1e RED-7607 - Rotating pages leads to lost annotations (RM & DM)
- use getXDirAdj instead of getX
- add fontSizeCounter for landscape pages also
2023-10-04 14:13:38 +03:00
Corina Olariu
f2c0991987 RED-7607 - Rotating pages leads to lost annotations (RM & DM)
- fix PMD findings
2023-10-04 14:09:46 +03:00
Timo Bejan
b8ef55e6e2 Merge branch 'TAAS-104' into 'main'
TAAS-104: merge visually intersecting Paragraphs

See merge request fforesight/layout-parser!73
2023-09-05 17:08:40 +02:00
Kilian Schuettler
5792ff4a93 TAAS-104: merge visually intersecting Paragraphs
* fix build
2023-09-05 16:54:23 +02:00
Kilian Schuettler
621c3f269d TAAS-104: merge visually intersecting Paragraphs 2023-09-05 16:09:05 +02:00
Dominique Eifländer
8dba392904 Merge branch 'RED-7461' into 'main'
RED-7461: Fixed wrong textblock classifation if footer is marked as header

See merge request fforesight/layout-parser!72
2023-09-01 12:14:38 +02:00
deiflaender
306a53ea79 RED-7461: Fixed wrong textblock classifation if footer is marked as header 2023-09-01 12:07:47 +02:00
Kilian Schüttler
754fd8f933 Merge branch 'TAAS-89' into 'main'
TAAS-89: added log entry and an end2end test

See merge request fforesight/layout-parser!71
2023-08-31 14:40:48 +02:00
Kilian Schuettler
28ec4c9ccb TAAS-89: added log entry and an end2end test 2023-08-31 14:28:18 +02:00
Kilian Schüttler
aed4a55787 Merge branch 'TAAS-89' into 'main'
TAAS-89: fixed weird bug with empty sections

See merge request fforesight/layout-parser!70
2023-08-31 12:00:18 +02:00
Kilian Schuettler
f87e2d75b5 TAAS-89: fixed weird bug with empty sections 2023-08-31 11:41:22 +02:00
Kilian Schüttler
de6760abc1 Merge branch 'TAAS-89' into 'main'
TAAS-89: added some more documentation

See merge request fforesight/layout-parser!69
2023-08-31 10:55:45 +02:00
Kilian Schuettler
261ef4c367 TAAS-89: added some more documentation
* fixed weird bug with empty sections
2023-08-31 10:49:32 +02:00
Timo Bejan
11ba9c6bb9 Merge branch 'TAAS-89' into 'main'
Added some documentation

See merge request fforesight/layout-parser!64
2023-08-25 16:34:18 +02:00
Renovate Bot
b7c3d02978 Merge branch 'renovate/main-spring-boot' into 'main'
Update spring boot to v3.1.3 (main)

See merge request fforesight/layout-parser!63
2023-08-24 21:15:35 +02:00
Kilian Schuettler
bcf0bcbaf4 Added some documentation 2023-08-24 18:37:47 +02:00
Renovate Bot
84cde2a3db Update spring boot to v3.1.3 2023-08-24 13:16:58 +00:00
Renovate Bot
6f2dd4f823 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.40.0 (main)

See merge request fforesight/layout-parser!61
2023-08-24 09:17:14 +02:00
Renovate Bot
a909724217 Update dependency com.iqser.red.commons:storage-commons to v2.40.0 2023-08-24 04:16:37 +00:00
Renovate Bot
67a981e7a8 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.536 (main)

See merge request fforesight/layout-parser!60
2023-08-24 03:15:54 +02:00
Renovate Bot
0e93fdd515 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.536 2023-08-23 22:17:08 +00:00
Renovate Bot
d464239f9b Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.144.0 (main)

See merge request fforesight/layout-parser!59
2023-08-24 00:16:53 +02:00
Renovate Bot
88a20924b9 Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.144.0 2023-08-23 19:14:51 +00:00
Renovate Bot
f89243472c Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.39.0 (main)

See merge request fforesight/layout-parser!58
2023-08-23 09:17:15 +02:00
Renovate Bot
ad3612acd4 Update dependency com.iqser.red.commons:storage-commons to v2.39.0 2023-08-23 04:17:21 +00:00
Renovate Bot
630eee6bd7 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.535 (main)

See merge request fforesight/layout-parser!57
2023-08-23 03:13:39 +02:00
Renovate Bot
a951911ec8 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.535 2023-08-22 22:16:17 +00:00
Renovate Bot
75e6b88705 Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.140.0 (main)

See merge request fforesight/layout-parser!56
2023-08-22 21:18:37 +02:00
Renovate Bot
2e0adbdd9a Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.140.0 2023-08-22 16:17:49 +00:00
Renovate Bot
b747742558 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.38.0 (main)

See merge request fforesight/layout-parser!55
2023-08-22 15:17:43 +02:00
Renovate Bot
192c9976c1 Update dependency com.iqser.red.commons:storage-commons to v2.38.0 2023-08-22 10:17:05 +00:00
Dominique Eifländer
b251697492 Merge branch 'PDFBox-update' into 'main'
upgrade PDFBox to 3.0.0

See merge request fforesight/layout-parser!52
2023-08-22 09:39:53 +02:00
Renovate Bot
22d6b25fe4 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.534 (main)

See merge request fforesight/layout-parser!53
2023-08-22 09:17:26 +02:00
Renovate Bot
e6bcd6fb2b Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.534 2023-08-22 04:17:20 +00:00
Renovate Bot
2847adde22 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.37.0 (main)

See merge request fforesight/layout-parser!54
2023-08-22 06:16:54 +02:00
Renovate Bot
7cf67d7121 Update dependency com.iqser.red.commons:storage-commons to v2.37.0 2023-08-22 01:17:23 +00:00
Kilian Schuettler
3a18923ef5 upgrade PDFBox to 3.0.0
* disable experimental ruling header stuff
2023-08-21 17:54:20 +02:00
Kilian Schuettler
2b15fd1d3c RED-7461: improve header/footer recognition 2023-08-21 17:49:13 +02:00
Dominique Eifländer
3722fff476 Merge branch 'RED-7461' into 'main'
Red 7461

See merge request fforesight/layout-parser!51
2023-08-21 17:08:10 +02:00
deiflaender
0cb8029f0a RED-7461: Fixed pr findings 2023-08-21 16:57:37 +02:00
deiflaender
b270b9c942 RED-7461: Use marked content to classify headers and footers if available 2023-08-21 16:02:24 +02:00
deiflaender
60615ec5d8 RED-7461: First working iteration of header and footer improvement 2023-08-21 15:31:11 +02:00
Renovate Bot
880914a167 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.36.0 (main)

See merge request fforesight/layout-parser!50
2023-08-19 09:17:27 +02:00
Renovate Bot
a80a93d2b0 Update dependency com.iqser.red.commons:storage-commons to v2.36.0 2023-08-19 04:15:48 +00:00
Renovate Bot
0afa7e5b12 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.533 (main)

See merge request fforesight/layout-parser!49
2023-08-19 03:14:12 +02:00
Renovate Bot
12516ebf22 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.533 2023-08-18 22:13:37 +00:00
Renovate Bot
0dca90c3fe Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.138.0 (main)

See merge request fforesight/layout-parser!48
2023-08-18 21:14:23 +02:00
Renovate Bot
2506c9e091 Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.138.0 2023-08-18 16:15:53 +00:00
Timo Bejan
83d39ba3a5 Fixed issue with weird colors 2023-08-18 16:21:45 +03:00
Renovate Bot
c09bb06da6 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.35.0 (main)

See merge request fforesight/layout-parser!46
2023-08-18 09:17:56 +02:00
Renovate Bot
1793b1138e Update dependency com.iqser.red.commons:storage-commons to v2.35.0 2023-08-18 04:17:52 +00:00
Renovate Bot
d30735bc49 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.532 (main)

See merge request fforesight/layout-parser!45
2023-08-18 03:12:57 +02:00
Renovate Bot
9356db5373 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.532 2023-08-17 22:15:27 +00:00
Renovate Bot
ee766e7150 Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.135.0 (main)

See merge request fforesight/layout-parser!44
2023-08-17 21:15:12 +02:00
Renovate Bot
a33bbc9abc Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.135.0 2023-08-17 16:16:19 +00:00
Renovate Bot
5758295fac Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.132.0 (main)

See merge request fforesight/layout-parser!43
2023-08-17 12:15:34 +02:00
Renovate Bot
8142d0aa09 Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.132.0 2023-08-17 07:17:11 +00:00
Renovate Bot
dc80353a5b Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.34.0 (main)

See merge request fforesight/layout-parser!42
2023-08-17 09:16:50 +02:00
Renovate Bot
4d856b04b3 Update dependency com.iqser.red.commons:storage-commons to v2.34.0 2023-08-17 04:16:30 +00:00
Renovate Bot
6ba25ecaa0 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.531 (main)

See merge request fforesight/layout-parser!41
2023-08-17 06:16:08 +02:00
Renovate Bot
fcdcaf16e9 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.531 2023-08-16 22:15:33 +00:00
Renovate Bot
086e338f4a Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.131.0 (main)

See merge request fforesight/layout-parser!40
2023-08-17 00:15:20 +02:00
Renovate Bot
9dbe73f376 Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.131.0 2023-08-16 19:16:16 +00:00
Renovate Bot
aaf4015c95 Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.130.0 (main)

See merge request fforesight/layout-parser!39
2023-08-16 18:16:44 +02:00
Renovate Bot
2b65ad4b4b Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.130.0 2023-08-16 13:13:52 +00:00
Renovate Bot
c0c75f6a0e Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.33.0 (main)

See merge request fforesight/layout-parser!38
2023-08-16 09:12:41 +02:00
Renovate Bot
2f4af6e377 Update dependency com.iqser.red.commons:storage-commons to v2.33.0 2023-08-16 04:13:49 +00:00
Renovate Bot
b9a305bf2d Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.530 (main)

See merge request fforesight/layout-parser!37
2023-08-16 03:13:39 +02:00
Renovate Bot
db6b6af4d7 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.530 2023-08-15 22:12:09 +00:00
Renovate Bot
d73addf7ed Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.127.0 (main)

See merge request fforesight/layout-parser!36
2023-08-15 18:14:12 +02:00
Renovate Bot
c7978c93c2 Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.127.0 2023-08-15 13:14:00 +00:00
Kilian Schüttler
457f7d9c66 Merge branch 'RED-7158' into 'main'
RED-7158: fix for all page rotations

See merge request fforesight/layout-parser!35
2023-08-15 15:07:04 +02:00
Kilian Schuettler
0387cdd143 RED-7158: fix for all page rotations
* also make lines thinner
2023-08-15 14:55:41 +02:00
Kilian Schüttler
5c6898b975 Merge branch 'RED-7158' into 'main'
RED-7158: add layoutgrid into new ViewerDocument as optional content

See merge request fforesight/layout-parser!34
2023-08-15 13:22:06 +02:00
Kilian Schuettler
b7b273b47d RED-7158: layout grid
* downgraded storage-commons to working version
2023-08-15 13:15:44 +02:00
Kilian Schuettler
9aa9cb2d54 RED-7158: add layoutgrid into new ViewerDocument as optional content
* set layer to invisible by default
2023-08-15 13:14:16 +02:00
Renovate Bot
ee6c21638f Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.32.0 (main)

See merge request fforesight/layout-parser!33
2023-08-15 09:12:24 +02:00
Renovate Bot
1e4475afdf Update dependency com.iqser.red.commons:storage-commons to v2.32.0 2023-08-15 04:13:57 +00:00
Renovate Bot
708d274ebc Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.529 (main)

See merge request fforesight/layout-parser!32
2023-08-15 03:13:02 +02:00
Renovate Bot
a94faad870 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.529 2023-08-14 22:12:08 +00:00
Kilian Schüttler
d854125867 Merge branch 'RED-7158' into 'main'
RED-7158: add layoutgrid into new ViewerDocument as optional content

See merge request fforesight/layout-parser!31
2023-08-14 16:14:23 +02:00
Kilian Schuettler
63de8ef82d RED-7158: add layoutgrid into new ViewerDocument as optional content
* downgraded storage-commons
2023-08-14 16:07:11 +02:00
Kilian Schuettler
ea0af08c31 RED-7851: add layoutgrid to new viewer document as optional content 2023-08-14 16:06:23 +02:00
Renovate Bot
810caa0624 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.31.0 (main)

See merge request fforesight/layout-parser!30
2023-08-12 09:14:34 +02:00
Renovate Bot
c282372dc8 Update dependency com.iqser.red.commons:storage-commons to v2.31.0 2023-08-12 04:15:18 +00:00
Renovate Bot
055ccd3366 Merge branch 'renovate/main-plugins-(non-major)' into 'main'
Update plugin io.freefair.lombok to v8.2.2 (main)

See merge request fforesight/layout-parser!29
2023-08-12 06:14:53 +02:00
Renovate Bot
4b4c73fb7b Update plugin io.freefair.lombok to v8.2.2 2023-08-12 01:13:35 +00:00
Renovate Bot
35b9cfd1c2 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.528 (main)

See merge request fforesight/layout-parser!28
2023-08-12 03:13:18 +02:00
Renovate Bot
9a73b952cf Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.528 2023-08-11 22:13:57 +00:00
Renovate Bot
a1c73094f1 Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.126.0 (main)

See merge request fforesight/layout-parser!27
2023-08-11 18:17:30 +02:00
Renovate Bot
b79d9946a9 Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.126.0 2023-08-11 13:13:58 +00:00
Renovate Bot
a9735daa04 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.30.0 (main)

See merge request fforesight/layout-parser!26
2023-08-11 15:13:39 +02:00
Renovate Bot
d00491c15e Update dependency com.iqser.red.commons:storage-commons to v2.30.0 2023-08-11 10:15:55 +00:00
deiflaender
ed48b6a4bf RED-6725: Fixed wrong file encoding in container, that leads to not working rules on terms with special chars 2023-08-11 11:16:07 +02:00
Renovate Bot
62eade84b9 Merge branch 'renovate/main-plugins-(non-major)' into 'main'
Update plugin io.freefair.lombok to v8.2.1 (main)

See merge request fforesight/layout-parser!25
2023-08-11 09:13:51 +02:00
Renovate Bot
d6a217fe70 Update plugin io.freefair.lombok to v8.2.1 2023-08-11 04:15:01 +00:00
Renovate Bot
f1e4d0d52b Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.527 (main)

See merge request fforesight/layout-parser!24
2023-08-11 06:14:44 +02:00
Renovate Bot
38a1e8b95f Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.527 2023-08-11 01:14:44 +00:00
Renovate Bot
a371558f8c Merge branch 'renovate/main-plugins-(non-major)' into 'main'
Update plugin io.spring.dependency-management to v1.1.3 (main)

See merge request fforesight/layout-parser!23
2023-08-10 21:14:45 +02:00
Renovate Bot
b716b187eb Update plugin io.spring.dependency-management to v1.1.3 2023-08-10 16:14:50 +00:00
Kilian Schuettler
0be6454a7e add script for pushing custom images 2023-08-10 15:47:52 +02:00
Renovate Bot
5fde631e04 Merge branch 'renovate/main-spring-boot' into 'main'
Update spring boot to v3.1.2 (main)

See merge request fforesight/layout-parser!20
2023-08-10 12:15:11 +02:00
Renovate Bot
c076c10840 Update spring boot to v3.1.2 2023-08-10 07:14:23 +00:00
Renovate Bot
24104f8cc1 Merge branch 'renovate/main-com.iqser.red.commons-storage-commons-2.x' into 'main'
Update dependency com.iqser.red.commons:storage-commons to v2.29.0 (main)

See merge request fforesight/layout-parser!17
2023-08-10 09:14:03 +02:00
Renovate Bot
3632dd4667 Update dependency com.iqser.red.commons:storage-commons to v2.29.0 2023-08-10 04:14:44 +00:00
Renovate Bot
063aa8bfe1 Merge branch 'renovate/main-com.iqser.red.commons-jackson-commons-1.x' into 'main'
Update dependency com.iqser.red.commons:jackson-commons to v1.3.0 (main)

See merge request fforesight/layout-parser!16
2023-08-10 06:14:25 +02:00
Renovate Bot
d2716a60e9 Update dependency com.iqser.red.commons:jackson-commons to v1.3.0 2023-08-10 01:13:56 +00:00
Renovate Bot
8f08a8c62b Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.526 (main)

See merge request fforesight/layout-parser!22
2023-08-10 03:13:38 +02:00
Renovate Bot
091cb73622 Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.526 2023-08-09 22:14:12 +00:00
Renovate Bot
d3b0bc430f Merge branch 'renovate/main-plugins-(non-major)' into 'main'
Update Plugins (non-major) (main)

See merge request fforesight/layout-parser!15
2023-08-10 00:13:58 +02:00
Renovate Bot
e3c12bc1bb Update Plugins (non-major) 2023-08-09 19:14:15 +00:00
Renovate Bot
f6f7a0a952 Merge branch 'renovate/main-jacksonversion' into 'main'
Update jacksonVersion to v2.15.2 (main)

See merge request fforesight/layout-parser!13
2023-08-09 21:13:57 +02:00
Renovate Bot
96df6e3145 Update jacksonVersion to v2.15.2 2023-08-09 16:12:54 +00:00
Renovate Bot
574e5ad425 Merge branch 'renovate/main-aws-java-sdk-monorepo' into 'main'
Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.525 (main)

See merge request fforesight/layout-parser!12
2023-08-09 18:12:37 +02:00
Renovate Bot
0611e56baa Update dependency com.amazonaws:aws-java-sdk-s3 to v1.12.525 2023-08-09 13:13:33 +00:00
Kilian Schüttler
442c1dafea Merge branch 'update-pdfbox' into 'main'
update PDFBox Version

See merge request fforesight/layout-parser!19
2023-08-09 12:48:14 +02:00
Kilian Schüttler
33bc532eac Merge branch 'renovate/main-com.iqser.red.service-persistence-service-shared-api-v1-2.x' into 'main'
Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.124.0 (main)

See merge request fforesight/layout-parser!18
2023-08-09 12:42:38 +02:00
Kilian Schuettler
4bd6e7e343 update PDFBox Version 2023-08-09 12:41:28 +02:00
Renovate Bot
159ac6348c Update dependency com.iqser.red.service:persistence-service-shared-api-v1 to v2.124.0 2023-08-09 10:14:27 +00:00
Kilian Schuettler
17259ed805 add renovate, fix checkstyle 2023-08-09 10:11:02 +02:00
Dominique Eifländer
67bf5cbaa8 Merge branch 'RED-6725' into 'main'
RED-6725: Install fonts

See merge request fforesight/layout-parser!11
2023-08-09 10:07:05 +02:00
deiflaender
f8a3cbc147 RED-6725: Install fonts 2023-08-09 10:00:07 +02:00
Andrei Isvoran
a3d4fbe3a3 Merge branch 'RED-6864' into 'main'
RED-6864 - Switch to DELETE_ON_CLOSE

See merge request fforesight/layout-parser!10
2023-08-09 08:37:07 +02:00
Andrei Isvoran
5c1dca5933 RED-6864 - Switch to DELETE_ON_CLOSE 2023-08-09 09:30:37 +03:00
Timo Bejan
f56ab8fa49 Merge branch 'RED-6864' into 'main'
RED-6864 - Switch to new storage-commons download

See merge request fforesight/layout-parser!9
2023-08-08 17:16:40 +02:00
Andrei Isvoran
cfca5376a0 RED-6864 - Switch to new storage-commons download 2023-08-08 17:16:40 +02:00
Kevin Tumma
0633fa04fb Update file .gitlab-ci.yml 2023-08-08 13:00:40 +02:00
Dominique Eifländer
659a9abaa5 Merge branch 'DM-165' into 'main'
DM-165: Fixed numberFormatException on german local machines

See merge request fforesight/layout-parser!8
2023-08-07 12:29:37 +02:00
deiflaender
5877aea3f7 DM-165: Fixed numberFormatException on german local machines 2023-08-07 12:12:00 +02:00
deiflaender
f2b92de827 DM-165: Fixed indexOutOfBounds error in TableNodeFactory 2023-08-05 10:20:05 +02:00
Kilian Schuettler
4a5464d6aa Refactoring to make downstream refactoring easier 2023-08-04 15:16:36 +02:00
Dominique Eifländer
d9a3bbbd30 Merge branch 'RED-5253' into 'main'
RED-5253: Ported last documine changes

See merge request fforesight/layout-parser!7
2023-08-04 09:59:57 +02:00
deiflaender
150aea55c0 RED-5253: Ported last documine changes 2023-08-04 09:55:35 +02:00
Kilian Schuettler
676f0c9d09 cleanup dependency versions 2023-08-01 10:27:14 +02:00
Kilian Schuettler
ded00df11e fix build 2023-08-01 09:57:58 +02:00
Kilian Schuettler
286556cbb6 mark non nullable fields in request 2023-08-01 00:50:33 +02:00
Kilian Schuettler
d6a74dc9f9 add field id to image data 2023-07-31 16:32:11 +02:00
Kilian Schuettler
2a55654fcf add simplifiedText 2023-07-31 15:30:03 +02:00
Kilian Schüttler
7496914b37 Merge branch 'RED-6725' into 'main'
include openfeign for tenant-commons

See merge request fforesight/layout-parser!6
2023-07-31 15:02:33 +02:00
Kilian Schuettler
c8ace585e1 include openfeign for tenant-commons 2023-07-31 14:40:57 +02:00
Kilian Schüttler
69c5f80c8c Merge branch 'RED-6725' into 'main'
Red 6725

See merge request fforesight/layout-parser!5
2023-07-31 13:19:39 +02:00
Kilian Schuettler
79d27189fd move Properties names to DocumentStructure 2023-07-31 12:54:49 +02:00
Kilian Schuettler
75bac72c05 fix dsljson 2023-07-29 02:44:22 +02:00
Kilian Schuettler
c5e6271dc3 fix fileIds 2023-07-29 01:59:32 +02:00
Kilian Schüttler
5561dd5e95 Merge branch 'RED-6725' into 'main'
Red 6725

See merge request fforesight/layout-parser!4
2023-07-28 17:46:40 +02:00
Kilian Schuettler
041b633742 use correct repo 2023-07-28 17:42:18 +02:00
Kilian Schuettler
715426bd3b remove hardcoded version 2023-07-28 17:34:59 +02:00
Kilian Schuettler
464b8053fe configure maven-publish plugin 2023-07-28 17:26:05 +02:00
Kilian Schuettler
2fece83c7c remove root build.gradle 2023-07-28 16:15:53 +02:00
Kilian Schuettler
ad03ef1922 move queue to server package, so i can easily import processor as a library for redaction-service 2023-07-28 16:15:31 +02:00
Christoph Schabert
cc44100e4e add root build.gradle.kts 2023-07-28 13:46:06 +02:00
Christoph Schabert
5d1c1ae406 Update gradle.properties.kts 2023-07-27 18:00:31 +02:00
Kilian Schuettler
f72838b0be removed unnecessary dependency 2023-07-27 17:15:22 +02:00
Kilian Schuettler
6388898cc0 added comments for native build investigation 2023-07-27 17:12:16 +02:00
Kilian Schuettler
72d1e6271a more refactoring, added a comment 2023-07-27 14:35:40 +02:00
Kilian Schuettler
299b5be385 package refactoring in processor 2023-07-27 14:28:09 +02:00
Kilian Schüttler
2ea58f5e9f Merge branch 'fixGradleDeploy' into 'main'
Fix gradle deploy

See merge request fforesight/layout-parser!3
2023-07-27 13:42:10 +02:00
Kilian Schuettler
510ec7ce45 remove native build 2023-07-27 13:29:14 +02:00
Kilian Schuettler
c186927e3d add reflection config 2023-07-27 13:21:01 +02:00
Kilian Schuettler
1a494b0dea add buildArgs to include logback at runtime 2023-07-27 13:00:26 +02:00
Kilian Schuettler
19552ddf69 added gradle.properties.kts 2023-07-27 12:32:10 +02:00
Kilian Schuettler
41267a0f98 ported to gradle 2023-07-27 12:27:30 +02:00
Kilian Schuettler
270129cd73 outputs almost equal current redaction-service in regards to RedactManager
* 3/200 files have minimal whitespace/sorting errors, most likely rounding errors
2023-07-25 18:12:57 +02:00
Kilian Schuettler
a41c13fdd6 add LayoutParsingType to BodyTextFrameService 2023-07-25 17:20:52 +02:00
Kilian Schuettler
65ab5eca22 update to redaction-service state 2023-07-25 16:10:57 +02:00
Kilian Schuettler
143ebee25e move and fix layout tests from redaction-service 2023-07-24 19:43:25 +02:00
Kilian Schuettler
47fd8e05d1 rename Data classes 2023-07-24 18:36:27 +02:00
Kilian Schüttler
653f280fd1 Merge branch 'TAAS-41' into 'main'
TAAS-41: TAAS Document Structure

See merge request fforesight/layout-parser!2
2023-07-24 16:09:18 +02:00
Kilian Schuettler
daa68f3fa6 TAAS-41: disable experimental tests 2023-07-24 16:07:27 +02:00
Kilian Schuettler
ed66043856 TAAS-41: add test files 2023-07-24 16:04:51 +02:00
Kilian Schuettler
526b1c5ad3 TAAS-41: add (inactive) experimental services 2023-07-24 15:58:06 +02:00
Kilian Schuettler
241a32cb4f TAAS-41/ RED-6725: integrate layoutparser into redactmanager 2023-07-24 15:55:31 +02:00
Timo Bejan
9c8501e76a Changes 2023-07-13 18:55:13 +03:00
Timo Bejan
3bc88bc9b7 store new document type 2023-07-13 13:01:01 +03:00
Kilian Schuettler
15a6d46f5c RED-7081: getBBox() Performance Improvement 2023-07-13 13:01:01 +03:00
Kilian Schuettler
788613c92e TAAS-41: TAAS Document Structure
* added linebreaks to ParagraphData
* moved List<String> cellText to List<ParagraphData> cellTexts
2023-07-13 13:01:01 +03:00
Kilian Schuettler
7f0aa32d1b TAAS-41: TAAS Document Structure
* added more testFiles
* hacked a workaround for CMMException
2023-07-13 13:01:01 +03:00
Kilian Schuettler
f08c4ced43 TAAS-41: TAAS Document Structure
* changed TextPageBlock splitting
* changed Header and Footer Classification
* added TAAS Document Structure Prototype
2023-07-13 13:01:01 +03:00
Timo Bejan
dfdeef5812 updated version 2023-07-12 23:27:49 +03:00
Timo Bejan
69a62c4dbe updated version 2023-07-12 23:22:34 +03:00
Timo Bejan
e346c04d67 version fix for gitlab 2023-07-12 20:57:30 +03:00
Timo Bejan
cdff0b0ece version fix for gitlab 2023-07-12 20:56:29 +03:00
Timo Bejan
a9e6c1f0f8 Image moved to ff 2023-07-12 20:54:36 +03:00
Timo Bejan
4fc7bac818 Project structure cleanup / adapted to fforesight modules 2023-07-12 20:46:01 +03:00
Kilian Schuettler
df9cbdc036 RED-6725: Integrate new layout parser
* ported current state from RedactManager
2023-06-15 12:52:51 +02:00
Dominique Eifländer
cc1fedac41 Merge branch 'RED-5694' into 'main'
RED-5694: Upgraded to latest platform-dependency

See merge request fforesight/layout-parser!1
2023-06-13 12:33:40 +02:00
deiflaender
54555d4ce0 RED-5694: Upgraded to latest platform-dependency 2023-06-13 12:21:27 +02:00
Christoph Schabert
0ac3ee309a Update .gitlab-ci.yml file 2023-04-19 15:42:43 +02:00
426 changed files with 57074 additions and 9666 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
*.pdf filter=lfs diff=lfs merge=lfs -text

7
.gitignore vendored
View File

@ -18,6 +18,7 @@ target/
.settings .settings
.springBeans .springBeans
.sts4-cache .sts4-cache
.gradle
### IntelliJ IDEA ### ### IntelliJ IDEA ###
.idea .idea
@ -37,3 +38,9 @@ build/
### VS Code ### ### VS Code ###
.vscode/ .vscode/
gradlew.bat
gradlew
gradle.properties
gradle/
.DS_Store
.DS_Store/

View File

@ -1,6 +1,26 @@
variables: variables:
SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF' # SONAR_PROJECT_KEY: 'fforesight_layout-parser_AYd5quv2mRkBOCG22hvF'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include: include:
- project: 'gitlab/gitlab' - project: 'gitlab/gitlab'
ref: 'main' ref: 'main'
file: 'ci-templates/maven_java.yml' file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^feature/ && $CI_COMMIT_TAG == ""
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

8
.gitmodules vendored Normal file
View File

@ -0,0 +1,8 @@
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
update = merge
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
update = merge

Binary file not shown.

View File

@ -1,18 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.7/apache-maven-3.8.7-bin.zip
wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.1/maven-wrapper-3.1.1.jar

View File

@ -1 +1,89 @@
# PDF Layout Parser Micro-Service: layout-parser
## Introduction
The layout-parser micro-service is a powerful tool designed to efficiently extract structured information from PDF documents. Written in Java and utilizing Spring Boot 3, Apache PDFBox, and RabbitMQ, this micro-service excels at parsing PDFs and organizing their content into a meaningful and coherent layout structure. Notably, the layout-parser micro-service distinguishes itself by relying solely on advanced algorithms, rather than machine learning techniques.
### Key Steps in the PDF Layout Parsing Process:
* **Text Position Extraction:**
The micro-service leverages Apache PDFBox to extract precise text positions for each individual character within the PDF document.
* **Word Segmentation and Text Block Formation:**
Employing an array of diverse algorithms, the micro-service initially identifies and segments words, creating distinct text blocks.
* **Text Block Classification:**
The segmented text blocks are then subjected to classification algorithms. These algorithms categorize the text blocks based on their content and visual properties, distinguishing between sections, subsections, headlines, paragraphs, images, tables, table cells, headers, and footers.
* **Layout Coherence Establishment:**
The classified text blocks are subsequently orchestrated into a cohesive layout structure. This process involves arranging sections, subsections, paragraphs, images, and other elements in a logical and structured manner.
* **Output Generation in Various Formats:**
Once the layout structure is established, the micro-service generates output in multiple formats. These formats are designed for seamless integration with downstream micro-services. The supported formats include JSON, XML, and others, ensuring flexibility in downstream data consumption.
### Optional Enhancements:
* **ML-Based Table Extraction:**
For enhanced results, users have the option to incorporate machine learning-based table extraction. This feature can be activated by providing ML-generated results as a JSON file, which are then integrated seamlessly into the layout structure.
* **Image Classification using ML:**
Additionally, for more accurate image classification, users can optionally feed ML-generated image classification results into the micro-service. Similar to the table extraction option, the micro-service processes the pre-parsed results in JSON format, thus optimizing the accuracy of image content identification.
In conclusion, the layout-parser micro-service is a versatile PDF layout parsing solution crafted entirely around advanced algorithms, without reliance on machine learning. It proficiently extracts text positions, segments content into meaningful blocks, classifies these blocks, arranges them coherently, and outputs structured data for downstream micro-services. Optional integration with ML-generated table extractions and image classifications further enhances its capabilities.
## Installation
### Prerequisites
Before building and using the layout-parser micro-service, please ensure you have the following software and tools installed:
* Java Development Kit (JDK) 17 or later
* Gradle build tool (preinstalled)
## Build and Test
To build and test the micro-service, follow these steps:
### Clone the Repository:
```bash
git clone ssh://git@git.knecon.com:22222/fforesight/layout-parser.git
cd layout-parser
```
### Build the Project:
Use the following command to build the project using Gradle:
```
gradle clean build
```
### Run Tests:
Run the test suite using the following command:
```
gradle test
```
## Building a Custom Docker Image
To create a custom Docker image for the layout-parser micro-service, execute the provided script:
### Ensure Docker is Installed:
Ensure that Docker is installed and running on your system.
### Run the Image Building Script:
Execute the publish-custom-image script in the project directory:
```
./publish-custom-image
```
## Publishing to Internal Maven Repository
To publish the layout-parser micro-service to your internal Maven repository, execute the following command:
```
gradle -Pversion=buildVersion publish
```
Replace buildVersion with the desired version number.
## Additional Notes
Make sure to configure any necessary application properties before deploying the micro-service.
For advanced usage and configuration, consult the source code (preferred) or ask Kilian or Dom.

View File

@ -0,0 +1,7 @@
// Build script for the build-logic project: enables the Kotlin DSL plugin so that
// Gradle scripts written in Kotlin can be compiled as plugins.
plugins {
`kotlin-dsl`
}
// Plugins referenced by the build logic are resolved from the Gradle Plugin Portal.
repositories {
gradlePluginPortal()
}

View File

@ -0,0 +1,93 @@
// Shared Java build conventions: Java 17, PMD / Checkstyle / JaCoCo, JUnit Platform tests,
// Maven publication to the knecon Nexus and the repositories dependencies are resolved from.
plugins {
    `java-library`
    `maven-publish`
    pmd
    checkstyle
    jacoco
}

group = "com.knecon.fforesight"

// Published as an extra property on the root project so other build scripts can read it.
val documentVersion by rootProject.extra { "4.433.0" }

java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17

// NOTE(review): inside these task blocks `pmd` looks like it resolves to the project-level
// PMD extension rather than the task itself, so both blocks would write the same
// property - confirm that this is intended.
tasks.pmdMain {
    pmd.ruleSetFiles = files("$rootDir/config/pmd/pmd.xml")
}

tasks.pmdTest {
    pmd.ruleSetFiles = files("$rootDir/config/pmd/test_pmd.xml")
}

tasks.test {
    useJUnitPlatform()
    reports {
        junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
    }
    minHeapSize = "512m"
    maxHeapSize = "2048m"
    // The coverage report is always generated after the tests have run.
    finalizedBy(tasks.jacocoTestReport)
}

tasks.jacocoTestReport {
    // Tests are required to run before the report can be generated.
    dependsOn(tasks.test)
    reports {
        xml.required.set(true)
        csv.required.set(false)
        html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
    }
}

allprojects {

    // Silence doclint and cap the number of reported Javadoc warnings.
    tasks.withType<Javadoc> {
        options {
            this as StandardJavadocDocletOptions
            addBooleanOption("Xdoclint:none", true)
            addStringOption("Xmaxwarns", "1")
        }
    }

    pmd {
        setConsoleOutput(true)
    }

    publishing {
        publications {
            create<MavenPublication>(name) {
                from(components["java"])
            }
        }
        repositories {
            maven {
                url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
                // Credentials come from Gradle properties and stay null when they are not set.
                credentials {
                    username = providers.gradleProperty("mavenUser").orNull
                    password = providers.gradleProperty("mavenPassword").orNull
                }
            }
        }
    }
}

java {
    withJavadocJar()
}

repositories {
    mavenLocal()
    mavenCentral()
    maven {
        url = uri("https://nexus.knecon.com/repository/gindev/")
        credentials {
            username = providers.gradleProperty("mavenUser").orNull
            password = providers.gradleProperty("mavenPassword").orNull
        }
    }
}

View File

@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="JavadocStyle"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

21
config/pmd/pmd.xml Normal file
View File

@ -0,0 +1,21 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

23
config/pmd/test_pmd.xml Normal file
View File

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="AvoidFieldNameMatchingMethodName"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

1
gradle.properties.kts Normal file
View File

@ -0,0 +1 @@
version = 0.1-SNAPSHOT

View File

@ -1,99 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>platform-docker-dependency</artifactId>
<version>0.1.0</version>
<relativePath/>
</parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service-image</artifactId>
<version>1.0.0</version>
<packaging>pom</packaging>
<properties>
<service.server>layoutparser-service-server</service.server>
<platform.jar>${service.server}.jar</platform.jar>
<docker.skip.push>false</docker.skip.push>
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>download-platform-jar</id>
<phase>prepare-package</phase>
<goals>
<goal>copy</goal>
</goals>
<configuration>
<artifactItems>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>${service.server}</artifactId>
<version>${project.version}</version>
<type>jar</type>
<overWrite>true</overWrite>
<destFileName>${platform.jar}</destFileName>
</dependency>
</artifactItems>
<outputDirectory>${docker.build.directory}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
<configuration>
<images>
<image>
<name>${docker.image.name}</name>
<build>
<dockerFileDir>${docker.build.directory}</dockerFileDir>
<args>
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
</args>
<tags>
<tag>${docker.image.version}</tag>
<tag>latest</tag>
</tags>
</build>
</image>
</images>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -1,9 +0,0 @@
FROM red/base-image:2.0.2
ARG PLATFORM_JAR
ENV PLATFORM_JAR ${PLATFORM_JAR}
ENV USES_ELASTICSEARCH false
COPY ["${PLATFORM_JAR}", "/"]

View File

@ -0,0 +1,10 @@
// Build script of the internal API module: applies the shared Java conventions plugin
// and the Lombok Gradle plugin.
plugins {
id("com.knecon.fforesight.java-conventions")
id("io.freefair.lombok") version "8.4"
}
description = "layoutparser-service-internal-api"
// Swagger annotations are the only dependency declared directly by this module.
dependencies {
implementation("io.swagger.core.v3:swagger-annotations:2.2.15")
}

View File

@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service</artifactId>
<version>1.0.0</version>
</parent>
<artifactId>layoutparser-service-internal-api</artifactId>
<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
</dependencies>
</project>

View File

@ -1,19 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable data holder for the position information of one atomic block.
 * All fields are private and final; accessors, builder and constructor are generated by Lombok.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicPositionBlockData {
// Identifier of this block; presumably matches the id of the corresponding AtomicTextBlockData - TODO confirm.
Long id;
// Lookup from string index to position index, as the name states; presumably indexes into 'positions' - TODO confirm.
int[] stringIdxToPositionIdx;
// One float array per position; the layout of the inner array is not visible here.
float[][] positions;
}

View File

@ -1,23 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable data holder for the text information of one atomic block.
 * All fields are private and final; accessors, builder and constructor are generated by Lombok.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AtomicTextBlockData {
// Identifier of this block.
Long id;
// Page reference; presumably the page number - TODO confirm.
Long page;
// Text content of this block.
String searchText;
// Running number of this block on its page.
int numberOnPage;
// Start and end offsets of this block; presumably string offsets in the document text - TODO confirm.
int start;
int end;
// Offsets of the line breaks; the reference point of the offsets is not visible here.
int[] lineBreaks;
}

View File

@ -1,20 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable aggregate of all data objects describing one document: its pages, the atomic
 * text and position blocks, and the table of contents tree.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class DocumentData {
// Per-page metadata.
PageData[] pages;
// Text information of all atomic blocks.
AtomicTextBlockData[] atomicTextBlocks;
// Position information of all atomic blocks.
AtomicPositionBlockData[] atomicPositionBlocks;
// Tree structure of the document.
TableOfContentsData tableOfContents;
}

View File

@ -1,20 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Immutable data holder for the metadata of a single page.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageData {
// Page number.
int number;
// Page dimensions; the unit is not visible here.
int height;
int width;
// Page rotation; presumably in degrees - TODO confirm.
int rotation;
}

View File

@ -1,93 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Data representation of the table of contents tree of a document.
 * The tree hangs off a single {@link #root} entry; every entry is addressed by its tree id,
 * i.e. the list of child indices leading from the root to that entry.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableOfContentsData {

    EntryData root;


    /**
     * Resolves an entry by its tree id.
     *
     * @param tocId child indices to follow starting at the root; an empty list addresses the root itself
     * @return the entry addressed by {@code tocId}
     * @throws IndexOutOfBoundsException if one of the indices does not exist
     */
    public EntryData get(List<Integer> tocId) {

        EntryData entry = root;
        for (int id : tocId) {
            entry = entry.subEntries.get(id);
        }
        return entry;
    }


    /**
     * Streams the root and all of its descendants in depth-first pre-order.
     * Every entry is emitted exactly once.
     *
     * @return stream over all entries of the tree
     */
    public Stream<EntryData> streamAllEntries() {

        // flatten(root) already covers every descendant; flattening root's children a second
        // time (as was done before) emitted each non-root entry twice.
        return flatten(root);
    }


    /**
     * @return one line per entry, in the order of {@link #streamAllEntries()}
     */
    @Override
    public String toString() {

        return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
    }


    /** Recursively streams the given entry followed by all of its descendants. */
    private static Stream<EntryData> flatten(EntryData entry) {

        return Stream.concat(Stream.of(entry), entry.subEntries.stream().flatMap(TableOfContentsData::flatten));
    }


    /**
     * A single immutable node of the table of contents tree.
     */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class EntryData {

        NodeType type;
        int[] tocId;
        Long[] atomicBlocks;
        Long[] pages;
        Map<String, String> properties;
        List<EntryData> subEntries;


        /**
         * @return {@code "[<tocId, comma separated>]: <type> atbs = <number of atomic blocks>"};
         *         the root entry, whose tree id is empty, is rendered with {@code "[]"}
         */
        @Override
        public String toString() {

            StringBuilder sb = new StringBuilder("[");
            for (int i = 0; i < tocId.length; i++) {
                // separator goes between ids only, so an empty tocId keeps its opening bracket
                if (i > 0) {
                    sb.append(",");
                }
                sb.append(tocId[i]);
            }
            sb.append("]: ");
            sb.append(type);
            sb.append(" atbs = ");
            sb.append(atomicBlocks.length);
            return sb.toString();
        }

    }

}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Text-only view of a single section, identified by its section number.
 * Accessors, builder and constructors are generated by Lombok.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure Section class.")
public class SimplifiedSectionText {
@Schema(description = "The number of this Section. This is used to map the simplified section text back to the original Section.")
private String sectionNumber;
@Schema(description = "The text in this Section.")
private String text;
}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
import java.util.ArrayList;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Text-only view of a whole document: the page count, the simplified text of every section and
 * the section numbers grouped into main, header and footer sections.
 * The list fields are marked {@code @Builder.Default} so that objects created through the builder
 * also start with empty lists.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@Schema(description = "Object containing a simplified version, which contains almost exclusively text, of the document structure.")
public class SimplifiedText {
@Schema(description = "Number of pages in the entire document.")
private int numberOfPages;
@Schema(description = "A List of simplified Sections, which contains almost exclusively text.")
@Builder.Default
private List<SimplifiedSectionText> sectionTexts = new ArrayList<>();
@Schema(description = "A list of the main section numbers ")
@Builder.Default
private List<String> mainSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the header section numbers ")
@Builder.Default
private List<String> headerSectionNumbers = new ArrayList<>();
@Schema(description = "A list of the footer section numbers ")
@Builder.Default
private List<String> footerSectionNumbers = new ArrayList<>();
}

View File

@ -0,0 +1,30 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Builder;
import lombok.Data;
/**
 * Text, formatting ranges and classification of a single paragraph-like element
 * (paragraph, headline, header or footer).
 * NOTE(review): only {@code @Data} and {@code @Builder} are declared, so no public constructor is
 * generated; confirm that instances are only ever created through the builder.
 */
@Data
@Builder
@Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer.")
public class ParagraphData {
@Schema(description = "The text of this Semantic Node, without any linebreaks.", example = "This is some text.")
private String text;
// NOTE(review): the three list fields below have no access modifier and are therefore package-private.
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is bold.", example = "[0, 15]")
List<Range> boldTextBoundaries;
@Schema(description = "A list of text ranges in string offsets. Every character in any of the ranges is italic.", example = "[0, 15]")
List<Range> italicTextBoundaries;
@Schema(description = "The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.", example = "[5, 10]")
List<Integer> linebreaks;
@Schema(description = "The classification of this Paragraph.", allowableValues = "{paragraph, headline, header, footer}")
private String classification;
@Schema(description = "Describes the text orientation of this semantic node. Any semantic node only has a single text orientation.", allowableValues = "{ZERO, QUARTER_CIRCLE, HALF_CIRCLE, THREE_QUARTER_CIRCLE}")
private String orientation;
@Schema(description = "Describes the text direction in degrees of this semantic node. Any semantic node only has a single text direction.", minimum = "0", maximum = "359")
private int textDirection;
}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import io.swagger.v3.oas.annotations.media.Schema;
/**
 * A text range given by its start and end string offsets.
 * The record performs no validation of the two values.
 *
 * @param start start offset of the range
 * @param end end offset of the range
 */
@Schema(description = "Object specifying the start and end offsets of a text range in string offsets.")
public record Range(int start, int end) {
}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
/**
 * Top-level object of the simplified document structure: the name of the original file
 * together with the list of all structure objects parsed from it.
 */
@Builder
@Data
@AllArgsConstructor
@Schema(description = "Object containing a simplified version of the document structure. This simplified form only knows Paragraphs and Tables. The Paragraph Objects might be a Paragraph, Headline, Header or Footer.")
public class ResearchDocumentData {
@Schema(description = "File name of the original uploaded file.")
String originalFile;
@Schema(description = "A List of all paragraphs/headline or table objects, that have been parsed in this document.")
List<StructureObject> structureObjects;
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * A single table row: its header flag, the text of each of its cells and its bounding box.
 */
@Data
@AllArgsConstructor
@Schema(description = "Object containing information about a Table Row.")
public class RowData {
@Schema(description = "Boolean indicating whether this table row is classified as a header row.")
boolean header;
// One ParagraphData per cell of this row.
@Schema(description = "A list of Objects containing information about the text in each cell of this row.")
List<ParagraphData> cellText;
@Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.")
float[] bBox;
}

View File

@ -0,0 +1,31 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
/**
 * One element of the simplified document structure. A structure object wraps either a
 * paragraph-like element ({@link #paragraph}) or a table ({@link #table}); the other of the two
 * fields is {@code null}.
 */
@Data
@Builder
@AllArgsConstructor
@Schema(description = "Object containing information about either a Paragraph/Headline/Header/Footer or a Table.")
public class StructureObject {

    @Schema(description = "The ID of this StructureObject.")
    Integer structureObjectNumber;

    @Schema(description = "The Tree ID of this StructureObject.")
    List<Integer> treeId;

    // The previous description was a copy of the stringOffset description and did not describe the page.
    @Schema(description = "The number of the page this StructureObject is located on.")
    int page;

    @Schema(description = "This stringOffset indicates the start of the string offsets in this Object, with respect to the reading order of the entire document. It is equal to the previous' StructureObject stringOffset + its length.")
    int stringOffset;

    @Schema(description = "The bounding box of this StructureObject. Is always exactly 4 values representing x, y, w, h, where x, y specify the lower left corner.", example = "[100, 100, 50, 50]")
    float[] boundingBox;

    @Schema(description = "Object containing information about a Paragraph/Headline/Header/Footer. Either this or table is null.")
    ParagraphData paragraph;

    @Schema(description = "Object containing information about a Table. Either this or paragraph is null.")
    TableData table;

}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
import java.util.List;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * A table of the simplified document structure: its rows and its dimensions.
 */
@Data
@AllArgsConstructor
@Schema(description = "Object containing information about a Table.")
public class TableData {
@Schema(description = "A list of Objects containing information about all rows in this table.")
List<RowData> rowData;
@Schema(description = "Number of columns in this table.")
Integer numberOfCols;
@Schema(description = "Number of rows in this table.")
Integer numberOfRows;
}

View File

@ -1,148 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import java.util.LinkedList;
import java.util.List;
import lombok.Setter;
/**
 * A half-open range of integer indices {@code [start, end)}: {@code start} is inclusive,
 * {@code end} is exclusive.
 *
 * <p>The constructor guarantees {@code start <= end}. The setters deliberately do not re-check
 * this, so that both ends can be moved one after the other.
 *
 * <p>Boundaries are ordered by {@code start} and then by {@code end}; this ordering is consistent
 * with {@link #equals(Object)}.
 */
public class Boundary implements Comparable<Boundary> {

    private int start;
    private int end;


    /**
     * @param start inclusive start index
     * @param end exclusive end index
     * @throws IllegalArgumentException if {@code start > end}
     */
    public Boundary(int start, int end) {

        requireOrdered(start, end);
        this.start = start;
        this.end = end;
    }


    /** Sets the inclusive start index without validating it against the end index. */
    public void setStart(int start) {

        this.start = start;
    }


    /** Sets the exclusive end index without validating it against the start index. */
    public void setEnd(int end) {

        this.end = end;
    }


    /** @return number of indices covered by this boundary */
    public int length() {

        return end - start;
    }


    /** @return inclusive start index */
    public int start() {

        return start;
    }


    /** @return exclusive end index */
    public int end() {

        return end;
    }


    /** @return true if the given boundary lies completely within this boundary */
    public boolean contains(Boundary boundary) {

        return start <= boundary.start() && boundary.end() <= end;
    }


    /** @return true if this boundary lies completely within the given boundary */
    public boolean containedBy(Boundary boundary) {

        return boundary.contains(this);
    }


    /**
     * @return true if the range {@code [start, end)} lies completely within this boundary
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean contains(int start, int end) {

        requireOrdered(start, end);
        return this.start <= start && end <= this.end;
    }


    /**
     * @return true if this boundary lies completely within the range {@code [start, end)}
     * @throws IllegalArgumentException if {@code start > end}
     */
    public boolean containedBy(int start, int end) {

        requireOrdered(start, end);
        return start <= this.start && this.end <= end;
    }


    /** @return true if the index lies within {@code [start, end)} */
    public boolean contains(int index) {

        return start <= index && index < end;
    }


    /**
     * Checks whether this boundary and the given boundary overlap.
     *
     * @param boundary the boundary to test against
     * @return true if the two boundaries overlap
     */
    public boolean intersects(Boundary boundary) {

        // The first two checks cover "the other boundary starts or ends inside this one". The
        // third one covers the remaining overlap case, which was missed before: the other
        // boundary encloses this one completely.
        return contains(boundary.start()) || contains(boundary.end() - 1) || boundary.contains(start);
    }


    /**
     * Splits this boundary at the given indices.
     *
     * @param splitIndices indices to split at, in ascending order; each must lie within this boundary
     * @return the resulting boundaries in order; splits that would yield an empty boundary are skipped
     * @throws IndexOutOfBoundsException if any index lies outside of this boundary
     */
    public List<Boundary> split(List<Integer> splitIndices) {

        List<Integer> outOfRange = splitIndices.stream().filter(idx -> !this.contains(idx)).toList();
        if (!outOfRange.isEmpty()) {
            throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", outOfRange, this));
        }
        List<Boundary> splitBoundaries = new LinkedList<>();
        int previousIndex = start;
        for (int splitIndex : splitIndices) {
            // skip split if it would produce a boundary of length 0
            if (splitIndex == previousIndex) {
                continue;
            }
            splitBoundaries.add(new Boundary(previousIndex, splitIndex));
            previousIndex = splitIndex;
        }
        splitBoundaries.add(new Boundary(previousIndex, end));
        return splitBoundaries;
    }


    /**
     * Builds the smallest boundary covering all given boundaries.
     *
     * @param boundaries boundaries to merge, must not be empty
     * @return boundary from the smallest start to the largest end
     * @throws IllegalArgumentException if the list is empty
     */
    public static Boundary merge(List<Boundary> boundaries) {

        int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
        int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
        return new Boundary(minStart, maxEnd);
    }


    @Override
    public String toString() {

        return String.format("Boundary [%d|%d)", start, end);
    }


    /**
     * Orders boundaries by start index, then by end index. Every pair the previous implementation
     * ordered (both ends smaller / both ends larger) keeps its order; pairs it reported as 0
     * although they differ are now ordered as well, which makes this a total order.
     */
    @Override
    public int compareTo(Boundary boundary) {

        int byStart = Integer.compare(start, boundary.start());
        return byStart != 0 ? byStart : Integer.compare(end, boundary.end());
    }


    @Override
    public int hashCode() {

        return 31 * start + end;
    }


    /** Two boundaries are equal if and only if both their start and end indices are equal. */
    @Override
    public boolean equals(Object object) {

        if (this == object) {
            return true;
        }
        return object instanceof Boundary other && start == other.start && end == other.end;
    }


    /** Rejects ranges whose start lies behind their end. */
    private static void requireOrdered(int start, int end) {

        if (start > end) {
            throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end));
        }
    }

}

View File

@ -1,101 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Root node of the document graph. It owns the pages and the table of contents and always sits
 * at the empty tree id.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class DocumentGraph implements SemanticNode {

    Set<PageNode> pages;
    TableOfContents tableOfContents;
    Integer numberOfPages;
    TextBlock textBlock;


    /**
     * Collects the text blocks of all terminal nodes, in table of contents order, into one text block.
     */
    public TextBlock buildTextBlock() {

        Stream<TextBlock> terminalBlocks = streamTerminalTextBlocksInOrder();
        return terminalBlocks.collect(new TextBlockCollector());
    }


    /**
     * @return all direct children of this node that are sections
     */
    public List<SectionNode> getMainSections() {

        return streamChildren().filter(SectionNode.class::isInstance).map(SectionNode.class::cast).collect(Collectors.toList());
    }


    /**
     * @return the text blocks of all terminal nodes, in table of contents order
     */
    public Stream<TextBlock> streamTerminalTextBlocksInOrder() {

        Stream<SemanticNode> terminalNodes = streamAllNodes().filter(node -> node.isTerminal());
        return terminalNodes.map(node -> node.getTerminalTextBlock());
    }


    /**
     * @return unmodifiable union of the entities of all sub nodes
     */
    public Set<EntityNode> getEntities() {

        return streamAllSubNodes().flatMap(node -> node.getEntities().stream()).collect(Collectors.toUnmodifiableSet());
    }


    /** The document is the root of the tree, so its tree id is always empty. */
    @Override
    public List<Integer> getTocId() {

        return Collections.emptyList();
    }


    /**
     * Not supported, the tree id of the root is fixed.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setTocId(List<Integer> tocId) {

        throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents");
    }


    /** Streams the node of every table of contents entry, in order. */
    private Stream<SemanticNode> streamAllNodes() {

        Stream<TableOfContents.Entry> entries = tableOfContents.streamAllEntriesInOrder();
        return entries.map(entry -> entry.getNode());
    }


    @Override
    public String toString() {

        String summary = buildTextBlock().buildSummary();
        return NodeType.DOCUMENT + ": " + summary;
    }


    /**
     * @return for every page a rectangle spanning the whole page, from (0, 0) to (width, height)
     */
    @Override
    public Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> fullPageBoxes = new HashMap<>();
        pages.forEach(page -> fullPageBoxes.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight())));
        return fullPageBoxes;
    }

}

View File

@ -1,193 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph;
import static java.lang.String.format;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Tree structure of a document. Every {@link Entry} wraps one {@link SemanticNode} and is
 * addressed by its tree id: the list of child indices leading from the root to the entry.
 * The root entry wraps the {@link DocumentGraph} and has the empty tree id.
 */
@Data
public class TableOfContents {

    private final Entry root;


    /**
     * Creates a table of contents consisting only of the root entry for the given document.
     */
    public TableOfContents(DocumentGraph documentGraph) {

        root = Entry.builder().tocId(Collections.emptyList()).type(NodeType.DOCUMENT).children(new LinkedList<>()).node(documentGraph).build();
    }


    /**
     * Collects the text blocks of all terminal nodes, in order, into one text block.
     */
    public TextBlock buildTextBlock() {

        return streamAllEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
    }


    /**
     * Appends a new entry directly below the root.
     *
     * @return the tree id of the new entry
     */
    public List<Integer> createNewMainEntryAndReturnId(NodeType nodeType, SemanticNode node) {

        return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node);
    }


    /**
     * Appends a new entry as last child of the entry with the given id.
     *
     * @param parentId tree id of the parent entry
     * @param nodeType type of the new entry
     * @param node node wrapped by the new entry
     * @return the tree id of the new entry
     * @throws UnsupportedOperationException if no entry with {@code parentId} exists
     */
    public List<Integer> createNewChildEntryAndReturnId(List<Integer> parentId, NodeType nodeType, SemanticNode node) {

        if (!entryExists(parentId)) {
            throw new UnsupportedOperationException(format("parentId %s does not exist!", parentId));
        }

        Entry parent = getEntryById(parentId);
        List<Integer> newId = new LinkedList<>(parentId);
        newId.add(parent.children.size());
        parent.children.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());

        return newId;
    }


    /**
     * Checks whether an entry with the given tree id exists. Every index of the id, including
     * the first one, is range-checked, so an unknown id yields {@code false} instead of an
     * {@link IndexOutOfBoundsException}.
     */
    private boolean entryExists(List<Integer> tocId) {

        if (root == null) {
            return false;
        }
        Entry entry = root;
        for (int id : tocId) {
            if (id < 0 || id >= entry.children.size()) {
                return false;
            }
            entry = entry.children.get(id);
        }
        return true;
    }


    /**
     * @return the parent of the entry with the given id
     * @throws UnsupportedOperationException if {@code tocId} is the id of the root
     */
    public Entry getParentEntryById(List<Integer> tocId) {

        return getEntryById(getParentId(tocId));
    }


    /**
     * @return true if the parent of the given id exists
     * @throws UnsupportedOperationException if {@code tocId} is the id of the root
     */
    public boolean hasParentById(List<Integer> tocId) {

        return entryExists(getParentId(tocId));
    }


    /**
     * @return the nodes of all direct children of the entry with the given id
     */
    public Stream<SemanticNode> streamChildrenNodes(List<Integer> tocId) {

        return getEntryById(tocId).children.stream().map(Entry::getNode);
    }


    /** Drops the last index of the given id; the root itself has no parent. */
    private static List<Integer> getParentId(List<Integer> tocId) {

        if (tocId.isEmpty()) {
            throw new UnsupportedOperationException("Root has no parent!");
        }
        if (tocId.size() < 2) {
            return Collections.emptyList();
        }
        return tocId.subList(0, tocId.size() - 1);
    }


    /**
     * Resolves an entry by its tree id.
     *
     * @param tocId child indices to follow starting at the root; an empty list addresses the root
     * @return the addressed entry
     * @throws IndexOutOfBoundsException if one of the indices does not exist
     */
    public Entry getEntryById(List<Integer> tocId) {

        Entry entry = root;
        for (int id : tocId) {
            entry = entry.children.get(id);
        }
        return entry;
    }


    /** @return the direct children of the root */
    public Stream<Entry> streamMainEntries() {

        return root.children.stream();
    }


    /** @return the root and all of its descendants in depth-first pre-order */
    public Stream<Entry> streamAllEntriesInOrder() {

        return Stream.of(root).flatMap(TableOfContents::flatten);
    }


    /** @return all descendants of the entry with the given id in depth-first pre-order, excluding the entry itself */
    public Stream<Entry> streamAllSubEntriesInOrder(List<Integer> parentId) {

        return getEntryById(parentId).getChildren().stream().flatMap(TableOfContents::flatten);
    }


    @Override
    public String toString() {

        return String.join("\n", streamAllEntriesInOrder().map(Entry::toString).toList());
    }


    /** @return one line per descendant of the entry with the given id */
    public String toString(List<Integer> id) {

        return String.join("\n", streamAllSubEntriesInOrder(id).map(Entry::toString).toList());
    }


    /** Recursively streams the given entry followed by all of its descendants. */
    private static Stream<Entry> flatten(Entry entry) {

        return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(TableOfContents::flatten));
    }


    /**
     * A single entry of the tree. Identity of an entry is defined by the string representation
     * of its node.
     */
    @Builder
    @Getter
    @AllArgsConstructor
    @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
    public static class Entry {

        List<Integer> tocId;
        NodeType type;
        SemanticNode node;
        List<Entry> children;


        @Override
        public String toString() {

            return node.toString();
        }


        @Override
        public int hashCode() {

            return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
        }


        /**
         * Compares the string representations themselves rather than their 32 bit hashes, so a
         * hash collision can no longer make two different entries equal.
         */
        @Override
        public boolean equals(Object o) {

            return o instanceof Entry other && toString().equals(other.toString());
        }

    }

}

View File

@ -1,76 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
/**
 * An entity located in the document graph. The entity is defined by its {@link Boundary};
 * the references to nodes and pages describe where that boundary sits in the graph.
 */
public interface EntityNode {

    /**
     * Returns the text that lies within the boundary of this entity.
     *
     * @return the covered text
     */
    String getValue();


    /**
     * Returns the boundary that primarily defines this entity; all other values may be inferred from it.
     *
     * @return the boundary uniquely identifying this entity
     */
    Boundary getBoundary();


    /**
     * Returns the node lying deepest in the document tree whose boundary still fully contains
     * the boundary of this entity.
     *
     * @return the deepest fully containing node
     */
    SemanticNode getDeepestFullyContainingNode();


    /**
     * Returns every node whose boundary intersects the boundary of this entity.
     *
     * @return all intersecting nodes
     */
    List<SemanticNode> getIntersectingNodes();


    /** Sets the deepest node fully containing this entity. */
    void setDeepestFullyContainingNode(SemanticNode semanticNode);


    /** Registers one more node as intersecting this entity. */
    void addIntersectingNode(SemanticNode semanticNode);


    /** Replaces the list of nodes intersecting this entity. */
    void setIntersectingNodes(List<SemanticNode> semanticNodes);


    /**
     * @return all pages this entity intersects
     */
    Set<PageNode> getPages();


    /** Replaces the set of pages this entity intersects. */
    void setPages(Set<PageNode> pages);


    /**
     * Detaches this entity from the graph: it is removed from the entity sets of all intersecting
     * nodes and pages, and afterwards all graph specific fields of the entity are reset.
     */
    default void removeFromGraph() {

        for (SemanticNode intersectingNode : getIntersectingNodes()) {
            intersectingNode.getEntities().remove(this);
        }
        for (PageNode intersectedPage : getPages()) {
            intersectedPage.getEntities().remove(this);
        }
        setPages(Collections.emptySet());
        setDeepestFullyContainingNode(null);
        setIntersectingNodes(Collections.emptyList());
    }

}

View File

@ -1,45 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity;
import java.awt.geom.Rectangle2D;
import java.nio.charset.StandardCharsets;
import java.util.List;
import com.google.common.hash.Hashing;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Position of an entity on a single page, given as one rectangle per line of text.
 * Identity, hash code and id are all derived from the page number and the rectangle coordinates.
 */
@Data
@Builder
@FieldDefaults(level = AccessLevel.PRIVATE)
public class EntityPosition {

    PageNode pageNode;
    List<Rectangle2D> rectanglePerLine;


    /**
     * @return the hash code of this position as a string
     */
    public String getId() {

        return String.valueOf(hashCode());
    }


    @Override
    public int hashCode() {

        return Hashing.murmur3_128().hashString(buildHashInput(), StandardCharsets.UTF_8).hashCode();
    }


    /**
     * Compares the hashed key itself rather than the resulting int hash, so a hash collision can
     * no longer make two different positions equal. Positions that were equal before stay equal.
     */
    @Override
    public boolean equals(Object o) {

        return o instanceof EntityPosition other && buildHashInput().equals(other.buildHashInput());
    }


    /** Concatenates the page number and the x, y, width and height of every rectangle. */
    private String buildHashInput() {

        StringBuilder sb = new StringBuilder();
        sb.append(pageNode.getNumber());
        rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight()));
        return sb.toString();
    }

}

View File

@ -1,53 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Terminal node of the document graph holding the text block of a page footer.
 * The table of contents reference and the entities are excluded from equals and hashCode.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class FooterNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /** A footer has no children, so its text block is the terminal text block itself. */
    @Override
    public TextBlock buildTextBlock() {

        return this.terminalTextBlock;
    }


    /** @return {@code "<tocId>: FOOTER: <summary of the text block>"} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", tocId, NodeType.FOOTER, terminalTextBlock.buildSummary());
    }

}

View File

@ -1,53 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Terminal node of the document graph holding the text block of a page header.
 * The table of contents reference and the entities are excluded from equals and hashCode.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeaderNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /** A header has no children, so its text block is the terminal text block itself. */
    @Override
    public TextBlock buildTextBlock() {

        return this.terminalTextBlock;
    }


    /** @return {@code "<tocId>: HEADER: <summary of the text block>"} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", tocId, NodeType.HEADER, terminalTextBlock.buildSummary());
    }

}

View File

@ -1,60 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Terminal node of the document graph holding the text block of a headline.
 * The table of contents reference and the entities are excluded from equals and hashCode.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class HeadlineNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /** A headline has no children, so its text block is the terminal text block itself. */
    @Override
    public TextBlock buildTextBlock() {

        return this.terminalTextBlock;
    }


    /** @return {@code "<tocId>: HEADLINE: <summary of the text block>"} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s", tocId, NodeType.HEADLINE, terminalTextBlock.buildSummary());
    }


    /** A headline node is its own headline. */
    @Override
    public SemanticNode getHeadline() {

        return this;
    }

}

View File

@ -1,87 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Node of the document graph representing an image at a fixed position on a single page,
 * together with its classification and redaction state.
 * The page, the table of contents reference and the entities are excluded from equals and hashCode.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageNode implements SemanticNode {

    List<Integer> tocId;

    ImageType imageType;
    boolean transparency;
    Rectangle2D position;

    boolean redaction;
    boolean ignored;

    @Builder.Default
    String redactionReason = "";

    @Builder.Default
    String legalBasis = "";

    @Builder.Default
    int matchedRule = -1;

    @EqualsAndHashCode.Exclude
    PageNode page;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /** Collects the terminal text blocks of all sub nodes into one text block. */
    @Override
    public TextBlock buildTextBlock() {

        return streamAllSubNodes().filter(node -> node.isTerminal()).map(node -> node.getTerminalTextBlock()).collect(new TextBlockCollector());
    }


    /** @return a set containing only the page of this image */
    @Override
    public Set<PageNode> getPages() {

        return Collections.singleton(page);
    }


    /** @return {@code "<tocId>: IMAGE: <imageType> <position>"} */
    @Override
    public String toString() {

        return String.format("%s: %s: %s %s", tocId, NodeType.IMAGE, imageType.toString(), position);
    }


    /** @return a new map holding the single entry page to position */
    @Override
    public Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> positionByPage = new HashMap<>();
        positionByPage.put(page, position);
        return positionByPage;
    }

}

View File

@ -1,9 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
/**
 * Classification of an image found in a document.
 */
public enum ImageType {
LOGO,
FORMULA,
SIGNATURE,
OTHER,
OCR
}

View File

@ -1,13 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
/**
 * Type of a node in the document graph. DOCUMENT is the type of the root entry of the table of contents.
 */
public enum NodeType {
DOCUMENT,
SECTION,
HEADLINE,
PARAGRAPH,
TABLE,
TABLE_CELL,
IMAGE,
HEADER,
FOOTER
}

View File

@ -1,71 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * A page of the document with its dimensions and the nodes, entities and images located on it.
 * Two pages are considered equal when their page numbers are equal.
 */
@Getter
@Setter
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class PageNode {

    Integer number;
    Integer height;
    Integer width;
    Integer rotation;

    @EqualsAndHashCode.Exclude
    List<SemanticNode> mainBody;

    @EqualsAndHashCode.Exclude
    HeaderNode header;

    @EqualsAndHashCode.Exclude
    FooterNode footer;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<ImageNode> images = new HashSet<>();


    /**
     * @return the text blocks of all terminal nodes of the main body, collected into one text block
     */
    public TextBlock getMainBodyTextBlock() {

        return mainBody.stream().filter(node -> node.isTerminal()).map(node -> node.getTerminalTextBlock()).collect(new TextBlockCollector());
    }


    /** @return the page number as string */
    @Override
    public String toString() {

        return String.valueOf(number);
    }


    /** The page number serves as hash code. */
    @Override
    public int hashCode() {

        return number.intValue();
    }


    /** Pages are equal when their hash codes, i.e. their page numbers, are equal. */
    @Override
    public boolean equals(Object o) {

        if (!(o instanceof PageNode)) {
            return false;
        }
        return o.hashCode() == this.hashCode();
    }

}

View File

@ -1,51 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * A paragraph in the document graph. It carries its text directly in {@code terminalTextBlock}
 * and is terminal by default.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ParagraphNode implements SemanticNode {

    List<Integer> tocId;
    TextBlock terminalTextBlock;

    @Builder.Default
    boolean terminal = true;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /**
     * @return the paragraph's own terminal text block; nothing is collected from sub nodes
     */
    @Override
    public TextBlock buildTextBlock() {

        return this.terminalTextBlock;
    }


    /**
     * @return "tocId: PARAGRAPH: summary", where the summary comes from the terminal text block
     */
    @Override
    public String toString() {

        StringBuilder description = new StringBuilder();
        description.append(tocId);
        description.append(": ");
        description.append(NodeType.PARAGRAPH);
        description.append(": ");
        description.append(terminalTextBlock.buildSummary());
        return description.toString();
    }

}

View File

@ -1,63 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class SectionNode implements SemanticNode {
List<Integer> tocId;
TextBlock textBlock;
@EqualsAndHashCode.Exclude
TableOfContents tableOfContents;
@Builder.Default
@EqualsAndHashCode.Exclude
Set<EntityNode> entities = new HashSet<>();
@Override
public TextBlock buildTextBlock() {
if (textBlock == null) {
textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
}
return textBlock;
}
@Override
public String toString() {
return tocId.toString() + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary();
}
public HeadlineNode getHeadline() {
return streamChildren().filter(node -> node instanceof HeadlineNode)
.map(node -> (HeadlineNode) node)
.findFirst()
.orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!"));
}
}

View File

@ -1,275 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
/**
 * Common contract of all nodes in the document graph. A node is located in the TableOfContents by its
 * tocId; most default methods are implemented on top of {@link #buildTextBlock()} and the TableOfContents.
 */
public interface SemanticNode {

    /**
     * Returns the TextBlock covering this node.
     * In the visible implementations, non-terminal nodes concatenate the terminal text blocks of all nodes
     * underneath them in the TableOfContents, while terminal nodes return their terminal text block.
     *
     * @return TextBlock containing all AtomicTextBlocks that are located under this Node.
     */
    TextBlock buildTextBlock();


    /**
     * Any Node maintains its own Set of Entities.
     * This Set contains all Entities whose boundary intersects the boundary of this node.
     *
     * @return Set of all Entities associated with this Node
     */
    Set<EntityNode> getEntities();


    /**
     * Each AtomicTextBlock is assigned a page, so the pages this node appears on are the pages of the
     * AtomicTextBlocks in this node's TextBlock.
     *
     * @return Set of PageNodes this node appears on.
     */
    default Set<PageNode> getPages() {

        return buildTextBlock().getPages();
    }


    /**
     * @return the TableOfContents of the document this node belongs to
     */
    TableOfContents getTableOfContents();


    /**
     * The id is a List of Integers uniquely identifying this node in the TableOfContents.
     *
     * @return the TableOfContents ID
     */
    List<Integer> getTocId();


    /**
     * This should only be used during graph construction.
     *
     * @param tocId List of Integers
     */
    void setTocId(List<Integer> tocId);


    /**
     * Delegates to the parent's getHeadline(), so the lookup walks up the tree until a node overrides this method.
     * SectionNode overrides it to return the first HeadlineNode among its children and throws a
     * NoSuchElementException if it has none.
     *
     * @return First HeadlineNode found
     */
    default SemanticNode getHeadline() {

        return getParent().getHeadline();
    }


    /**
     * @return boolean indicating whether this Node has a Parent in the TableOfContents
     */
    default boolean hasParent() {

        return getTableOfContents().hasParentById(getTocId());
    }


    /**
     * Looks up the parent entry of this node's tocId in the TableOfContents.
     * NOTE(review): the behavior when no parent exists depends on TableOfContents.getParentEntryById; the
     * original documentation states that an exception is thrown in that case.
     *
     * @return The SemanticNode representing the Parent in the TableOfContents
     */
    default SemanticNode getParent() {

        return getTableOfContents().getParentEntryById(getTocId()).getNode();
    }


    /**
     * Terminal means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden.
     * According to the original documentation, Sections, Images, and Tables are not terminal, and a TableCell
     * might be terminal depending on its area compared to the page.
     *
     * @return boolean, indicating if a Node has direct access to a TextBlock
     */
    default boolean isTerminal() {

        return false;
    }


    /**
     * Returns the TextBlock directly attached to a terminal node.
     * The default implementation always throws, so terminal nodes must override it.
     *
     * @return the terminal TextBlock of this node
     * @throws UnsupportedOperationException if the node does not override this method
     */
    default TextBlock getTerminalTextBlock() {

        throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!");
    }


    /**
     * Replaces the TextBlock directly attached to a terminal node.
     * The default implementation always throws, so terminal nodes must override it.
     *
     * @param textBlock the new terminal TextBlock
     * @throws UnsupportedOperationException if the node does not override this method
     */
    default void setTerminalTextBlock(TextBlock textBlock) {

        throw new UnsupportedOperationException();
    }


    /**
     * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node.
     * If this node does not have any AtomicTextBlocks underneath it, e.g. an empty TableCell, it returns -1.
     *
     * @return Integer representing the number on the page
     */
    default Integer getNumberOnPage() {

        // Build once and reuse: the block was previously rebuilt for the lookup.
        List<AtomicTextBlock> atomicTextBlocks = buildTextBlock().getAtomicTextBlocks();
        if (atomicTextBlocks.isEmpty()) {
            return -1;
        }
        return atomicTextBlocks.get(0).getNumberOnPage();
    }


    /**
     * @return true, if this node's TextBlock is not empty
     */
    default boolean hasText() {

        return buildTextBlock().length() > 0;
    }


    /**
     * @param string A String which the TextBlock might contain
     * @return true, if this node's TextBlock contains the string
     */
    default boolean containsString(String string) {

        return buildTextBlock().getSearchText().contains(string);
    }


    /**
     * @param strings A List of Strings which the TextBlock might contain
     * @return true, if this node's TextBlock contains any of the strings
     */
    default boolean containsAnyString(List<String> strings) {

        return strings.stream().anyMatch(this::containsString);
    }


    /**
     * This function is used during insertion of EntityNodes into the graph. It checks whether the boundary of this
     * node's TextBlock intersects the boundary of the EntityNode. If so, this node is registered as an intersecting
     * node, and, if it contains the entity's boundary completely, as the deepest fully containing node.
     * The function is then called on all children, so deeper containing nodes overwrite shallower ones.
     * Nodes that do not intersect the entity are not descended into.
     *
     * @param entityNode EntityNode, which is being inserted into the graph
     */
    default void addThisToEntityIfIntersects(EntityNode entityNode) {

        TextBlock textBlock = buildTextBlock();
        if (textBlock.getBoundary().intersects(entityNode.getBoundary())) {
            if (textBlock.containsBoundary(entityNode.getBoundary())) {
                entityNode.setDeepestFullyContainingNode(this);
            }
            entityNode.addIntersectingNode(this);
            streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode));
        }
    }


    /**
     * Streams all children located directly underneath this node in the TableOfContents.
     *
     * @return Stream of all children
     */
    default Stream<SemanticNode> streamChildren() {

        return getTableOfContents().streamChildrenNodes(getTocId());
    }


    /**
     * Recursively streams all SemanticNodes located underneath this node in the TableOfContents in order.
     *
     * @return Stream of all SubNodes
     */
    default Stream<SemanticNode> streamAllSubNodes() {

        return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::getNode);
    }


    /**
     * @return Boundary of this Node's TextBlock
     */
    default Boundary getBoundary() {

        return buildTextBlock().getBoundary();
    }


    /**
     * If this Node is Terminal it will calculate the boundingBox of its TextBlock per page, otherwise it will
     * calculate the Union of the BoundingBoxes of all its Children per page.
     * NOTE(review): the original documentation states that the document node returns the cropbox of each page;
     * that override is not visible here.
     *
     * @return Rectangle2D fully encapsulating this Node for each page.
     */
    default Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> bBoxPerPage = new HashMap<>();
        if (isTerminal()) {
            return getBBoxFromTerminalTextBlock(bBoxPerPage);
        }
        return getBBoxFromChildren(bBoxPerPage);
    }


    /**
     * TODO this does not yet work for sections spanning multiple columns.
     *
     * @param bBoxPerPage initial empty BoundingBox, returned as is when this node has no children
     * @return The union of the BoundingBoxes of all children
     */
    private Map<PageNode, Rectangle2D> getBBoxFromChildren(Map<PageNode, Rectangle2D> bBoxPerPage) {

        // Each child's map is folded into the next one; rectangles on the same page are unioned.
        return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> {
            map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
            return map2;
        }).orElse(bBoxPerPage);
    }


    /**
     * @param bBoxPerPage initial empty BoundingBox, filled with one entry per page and returned
     * @return The union of all BoundingBoxes of the AtomicTextBlocks of this node, grouped by page
     */
    private Map<PageNode, Rectangle2D> getBBoxFromTerminalTextBlock(Map<PageNode, Rectangle2D> bBoxPerPage) {

        Map<PageNode, List<AtomicTextBlock>> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
        atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
        return bBoxPerPage;
    }

}

View File

@ -1,92 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * A cell of a table in the document graph, addressed by row and column.
 * A cell is terminal by default and then exposes {@code terminalTextBlock}; a non-terminal cell
 * assembles its text from the terminal nodes below it and caches the result in {@code textBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableCellNode implements SemanticNode {

    List<Integer> tocId;
    int row;
    int col;
    boolean header;
    Rectangle2D bBox;

    @Builder.Default
    boolean terminal = true;

    TextBlock terminalTextBlock;
    TextBlock textBlock;

    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /**
     * @return a fresh map assigning this cell's single bounding box to every page the cell appears on
     */
    @Override
    public Map<PageNode, Rectangle2D> getBBox() {

        Map<PageNode, Rectangle2D> boxByPage = new HashMap<>();
        for (PageNode pageNode : getPages()) {
            boxByPage.put(pageNode, bBox);
        }
        return boxByPage;
    }


    /**
     * @return the terminal text block for a terminal cell, otherwise the lazily built and cached concatenation of all
     * terminal sub nodes
     */
    @Override
    public TextBlock buildTextBlock() {

        if (terminal) {
            return terminalTextBlock;
        }
        if (textBlock != null) {
            return textBlock;
        }
        textBlock = streamAllSubNodes()
                .filter(SemanticNode::isTerminal)
                .map(SemanticNode::getTerminalTextBlock)
                .collect(new TextBlockCollector());
        return textBlock;
    }


    /**
     * @return "tocId: TABLE_CELL: summary", where the summary comes from the cell's text block
     */
    @Override
    public String toString() {

        StringBuilder description = new StringBuilder();
        description.append(tocId);
        description.append(": ");
        description.append(NodeType.TABLE_CELL);
        description.append(": ");
        description.append(buildTextBlock().buildSummary());
        return description.toString();
    }


    /**
     * @param headerString the exact header text to look for
     * @return true if the stripped text of any header cell belonging to this cell equals headerString
     */
    public boolean hasHeader(String headerString) {

        return getHeaders()
                .map(headerCell -> headerCell.buildTextBlock().getSearchText().strip())
                .anyMatch(headerText -> headerText.equals(headerString));
    }


    /**
     * @return the header cells the parent table reports for this cell's row and column
     */
    private Stream<TableCellNode> getHeaders() {

        return ((TableNode) getParent()).streamHeadersForCell(row, col);
    }

}

View File

@ -1,73 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * A table in the document graph. Its children in the TableOfContents are its cells; the table's text
 * block is assembled on first use from the terminal nodes below it and then cached in {@code textBlock}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TableNode implements SemanticNode {

    List<Integer> tocId;

    // Back-reference to the whole tree: excluded from equals/hashCode like in the other node classes,
    // so that comparing or hashing a table does not walk the complete TableOfContents.
    @EqualsAndHashCode.Exclude
    TableOfContents tableOfContents;

    Integer numberOfRows;
    Integer numberOfCols;
    TextBlock textBlock;

    @Builder.Default
    @EqualsAndHashCode.Exclude
    Set<EntityNode> entities = new HashSet<>();


    /**
     * @return all direct children of this table, cast to TableCellNode
     */
    public Stream<TableCellNode> streamTableCells() {

        return streamChildren().map(node -> (TableCellNode) node);
    }


    /**
     * @return all cells of this table which are flagged as header
     */
    public Stream<TableCellNode> streamHeaders() {

        return streamTableCells().filter(TableCellNode::isHeader);
    }


    /**
     * @param row row index of the cell
     * @param col column index of the cell
     * @return all header cells that share the row or the column with the given position
     */
    public Stream<TableCellNode> streamHeadersForCell(int row, int col) {

        return streamHeaders().filter(cell -> cell.getRow() == row || cell.getCol() == col);
    }


    /**
     * Returns the cached text block, building it first if necessary by concatenating the
     * terminal text blocks of all terminal sub nodes in order.
     *
     * @return the text block covering this table
     */
    @Override
    public TextBlock buildTextBlock() {

        if (textBlock == null) {
            textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
        }
        return textBlock;
    }


    /**
     * @return "tocId: TABLE: summary", where the summary comes from the table's text block
     */
    @Override
    public String toString() {

        return tocId.toString() + ": " + NodeType.TABLE + ": " + buildTextBlock().buildSummary();
    }

}

View File

@ -1,131 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.FieldDefaults;
/**
 * The smallest unit of text in the document graph: a piece of text on a single page together with the
 * position of its characters.
 * String indices passed to the methods of this class are absolute, i.e. they lie within {@code boundary};
 * the entries of {@code lineBreaks} and the indices of {@code stringIdxToPositionIdx} are relative to
 * {@code boundary.start()}.
 */
@Data
@Builder
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class AtomicTextBlock implements TextBlock {

    Long id;
    Integer numberOnPage;
    PageNode page;

    //string coordinates
    Boundary boundary;
    String searchText;
    List<Integer> lineBreaks;

    //position coordinates
    List<Integer> stringIdxToPositionIdx;
    List<Rectangle2D> positions;

    @EqualsAndHashCode.Exclude
    SemanticNode parent;


    /**
     * @return the number of lines, i.e. one more than the number of line breaks
     */
    @Override
    public int numberOfLines() {

        return lineBreaks.size() + 1;
    }


    /**
     * Returns the text of a single line.
     *
     * @param lineNumber zero-based index of the line
     * @return the text between the surrounding line breaks; the whole text if the block has no line breaks
     * @throws IndexOutOfBoundsException if lineNumber is negative or not smaller than numberOfLines()
     */
    public CharSequence getLine(int lineNumber) {

        int lineCount = numberOfLines();
        if (lineNumber >= lineCount || lineNumber < 0) {
            throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, lineCount));
        }
        int offset = boundary.start();
        // The first line starts at the block start and the last line ends at the block end. Deciding both ends
        // independently also covers a block without any line break, where line 0 is first and last at once
        // (this case used to fail on lineBreaks.get(0)).
        int lineStart = lineNumber == 0 ? offset : lineBreaks.get(lineNumber - 1) + offset;
        int lineEnd = lineNumber == lineCount - 1 ? boundary.end() : lineBreaks.get(lineNumber) + offset;
        return subSequence(lineStart, lineEnd);
    }


    /**
     * @return an immutable list containing only this block
     */
    @Override
    public List<AtomicTextBlock> getAtomicTextBlocks() {

        return List.of(this);
    }


    /**
     * @param fromIndex absolute string index
     * @return absolute index of the first line break located after fromIndex, or the index at searchText.length() past
     * the block start if there is none
     */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                .filter(linebreak -> linebreak > fromIndex - boundary.start()) //
                .findFirst() //
                .orElse(searchText.length()) + boundary.start();
    }


    /**
     * @param fromIndex absolute string index
     * @return absolute index of the last line break located at or before fromIndex, or the block start if there is none
     */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return lineBreaks.stream()//
                .filter(linebreak -> linebreak <= fromIndex - boundary.start())//
                .reduce((a, b) -> b)// keeps the last matching element
                .orElse(0) + boundary.start();
    }


    /**
     * @param stringIdx absolute string index
     * @return the position rectangle mapped to the character at stringIdx
     */
    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
    }


    /**
     * @param stringBoundary absolute boundary, must be contained in the boundary of this block
     * @return a view of the position rectangles belonging to the characters in stringBoundary
     * @throws IndexOutOfBoundsException if stringBoundary is not contained in this block
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        if (!containsBoundary(stringBoundary)) {
            throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary));
        }
        // The end of the block is looked up as "end of positions" instead of through stringIdxToPositionIdx.
        if (stringBoundary.end() == this.boundary.end()) {
            return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()), positions.size());
        }
        return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()),
                stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
    }


    /**
     * Splits stringBoundary at the line breaks it contains and unions the character positions of each part
     * into one rectangle per line.
     *
     * @param stringBoundary absolute boundary inside this block
     * @return a single EntityPosition on the page of this block
     */
    @Override
    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {

        List<Rectangle2D> positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList())
                .stream()
                .map(this::getPositions)
                .map(RectangleTransformations::rectangleUnion)
                .toList();

        return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build());
    }


    /**
     * @return the search text of this block
     */
    @Override
    public String toString() {

        return searchText;
    }

}

View File

@ -1,179 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * A TextBlock made of consecutive AtomicTextBlocks. Its boundary spans from the start of the first to the end of
 * the last block, and its search text is assembled lazily from the atomic blocks and cached.
 */
@Data
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ConcatenatedTextBlock implements TextBlock {

    List<AtomicTextBlock> atomicTextBlocks;
    // Lazily built by getSearchText(); null means "not built yet or outdated".
    String searchText;
    Boundary boundary;


    /**
     * @param atomicTextBlocks consecutive blocks to concatenate; an empty list yields an empty block with boundary (-1, -1)
     * @throws UnsupportedOperationException if the blocks are not consecutive
     */
    public ConcatenatedTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        this.atomicTextBlocks = new LinkedList<>();
        if (atomicTextBlocks.isEmpty()) {
            boundary = new Boundary(-1, -1);
            return;
        }
        var firstTextBlock = atomicTextBlocks.get(0);
        this.atomicTextBlocks.add(firstTextBlock);
        boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());

        atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
    }


    /**
     * Appends all atomic blocks of textBlock to this block and extends the boundary accordingly.
     *
     * @param textBlock the block to append; unless this block is still empty, it must start where this block ends
     * @return this block
     * @throws UnsupportedOperationException if textBlock does not directly follow this block
     */
    public ConcatenatedTextBlock concat(TextBlock textBlock) {

        if (this.atomicTextBlocks.isEmpty()) {
            boundary.setStart(textBlock.getBoundary().start());
            boundary.setEnd(textBlock.getBoundary().end());
        } else if (boundary.end() != textBlock.getBoundary().start()) {
            throw new UnsupportedOperationException(String.format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
        }
        this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
        boundary.setEnd(textBlock.getBoundary().end());
        // The cached text no longer covers all blocks, force getSearchText() to rebuild it.
        searchText = null;
        return this;
    }


    /**
     * @param stringIdx absolute string index
     * @return an atomic block whose boundary contains stringIdx
     * @throws IndexOutOfBoundsException if no atomic block contains stringIdx
     */
    private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {

        return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
    }


    /**
     * @param boundary absolute boundary
     * @return all atomic blocks whose boundary intersects the given boundary, in order
     */
    private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {

        return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
    }


    /**
     * @return the search texts of all atomic blocks joined without separator; built on first use and cached until the
     * next concat
     */
    @Override
    public String getSearchText() {

        if (searchText == null) {
            StringBuilder sb = new StringBuilder();
            getAtomicTextBlocks().forEach(atb -> sb.append(atb.getSearchText()));
            searchText = sb.toString();
        }
        return searchText;
    }


    /**
     * NOTE(review): this sums the number of line breaks of the atomic blocks, whereas AtomicTextBlock.numberOfLines()
     * returns its number of line breaks plus one. Confirm which count is intended.
     *
     * @return the total number of line breaks of all atomic blocks
     */
    @Override
    public int numberOfLines() {

        return atomicTextBlocks.stream().map(AtomicTextBlock::getLineBreaks).mapToInt(List::size).sum();
    }


    /**
     * @param fromIndex absolute string index, must lie in one of the atomic blocks
     * @return the next line break as reported by the atomic block containing fromIndex
     */
    @Override
    public int getNextLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getNextLinebreak(fromIndex);
    }


    /**
     * @param fromIndex absolute string index, must lie in one of the atomic blocks
     * @return the previous line break as reported by the atomic block containing fromIndex
     */
    @Override
    public int getPreviousLinebreak(int fromIndex) {

        return getAtomicTextBlockByStringIndex(fromIndex).getPreviousLinebreak(fromIndex);
    }


    /**
     * NOTE(review): the values are passed on unchanged from the atomic blocks, which appear to store them relative
     * to their own start. Confirm that callers expect this.
     *
     * @return the line breaks of all atomic blocks in order
     */
    @Override
    public List<Integer> getLineBreaks() {

        return getAtomicTextBlocks().stream().flatMap(atomicTextBlock -> atomicTextBlock.getLineBreaks().stream()).toList();
    }


    /**
     * @param stringIdx absolute string index, must lie in one of the atomic blocks
     * @return the position of the character at stringIdx
     */
    @Override
    public Rectangle2D getPosition(int stringIdx) {

        return getAtomicTextBlockByStringIndex(stringIdx).getPosition(stringIdx);
    }


    /**
     * Collects the character positions for a boundary that may span several atomic blocks: the tail of the first
     * block, all positions of the blocks in between and the head of the last block.
     *
     * @param stringBoundary absolute boundary, must intersect at least one atomic block
     * @return the positions of all characters in stringBoundary
     */
    @Override
    public List<Rectangle2D> getPositions(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getPositions(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getPositions());
        }

        var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return positions;
    }


    /**
     * Collects the per-line rectangles for a boundary that may span several atomic blocks and merges the results
     * of blocks located on the same page.
     *
     * @param stringBoundary absolute boundary, must intersect at least one atomic block
     * @return one EntityPosition per page touched by stringBoundary
     */
    @Override
    public List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary) {

        List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);

        if (textBlocks.size() == 1) {
            return textBlocks.get(0).getEntityPositionsPerPage(stringBoundary);
        }

        AtomicTextBlock firstTextBlock = textBlocks.get(0);
        List<EntityPosition> positions = new LinkedList<>(firstTextBlock.getEntityPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));

        for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
            positions.addAll(textBlock.getEntityPositionsPerPage(textBlock.getBoundary()));
        }

        AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
        positions.addAll(lastTextBlock.getEntityPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));

        return mergeEntityPositionsWithSamePageNode(positions);
    }


    /**
     * @param positions EntityPositions, possibly several per page
     * @return one EntityPosition per page, carrying the rectangles of all inputs for that page
     */
    private List<EntityPosition> mergeEntityPositionsWithSamePageNode(List<EntityPosition> positions) {

        Map<PageNode, List<Rectangle2D>> entityPositionsPerPage = positions.stream().collect(//
                Collectors.groupingBy(EntityPosition::getPageNode, //
                        Collectors.flatMapping(entityPosition -> entityPosition.getRectanglePerLine().stream(), Collectors.toList())));

        return entityPositionsPerPage.entrySet().stream()//
                .map(entry -> EntityPosition.builder().pageNode(entry.getKey()).rectanglePerLine(entry.getValue()).build())//
                .toList();
    }


    /**
     * @return the search text of this block
     */
    @Override
    public String toString() {

        return getSearchText();
    }

}

View File

@ -1,125 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
/**
 * A piece of document text with position information. String indices used by this interface are absolute:
 * the default methods translate them by subtracting {@code getBoundary().start()} before accessing the search text.
 */
public interface TextBlock extends CharSequence {

    String getSearchText();


    List<AtomicTextBlock> getAtomicTextBlocks();


    Boundary getBoundary();


    int getNextLinebreak(int fromIndex);


    int getPreviousLinebreak(int fromIndex);


    List<Integer> getLineBreaks();


    Rectangle2D getPosition(int stringIdx);


    List<Rectangle2D> getPositions(Boundary stringBoundary);


    List<EntityPosition> getEntityPositionsPerPage(Boundary stringBoundary);


    int numberOfLines();


    /**
     * @param searchTerm the text to look for
     * @return absolute index of the first occurrence of searchTerm, or -1 if it does not occur
     */
    default int indexOf(String searchTerm) {

        int blockStart = getBoundary().start();
        return indexOf(searchTerm, blockStart);
    }


    /**
     * @return unmodifiable set of the pages of all atomic blocks
     */
    default Set<PageNode> getPages() {

        return getAtomicTextBlocks().stream()
                .map(AtomicTextBlock::getPage)
                .collect(Collectors.toUnmodifiableSet());
    }


    /**
     * @param searchTerm  the text to look for
     * @param startOffset absolute index at which the search starts
     * @return absolute index of the first occurrence of searchTerm at or after startOffset, or -1 if there is none
     */
    default int indexOf(String searchTerm, int startOffset) {

        int relativeHit = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
        return relativeHit == -1 ? -1 : relativeHit + getBoundary().start();
    }


    /**
     * @return the text from the start of the block up to the first line break after the start
     */
    default CharSequence getFirstLine() {

        int blockStart = getBoundary().start();
        int firstBreak = getNextLinebreak(getBoundary().start());
        return subSequence(blockStart, firstBreak);
    }


    /**
     * @param boundary absolute boundary to test
     * @return true if the boundary of this block contains the given boundary
     * @throws IllegalArgumentException if the end of the given boundary lies before its start
     */
    default boolean containsBoundary(Boundary boundary) {

        boolean reversed = boundary.end() < boundary.start();
        if (reversed) {
            throw new IllegalArgumentException(String.format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
        }
        return getBoundary().contains(boundary);
    }


    /**
     * @param stringIndex absolute string index
     * @return true if the boundary of this block contains stringIndex
     */
    default boolean containsIndex(int stringIndex) {

        return getBoundary().contains(stringIndex);
    }


    /**
     * @param boundary absolute boundary
     * @return the text between the start and the end of the boundary
     */
    default CharSequence subSequence(Boundary boundary) {

        int from = boundary.start();
        int to = boundary.end();
        return subSequence(from, to);
    }


    /**
     * @return at most the first four pieces of the search text when split at single spaces, joined by single spaces
     */
    default String buildSummary() {

        return Arrays.stream(getSearchText().split(" "))
                .limit(4)
                .collect(Collectors.joining(" "));
    }


    /**
     * @param start absolute start index, inclusive
     * @param end   absolute end index, exclusive
     * @return the matching part of the search text
     */
    @Override
    default CharSequence subSequence(int start, int end) {

        String text = getSearchText();
        return text.substring(start - getBoundary().start(), end - getBoundary().start());
    }


    /**
     * @return the length of the boundary of this block
     */
    @Override
    default int length() {

        return getBoundary().length();
    }


    /**
     * @param index absolute string index
     * @return the character of the search text at that index
     */
    @Override
    default char charAt(int index) {

        String text = getSearchText();
        return text.charAt(index - getBoundary().start());
    }

}

View File

@ -1,50 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock;
import java.util.Collections;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import lombok.NoArgsConstructor;
/**
 * Collector concatenating a stream of consecutive TextBlocks into a single ConcatenatedTextBlock.
 * The blocks must arrive in document order, because ConcatenatedTextBlock.concat rejects blocks that do not
 * directly follow the ones collected so far.
 */
@NoArgsConstructor
public class TextBlockCollector implements Collector<TextBlock, ConcatenatedTextBlock, TextBlock> {

    /**
     * @return supplier of an empty ConcatenatedTextBlock used as result container
     */
    @Override
    public Supplier<ConcatenatedTextBlock> supplier() {

        return () -> new ConcatenatedTextBlock(Collections.emptyList());
    }


    /**
     * @return function appending a TextBlock to the result container
     */
    @Override
    public BiConsumer<ConcatenatedTextBlock, TextBlock> accumulator() {

        return ConcatenatedTextBlock::concat;
    }


    /**
     * @return function appending the second partial result to the first one
     */
    @Override
    public BinaryOperator<ConcatenatedTextBlock> combiner() {

        return ConcatenatedTextBlock::concat;
    }


    /**
     * @return identity function, the container itself is the result
     */
    @Override
    public Function<ConcatenatedTextBlock, TextBlock> finisher() {

        return a -> a;
    }


    /**
     * Only IDENTITY_FINISH is declared. CONCURRENT must not be advertised: it would allow a stream to call the
     * accumulator on one shared container from several threads, but ConcatenatedTextBlock is neither thread-safe
     * nor independent of the encounter order.
     *
     * @return the characteristics of this collector
     */
    @Override
    public Set<Characteristics> characteristics() {

        return Set.of(Characteristics.IDENTITY_FINISH);
    }

}

View File

@ -1,146 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class DocumentDataMapper {
public DocumentData toDocumentData(DocumentGraph documentGraph) {
List<AtomicTextBlockData> atomicTextBlockData = documentGraph.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct()
.map(DocumentDataMapper::toAtomicTextBlockData)
.toList();
List<AtomicPositionBlockData> atomicPositionBlockData = documentGraph.streamTerminalTextBlocksInOrder()
.flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream())
.distinct()
.map(DocumentDataMapper::toAtomicPositionBlockData)
.toList();
List<PageData> pageData = documentGraph.getPages().stream().map(DocumentDataMapper::toPageData).toList();
TableOfContentsData tableOfContentsData = toTableOfContentsData(documentGraph.getTableOfContents());
return DocumentData.builder()
.atomicTextBlocks(atomicTextBlockData.toArray(new AtomicTextBlockData[0]))
.atomicPositionBlocks(atomicPositionBlockData.toArray(new AtomicPositionBlockData[0]))
.pages(pageData.toArray(new PageData[0]))
.tableOfContents(tableOfContentsData)
.build();
}
private TableOfContentsData toTableOfContentsData(TableOfContents tableOfContents) {
return new TableOfContentsData(toEntryData(tableOfContents.getRoot()));
}
private TableOfContentsData.EntryData toEntryData(TableOfContents.Entry entry) {
Long[] atomicTextBlocks;
if (entry.getNode().isTerminal()) {
atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getTerminalTextBlock());
} else {
atomicTextBlocks = new Long[]{};
}
Map<String, String> properties = switch (entry.getType()) {
case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.getNode());
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.getNode());
case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.getNode());
default -> new HashMap<>();
};
return TableOfContentsData.EntryData.builder()
.tocId(toPrimitiveIntArray(entry.getTocId()))
.subEntries(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
.type(entry.getType())
.atomicBlocks(atomicTextBlocks)
.pages(entry.getNode().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new))
.properties(properties)
.build();
}
private Long[] toAtomicTextBlockIds(TextBlock textBlock) {
return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new);
}
private PageData toPageData(PageNode p) {
return PageData.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build();
}
private AtomicTextBlockData toAtomicTextBlockData(AtomicTextBlock atomicTextBlock) {
return AtomicTextBlockData.builder()
.id(atomicTextBlock.getId())
.page(atomicTextBlock.getPage().getNumber().longValue())
.searchText(atomicTextBlock.getSearchText())
.numberOnPage(atomicTextBlock.getNumberOnPage())
.start(atomicTextBlock.getBoundary().start())
.end(atomicTextBlock.getBoundary().end())
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
.build();
}
/**
 * Maps the positional part of an atomic text block (rectangles and the string-index-to-position-index lookup)
 * to its serializable form, keyed by the same id as the text data.
 */
private AtomicPositionBlockData toAtomicPositionBlockData(AtomicTextBlock atomicTextBlock) {
    float[][] positions = toPrimitiveFloatMatrix(atomicTextBlock.getPositions());
    int[] stringIdxToPositionIdx = toPrimitiveIntArray(atomicTextBlock.getStringIdxToPositionIdx());

    return AtomicPositionBlockData.builder()
            .id(atomicTextBlock.getId())
            .positions(positions)
            .stringIdxToPositionIdx(stringIdxToPositionIdx)
            .build();
}
/**
 * Flattens rectangles into rows of {@code [minX, minY, width, height]}, narrowing each coordinate to float.
 */
private float[][] toPrimitiveFloatMatrix(List<Rectangle2D> positions) {
    float[][] matrix = new float[positions.size()][];
    int row = 0;
    for (Rectangle2D rectangle : positions) {
        matrix[row++] = new float[]{
                (float) rectangle.getMinX(),
                (float) rectangle.getMinY(),
                (float) rectangle.getWidth(),
                (float) rectangle.getHeight()};
    }
    return matrix;
}
/**
 * Unboxes a list of integers into an {@code int[]} of the same length and order.
 */
private int[] toPrimitiveIntArray(List<Integer> list) {
    return list.stream().mapToInt(Integer::intValue).toArray();
}
}

View File

@ -1,229 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER;
import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import com.google.common.primitives.Ints;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
import lombok.experimental.UtilityClass;
/**
 * Rebuilds a {@link DocumentGraph} from its serialized {@link DocumentData} representation.
 */
@UtilityClass
public class DocumentGraphMapper {

    /**
     * Reconstructs the document graph: pages are built first, then the table-of-contents tree with its semantic
     * nodes and text blocks, and finally the graph's own text block is derived from the finished tree.
     *
     * @param documentData serialized pages, atomic text blocks, atomic position blocks and table of contents
     * @return the rebuilt graph
     */
    public DocumentGraph toDocumentGraph(DocumentData documentData) {

        DocumentGraph documentGraph = new DocumentGraph();
        Context context = new Context(documentData,
                new TableOfContents(documentGraph),
                new LinkedList<>(),
                new LinkedList<>(),
                Arrays.stream(documentData.getAtomicTextBlocks()).toList(),
                Arrays.stream(documentData.getAtomicPositionBlocks()).toList());

        // Pages must exist before the entries are built, because every entry looks up its pages by number.
        context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());

        context.tableOfContents.getRoot().getChildren().addAll(buildEntries(documentData.getTableOfContents().getRoot().getSubEntries(), context));

        documentGraph.setTableOfContents(context.tableOfContents);
        documentGraph.setPages(new HashSet<>(context.pages));
        documentGraph.setNumberOfPages(documentData.getPages().length);
        documentGraph.setTextBlock(documentGraph.buildTextBlock());
        return documentGraph;
    }


    /**
     * Recursively converts serialized entries into table-of-contents entries, creating the matching semantic node
     * for each entry and registering that node on all pages it belongs to (as header, footer or main-body element).
     *
     * @throws UnsupportedOperationException if an entry has a node type without a builder here
     */
    private List<TableOfContents.Entry> buildEntries(List<TableOfContentsData.EntryData> entries,
                                                     Context context) {

        List<TableOfContents.Entry> newEntries = new LinkedList<>();
        for (TableOfContentsData.EntryData entryData : entries) {

            boolean terminal = isTerminal(entryData);
            List<PageNode> pages = Arrays.stream(entryData.getPages()).map(pageNumber -> getPage(pageNumber, context)).toList();

            SemanticNode node = switch (entryData.getType()) {
                case SECTION -> buildSection(context);
                case PARAGRAPH -> buildParagraph(context, terminal);
                case HEADLINE -> buildHeadline(context, terminal);
                case HEADER -> buildHeader(context, terminal);
                case FOOTER -> buildFooter(context, terminal);
                case TABLE -> buildTable(context, entryData.getProperties());
                case TABLE_CELL -> buildTableCell(context, entryData.getProperties(), terminal);
                case IMAGE -> buildImage(context, entryData.getProperties());
                default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
            };

            if (node.isTerminal()) {
                TextBlock textBlock = toTextBlock(entryData.getAtomicBlocks(), context, node);
                node.setTerminalTextBlock(textBlock);
            }
            List<Integer> tocId = Arrays.stream(entryData.getTocId()).boxed().toList();
            node.setTocId(tocId);

            if (entryData.getType() == HEADER) {
                pages.forEach(page -> page.setHeader((HeaderNode) node));
            } else if (entryData.getType() == FOOTER) {
                pages.forEach(page -> page.setFooter((FooterNode) node));
            } else {
                pages.forEach(page -> page.getMainBody().add(node));
            }

            // Children are built after the parent was added to its pages, so a parent precedes its children in mainBody.
            newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.getType()).children(buildEntries(entryData.getSubEntries(), context)).node(node).build());
        }
        return newEntries;
    }


    private HeadlineNode buildHeadline(Context context, boolean terminal) {

        return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
    }


    /**
     * An entry counts as terminal when it references at least one atomic text block.
     */
    private static boolean isTerminal(TableOfContentsData.EntryData entryData) {

        return entryData.getAtomicBlocks().length > 0;
    }


    private ImageNode buildImage(Context context, Map<String, String> properties) {

        var builder = ImageNode.builder();
        PropertiesMapper.parseImageProperties(properties, builder);
        return builder.tableOfContents(context.tableOfContents()).build();
    }


    private TableCellNode buildTableCell(Context context, Map<String, String> properties, boolean terminal) {

        TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder();
        PropertiesMapper.parseTableCellProperties(properties, builder);
        return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build();
    }


    /**
     * Builds a table node from its serialized properties.
     * The node is built from the same builder the properties were parsed into; building from a fresh
     * {@code TableNode.builder()} would drop the parsed number of rows and columns.
     */
    private TableNode buildTable(Context context, Map<String, String> properties) {

        TableNode.TableNodeBuilder builder = TableNode.builder();
        PropertiesMapper.parseTableProperties(properties, builder);
        return builder.tableOfContents(context.tableOfContents()).build();
    }


    private FooterNode buildFooter(Context context, boolean terminal) {

        return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
    }


    private HeaderNode buildHeader(Context context, boolean terminal) {

        return HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
    }


    private SectionNode buildSection(Context context) {

        return SectionNode.builder().tableOfContents(context.tableOfContents()).build();
    }


    private ParagraphNode buildParagraph(Context context, boolean terminal) {

        return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build();
    }


    /**
     * Assembles the text block of a terminal node from the referenced atomic blocks.
     * Each id is used directly as the index into the text and position data lists of the context.
     * NOTE(review): this assumes ids equal the array positions in the serialized data - confirm against the writer side.
     */
    private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) {

        return Arrays.stream(atomicTextBlockIds)
                .map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)),
                        context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)),
                        parent,
                        context))
                .collect(new TextBlockCollector());
    }


    private PageNode buildPage(PageData p) {

        return PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build();
    }


    /**
     * Merges the separately stored text data and position data of one atomic block back into a single object.
     */
    private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData,
                                              AtomicPositionBlockData atomicPositionBlockData,
                                              SemanticNode parent,
                                              Context context) {

        return AtomicTextBlock.builder()
                .id(atomicTextBlockData.getId())
                .numberOnPage(atomicTextBlockData.getNumberOnPage())
                .page(getPage(atomicTextBlockData.getPage(), context))
                .boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd()))
                .searchText(atomicTextBlockData.getSearchText())
                .lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks()))
                .stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx()))
                .positions(toRectangle2DList(atomicPositionBlockData.getPositions()))
                .parent(parent)
                .build();
    }


    /**
     * Turns rows of {@code [x, y, width, height]} back into rectangles.
     */
    private static List<Rectangle2D> toRectangle2DList(float[][] positions) {

        return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], floatArr[3])).toList();
    }


    /**
     * Looks up an already built page by its page number.
     *
     * @throws NoSuchElementException if no page with that number exists in the context
     */
    private PageNode getPage(Long pageIndex, Context context) {

        return context.pages.stream()
                .filter(page -> page.getNumber() == Math.toIntExact(pageIndex))
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex)));
    }


    /**
     * Shared state for one mapping run: the source data, the table of contents under construction,
     * the pages built so far and the atomic block data as index-addressable lists.
     */
    record Context(
            DocumentData layoutParsingModel,
            TableOfContents tableOfContents,
            List<PageNode> pages,
            List<SectionNode> sections,
            List<AtomicTextBlockData> atomicTextBlockData,
            List<AtomicPositionBlockData> atomicPositionBlockData) {

    }

}

View File

@ -1,101 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.mapper;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode;
import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations;
/**
 * Converts node-specific attributes of images, tables and table cells to and from the flat
 * {@code Map<String, String>} used in the serialized table of contents.
 */
public class PropertiesMapper {

    /**
     * Serializes image type, transparency flag and position of an image node.
     */
    public static Map<String, String> buildImageProperties(ImageNode image) {

        Map<String, String> properties = new HashMap<>();
        properties.put("imageType", image.getImageType().toString());
        properties.put("transparency", String.valueOf(image.isTransparency()));
        properties.put("position", PropertiesMapper.toString(image.getPosition()));
        return properties;
    }


    /**
     * Serializes row, column, header flag and bounding box of a table cell.
     *
     * @throws IllegalArgumentException         if the cell is spread over more than one page
     * @throws java.util.NoSuchElementException if the cell is not assigned to any page
     */
    public static Map<String, String> buildTableCellProperties(TableCellNode tableCell) {

        Map<String, String> properties = new HashMap<>();
        properties.put("row", String.valueOf(tableCell.getRow()));
        properties.put("col", String.valueOf(tableCell.getCol()));
        properties.put("header", String.valueOf(tableCell.isHeader()));

        if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) {
            throw new IllegalArgumentException("TableCell can only occur on a single page!");
        }
        // orElseThrow() keeps the NoSuchElementException of the former get(), but states the intent explicitly.
        String bBoxString = PropertiesMapper.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().orElseThrow()));
        properties.put("bBox", bBoxString);

        return properties;
    }


    /**
     * Serializes the number of rows and columns of a table.
     */
    public static Map<String, String> buildTableProperties(TableNode table) {

        Map<String, String> properties = new HashMap<>();
        properties.put("numberOfRows", String.valueOf(table.getNumberOfRows()));
        properties.put("numberOfCols", String.valueOf(table.getNumberOfCols()));
        return properties;
    }


    /**
     * Applies the serialized image properties to the given builder.
     */
    public static void parseImageProperties(Map<String, String> properties, ImageNode.ImageNodeBuilder builder) {

        builder.imageType(parseImageType(properties.get("imageType")));
        builder.transparency(Boolean.parseBoolean(properties.get("transparency")));
        builder.position(parseRectangle2D(properties.get("position")));
    }


    /**
     * Applies the serialized table cell properties to the given builder.
     */
    public static void parseTableCellProperties(Map<String, String> properties, TableCellNode.TableCellNodeBuilder builder) {

        builder.row(Integer.parseInt(properties.get("row")));
        builder.col(Integer.parseInt(properties.get("col")));
        builder.header(Boolean.parseBoolean(properties.get("header")));
        builder.bBox(parseRectangle2D(properties.get("bBox")));
    }


    /**
     * Applies the serialized table properties to the given builder.
     */
    public static void parseTableProperties(Map<String, String> properties, TableNode.TableNodeBuilder builder) {

        builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows")));
        builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols")));
    }


    /**
     * Maps the serialized name to an image type; any name not listed here becomes {@link ImageType#OTHER}.
     */
    private static ImageType parseImageType(String imageType) {

        return switch (imageType) {
            case "LOGO" -> ImageType.LOGO;
            case "FORMULA" -> ImageType.FORMULA;
            case "SIGNATURE" -> ImageType.SIGNATURE;
            case "OCR" -> ImageType.OCR;
            default -> ImageType.OTHER;
        };
    }


    /**
     * Formats a rectangle as {@code x,y,width,height}.
     * The locale is pinned to {@link java.util.Locale#ROOT} so the decimal separator is always a dot. With the
     * default locale, a decimal-comma locale would emit eight comma-separated tokens and
     * {@link #parseRectangle2D(String)} would silently read wrong coordinates.
     */
    public static String toString(Rectangle2D rectangle2D) {

        return String.format(java.util.Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
    }


    /**
     * Parses the {@code x,y,width,height} form produced by {@link #toString(Rectangle2D)}.
     *
     * @throws NumberFormatException     if a component is not a valid float
     * @throws IndexOutOfBoundsException if fewer than four components are present
     */
    public static Rectangle2D parseRectangle2D(String bBox) {

        List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }

}

View File

@ -2,9 +2,26 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
import java.util.Map; import java.util.Map;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Builder; import lombok.Builder;
@Builder @Builder
public record LayoutParsingFinishedEvent(Map<String, String> identifier, long duration, int numberOfPages, String message) { @Schema(description = "Object containing information about the layout parsing.")
public record LayoutParsingFinishedEvent(
@Schema(description = "General purpose identifier. It is returned exactly the same way it is inserted with the LayoutParsingRequest.") //
Map<String, String> identifier,
@Schema(description = "The duration of a single layout parsing in ms.") //
long duration,
@Schema(description = "The number of pages of the parsed document.") //
int numberOfPages,
@Schema(description = "A general message. It contains some information useful for a developer, like the paths where the files are stored. Not meant to be machine readable.") //
String message,
@Schema(description = "The app version of the layout parser.") //
String layoutParserVersion
) {
} }

View File

@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
public class LayoutParsingQueueNames { public class LayoutParsingQueueNames {
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "LAYOUTPARSING_REQUEST_QUEUE"; public static final String LAYOUT_PARSING_REQUEST_QUEUE_PREFIX = "layout_parsing_request";
public static final String LAYOUT_PARSING_DLQ = "LAYOUTPARSING_DLQ"; public static final String LAYOUT_PARSING_REQUEST_EXCHANGE = "layout_parsing_request_exchange";
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "LAYOUTPARSING_FINISHED_EVENT_QUEUE"; public static final String LAYOUT_PARSING_RESPONSE_QUEUE_PREFIX = "layout_parsing_response";
public static final String LAYOUT_PARSING_RESPONSE_EXCHANGE = "layout_parsing_response_exchange";
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_error";
} }

View File

@ -3,18 +3,45 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Builder; import lombok.Builder;
import lombok.NonNull;
@Builder @Builder
@Schema(description = "Object containing all storage paths the service needs to know.")
public record LayoutParsingRequest( public record LayoutParsingRequest(
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
@NonNull LayoutParsingType layoutParsingType,
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
Map<String, String> identifier, Map<String, String> identifier,
String originFileStorageId,
Optional<String> tablesFileStorageId, @Schema(description = "Path to the original PDF file.")//
Optional<String> imagesFileStorageId, @NonNull String originFileStorageId,//
String structureFileStorageId,
String textBlockFileStorageId, @Schema(description = "Optional Path to the table extraction file.")//
String positionBlockFileStorageId, Optional<String> tablesFileStorageId,//
String pageFileStorageId) { @Schema(description = "Optional Path to the image classification file.")//
Optional<String> imagesFileStorageId,//
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
@Schema(description = "Path where the Document Structure File will be stored.")//
@NonNull String structureFileStorageId,//
@Schema(description = "Path where the Research Data File will be stored.")//
String researchDocumentStorageId,//
@Schema(description = "Path where the Document Text File will be stored.")//
@NonNull String textBlockFileStorageId,//
@Schema(description = "Path where the Document Positions File will be stored.")//
@NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,//
@Schema(description = "Path where the Document Markdown File will be stored.")//
Optional<String> documentMarkdownFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId
) {
} }

View File

@ -0,0 +1,12 @@
package com.knecon.fforesight.service.layoutparser.internal.api.queue;
/**
 * Selects which layout parsing variant is performed.
 * A value is carried as {@code layoutParsingType} on {@code LayoutParsingRequest}, and
 * {@code LayoutParserSettings} holds a {@code layoutParsingTypeOverride} of this type.
 * NOTE(review): what each variant changes in the pipeline is not visible here - document it where the values are consumed.
 */
public enum LayoutParsingType {
REDACT_MANAGER,
REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG,
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
DOCUMINE,
DOCUMINE_OLD,
CLARIFYND,
CLARIFYND_PARAGRAPH_DEBUG
}

View File

@ -1,10 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
/**
 * Callback used by {@code EntityInsertionService} after an entity has been attached to the graph.
 */
public interface EntityEnrichmentService {
/**
 * Enriches the given entity using the supplied text block.
 * {@code EntityInsertionService} passes the text block of the entity's deepest fully containing node.
 * NOTE(review): what "enrich" adds to the entity is implementation-defined and not visible here.
 */
void enrichEntity(EntityNode entity, TextBlock textBlock);
}

View File

@ -1,56 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import java.util.Collections;
import java.util.NoSuchElementException;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
import lombok.RequiredArgsConstructor;
/**
 * Attaches entities to the document graph and links them with the pages and nodes they touch.
 */
@RequiredArgsConstructor
public class EntityInsertionService {

    private final EntityEnrichmentService entityEnrichmentService;


    /**
     * Inserts the entity below the first node whose text block contains the entity's boundary, enriches it and
     * registers it on the affected pages and nodes. If a {@link NoSuchElementException} is raised anywhere in
     * that process (in particular when no containing node exists), the entity is removed from the graph instead.
     */
    public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) {

        try {
            SemanticNode containingNode = findContainingNode(entity, tableOfContents);
            containingNode.addThisToEntityIfIntersects(entity);

            TextBlock enrichmentText = entity.getDeepestFullyContainingNode().buildTextBlock();
            entityEnrichmentService.enrichEntity(entity, enrichmentText);

            addToPages(entity);
            addToNodeEntitySets(entity);
        } catch (NoSuchElementException e) {
            entity.removeFromGraph();
        }
    }


    /**
     * Returns the first node streamed for the empty toc id whose text block contains the entity's boundary.
     */
    private SemanticNode findContainingNode(EntityNode entity, TableOfContents tableOfContents) {

        return tableOfContents.streamChildrenNodes(Collections.emptyList())
                .filter(candidate -> candidate.buildTextBlock().containsBoundary(entity.getBoundary()))
                .findFirst()
                .orElseThrow(() -> new NoSuchElementException("No containing Node found!"));
    }


    /**
     * Links the entity and the pages of its deepest fully containing node in both directions.
     */
    private void addToPages(EntityNode entity) {

        Set<PageNode> containingPages = entity.getDeepestFullyContainingNode().getPages();
        entity.getPages().addAll(containingPages);
        for (PageNode page : containingPages) {
            page.getEntities().add(entity);
        }
    }


    /**
     * Registers the entity on every node it intersects.
     */
    private void addToNodeEntitySets(EntityNode entity) {

        entity.getIntersectingNodes().forEach(intersectingNode -> {
            intersectingNode.getEntities().add(entity);
        });
    }

}

View File

@ -1,95 +0,0 @@
package com.knecon.fforesight.service.layoutparser.internal.api.services;
import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock;
import lombok.experimental.UtilityClass;
/**
 * Geometry helpers for {@link Rectangle2D}: padding, bounding-box unions and a comma-separated string form.
 */
@UtilityClass
public class RectangleTransformations {

    /**
     * Returns a copy of the rectangle grown by {@code deltaX} on the left and right and by {@code deltaY}
     * on the top and bottom. Negative deltas shrink the rectangle.
     */
    public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {

        return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
    }


    /**
     * Returns the bounding box enclosing all positions of all given atomic text blocks.
     */
    public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
    }


    /**
     * Returns the bounding box enclosing all given rectangles.
     */
    public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {

        return rectangle2DList.stream().collect(new Rectangle2DUnion());
    }


    /**
     * Formats a rectangle as {@code x,y,width,height}.
     * The locale is pinned to {@link java.util.Locale#ROOT} so the decimal separator is always a dot. With the
     * default locale, a decimal-comma locale would emit eight comma-separated tokens and
     * {@link #parseRectangle2D(String)} would silently read wrong coordinates.
     */
    public static String toString(Rectangle2D rectangle2D) {

        return format(java.util.Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
    }


    /**
     * Parses the {@code x,y,width,height} form produced by {@link #toString(Rectangle2D)}.
     *
     * @throws NumberFormatException     if a component is not a valid float
     * @throws IndexOutOfBoundsException if fewer than four components are present
     */
    public static Rectangle2D parseRectangle2D(String bBox) {

        List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }


    /**
     * Collector that adds every rectangle to one {@link Area} and finishes with the bounds of that area.
     */
    private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {

        @Override
        public Supplier<Area> supplier() {

            return Area::new;
        }


        @Override
        public BiConsumer<Area, Rectangle2D> accumulator() {

            return (area, rectangle2D) -> area.add(new Area(rectangle2D));
        }


        @Override
        public BinaryOperator<Area> combiner() {

            return (area1, area2) -> {
                area1.add(area2);
                return area1;
            };
        }


        @Override
        public Function<Area, Rectangle2D> finisher() {

            return Area::getBounds2D;
        }


        @Override
        public Set<Characteristics> characteristics() {

            // CONCURRENT is deliberately not declared: it would allow several threads to accumulate into the
            // same Area, which is not a concurrent container. The union itself is order-independent.
            return Set.of(Characteristics.UNORDERED);
        }

    }

}

View File

@ -0,0 +1,16 @@
<!-- Log4j2 configuration: everything is written to standard output through a single console appender. -->
<Configuration>
<Appenders>
<Console name="CONSOLE" target="SYSTEM_OUT">
<!-- Pattern: time, thread, level padded to 5 chars, logger name shortened to 36 chars, message. -->
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Default level is warn; only loggers below com.iqser are raised to info. -->
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<!-- NOTE(review): the Java sources of this project live under com.knecon.fforesight; confirm whether that package should be raised to info as well. -->
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>

View File

@ -0,0 +1,40 @@
// Build script of the layoutparser-service-processor module.
plugins {
id("com.knecon.fforesight.java-conventions")
id("io.freefair.lombok") version "8.4"
}
description = "layoutparser-service-processor"
// Versions shared by several dependencies below.
val jacksonVersion = "2.15.2"
val pdfBoxVersion = "3.0.0"
dependencies {
// Sibling modules of this build.
implementation(project(":layoutparser-service-internal-api"))
implementation(project(":viewer-doc-processor"))
// The document version is read from the root project's extra properties.
implementation("com.knecon.fforesight:document:${rootProject.extra.get("documentVersion")}")
// NOTE(review): the version suffix "-RED9010.0" looks like a ticket/branch build - confirm it is the intended release.
implementation("com.iqser.red.service:persistence-service-shared-api-v1:2.564.0-RED9010.0") {
exclude("org.springframework.boot", "spring-boot-starter-security")
exclude("org.springframework.boot", "spring-boot-starter-validation")
}
// storage-commons is excluded from tenant-commons and declared explicitly right below.
implementation("com.knecon.fforesight:tenant-commons:0.30.0") {
exclude("com.iqser.red.commons", "storage-commons")
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2")
implementation("org.apache.pdfbox:jbig2-imageio:3.0.4")
implementation("com.github.jai-imageio:jai-imageio-core:1.4.0")
implementation("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
implementation("org.tinspin:tinspin-indexes:2.1.3")
implementation("org.commonmark:commonmark:0.22.0")
implementation("org.commonmark:commonmark-ext-gfm-tables:0.22.0")
implementation("com.pdftron:PDFNet:10.11.0")
implementation("org.apache.commons:commons-text:1.12.0")
}

View File

@ -1,129 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build of the layoutparser-service-processor module; inherits from the layoutparser-service parent. -->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service</artifactId>
<version>1.0.0</version>
</parent>
<artifactId>layoutparser-service-processor</artifactId>
<!-- Versions written as ${...} are not defined in this file; presumably they come from the parent POM. -->
<dependencies>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-internal-api-v1</artifactId>
<version>2.36.0</version>
</dependency>
<dependency>
<groupId>com.knecon.fforesight</groupId>
<artifactId>layoutparser-service-internal-api</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
<version>1.13.0</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>spring-commons</artifactId>
<version>6.2.0</version>
</dependency>
<dependency>
<groupId>com.dslplatform</groupId>
<artifactId>dsl-json-java8</artifactId>
<version>1.10.0</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-afterburner</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-security</artifactId>
<version>${spring.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<version>${spring.version}</version>
</dependency>
<!-- Lombok is declared without a version and marked optional, so it is not passed on to dependents. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
<version>4.0.2</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
<version>${spring.version}</version>
</dependency>
</dependencies>
<!-- Spring milestone repository serves releases only, Spring snapshot repository serves snapshots only. -->
<repositories>
<repository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
<repository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<releases>
<enabled>false</enabled>
</releases>
</repository>
</repositories>
<!-- The same two repositories are used for resolving build plugins. -->
<pluginRepositories>
<pluginRepository>
<id>spring-milestones</id>
<name>Spring Milestones</name>
<url>https://repo.spring.io/milestone</url>
<snapshots>
<enabled>false</enabled>
</snapshots>
</pluginRepository>
<pluginRepository>
<id>spring-snapshots</id>
<name>Spring Snapshots</name>
<url>https://repo.spring.io/snapshot</url>
<releases>
<enabled>false</enabled>
</releases>
</pluginRepository>
</pluginRepositories>
</project>

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Configuration properties bound from the {@code layoutparser} prefix.
 * Fields are private through {@code @FieldDefaults}; accessors are generated by {@code @Data}.
 */
@Data
@Configuration
@ConfigurationProperties("layoutparser")
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParserSettings {
// Debug switch; NOTE(review): what it enables is decided by the consumers of this class - not visible here.
boolean debug;
// Presumably replaces the layout parsing type of incoming requests when set - confirm against the usage site.
LayoutParsingType layoutParsingTypeOverride;
}

View File

@ -0,0 +1,474 @@
package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.VisualLayoutParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Spring service that drives layout parsing for a single PDF: it parses the file page by page into a
 * {@link ClassificationDocument}, builds the document graph from it, writes the layout grid into the viewer
 * document and stores the resulting files through {@link LayoutParsingStorageService}.
 */
@SuppressWarnings("PMD.CloseResource")
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutParsingPipeline {
// Collaborators; all final fields are supplied through the Lombok-generated constructor (@RequiredArgsConstructor).
final ImageServiceResponseAdapter imageServiceResponseAdapter;
final CvTableParsingAdapter cvTableParsingAdapter;
final LayoutParsingStorageService layoutParsingStorageService;
final SectionsBuilderService sectionsBuilderService;
final SimplifiedSectionTextService simplifiedSectionTextService;
final RulingCleaningService rulingCleaningService;
final TableExtractionService tableExtractionService;
// One blockification service per family of LayoutParsingType values (see the switch in parseLayout).
final DocuMineBlockificationService docuMineBlockificationService;
final RedactManagerBlockificationService redactManagerBlockificationService;
final BlockificationPostprocessingService blockificationPostprocessingService;
final DocstrumBlockificationService docstrumBlockificationService;
final LayoutGridService layoutGridService;
// Used to create the "build-document-graph" observation and to tag the current observation with page count and file size.
final ObservationRegistry observationRegistry;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
final GraphicExtractorService graphicExtractorService;
final OutlineExtractorService outlineExtractorService;
final SectionTreeBuilderService sectionTreeBuilderService;
final SectionTreeEnhancementService sectionTreeEnhancementService;
// Provides the debug flag and an optional override of the requested LayoutParsingType.
final LayoutParserSettings settings;
final ClassificationService classificationService;
// Injected from the LAYOUT_PARSER_VERSION property; the placeholder defaults to an empty string when the property is not set.
@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;
/**
 * Runs the complete layout parsing for one request and stores all resulting files.
 * <p>
 * The origin file (and, if available, the viewer document) is fetched via the storage service, parsed,
 * converted into a document graph, visualized in the viewer document and finally stored. The local origin
 * and viewer document files are removed again when the method exits, whether it succeeds or fails.
 *
 * @param layoutParsingRequest the request carrying the storage ids of all input and output files
 * @return the finished event containing page count, duration, a summary message and the parser version
 * @throws IOException if fetching the input files fails
 */
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {

    long start = System.currentTimeMillis();
    log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());

    File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
    File viewerDocumentFile = null;
    try {
        // Falls back to the origin file when no viewer document is returned by the storage service.
        viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
                .orElse(originFile);

        VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
                .map(layoutParsingStorageService::getVisualLayoutParsingFile)
                .orElseGet(VisualLayoutParsingResponse::new);
        ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
                .map(layoutParsingStorageService::getImagesFile)
                .orElseGet(ImageServiceResponse::new);
        TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
                .map(layoutParsingStorageService::getTablesFile)
                .orElseGet(TableServiceResponse::new);

        // A configured override takes precedence over the type requested by the caller.
        LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
                ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();

        ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
                                                                    originFile,
                                                                    imageServiceResponse,
                                                                    tableServiceResponse,
                                                                    visualLayoutParsingResponse,
                                                                    layoutParsingRequest.identifier());

        log.info("Building document graph for {}", layoutParsingRequest.identifier());
        DocumentWithVisualization documentWithVisualization = observeBuildDocumentGraph(layoutParsingType, classificationDocument);

        log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
        layoutGridService.addLayoutGrid(viewerDocumentFile, documentWithVisualization, viewerDocumentFile, layoutParsingType, layoutParserVersion, false);

        log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
        layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentWithVisualization.document()));
        if (layoutParsingRequest.documentMarkdownFileStorageId()
                .isPresent()) {
            layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
                                                                  .get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
        }
        layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
        layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);

        if (layoutParsingRequest.researchDocumentStorageId() != null) {
            log.info("Building research document data for {}", layoutParsingRequest.identifier());
            var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentWithVisualization.document());
            layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
        }

        return LayoutParsingFinishedEvent.builder()
                .identifier(layoutParsingRequest.identifier())
                .numberOfPages(documentWithVisualization.document().getNumberOfPages())
                .duration(System.currentTimeMillis() - start)
                .message(format("""
                                Layout parsing has finished in %.02f s.
                                identifiers: %s
                                %s
                                Files have been saved with Ids:
                                Structure: %s
                                Text: %s
                                Positions: %s
                                PageData: %s
                                Simplified Text: %s
                                Viewer Doc: %s""",
                                ((float) (System.currentTimeMillis() - start)) / 1000,
                                layoutParsingRequest.identifier(),
                                buildSemanticNodeCountMessage(documentWithVisualization.document().getNumberOfPages(), documentWithVisualization.buildSemanticNodeCounts()),
                                layoutParsingRequest.structureFileStorageId(),
                                layoutParsingRequest.textBlockFileStorageId(),
                                layoutParsingRequest.positionBlockFileStorageId(),
                                layoutParsingRequest.pageFileStorageId(),
                                layoutParsingRequest.simplifiedTextStorageId(),
                                layoutParsingRequest.viewerDocumentStorageId()))
                .layoutParserVersion(layoutParserVersion)
                .build();
    } finally {
        // Cleanup must not be wrapped in `assert`: assertions are disabled by default, which would skip the
        // delete entirely and leak the local files. Running it in `finally` also covers the failure path.
        if (viewerDocumentFile != null && !viewerDocumentFile.equals(originFile)) {
            deleteTempFile(viewerDocumentFile);
        }
        deleteTempFile(originFile);
    }
}

/**
 * Best-effort removal of a local working file; a failed delete is logged instead of aborting the request.
 */
private static void deleteTempFile(File file) {

    if (file.exists() && !file.delete()) {
        log.warn("Could not delete temporary file {}", file);
    }
}
/**
 * Builds the document graph inside an observation named "LayoutParsingPipeline" with the contextual name
 * "build-document-graph".
 *
 * @param layoutParsingType      the parsing type forwarded to the graph factory
 * @param classificationDocument the classified document to convert
 * @return the document graph together with its visualization, as returned by {@link DocumentGraphFactory}
 */
private DocumentWithVisualization observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) {

    // observe(Supplier) hands the supplier's result back, so no AtomicReference side channel is needed.
    return Observation.createNotStarted("LayoutParsingPipeline", observationRegistry)
            .contextualName("build-document-graph")
            .observe(() -> DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument));
}
/**
 * Formats a one-line summary of how many pages and semantic nodes of each reported type were parsed.
 *
 * @param numberOfPages      number of pages of the parsed document
 * @param semanticNodeCounts node counts per type; types without an entry are reported as 0
 * @return the formatted summary
 */
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {

    String template = "%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed";
    return format(template,
                  numberOfPages,
                  countOrZero(semanticNodeCounts, NodeType.SECTION),
                  countOrZero(semanticNodeCounts, NodeType.HEADLINE),
                  countOrZero(semanticNodeCounts, NodeType.PARAGRAPH),
                  countOrZero(semanticNodeCounts, NodeType.TABLE),
                  countOrZero(semanticNodeCounts, NodeType.TABLE_CELL),
                  countOrZero(semanticNodeCounts, NodeType.HEADER),
                  countOrZero(semanticNodeCounts, NodeType.FOOTER));
}

/**
 * Returns the count stored for the given node type, or 0 when the map yields null for it.
 */
private static long countOrZero(Map<NodeType, Long> counts, NodeType nodeType) {

    Long count = counts.get(nodeType);
    return count == null ? 0L : count;
}
/**
 * Parses the layout of a PDF page by page and classifies the resulting blocks.
 * <p>
 * For every page the words, rulings, table cells, graphics, images and signatures are collected into a
 * {@link ClassificationPage}. Afterwards the whole document is classified, a section tree is created and,
 * depending on the parsing type, either paragraph-debug sections are built or section blocks and images are
 * assigned.
 *
 * @param layoutParsingType           selects the blockification strategy and the section building mode
 * @param originFile                  the PDF file to parse; it is opened (and periodically re-opened) by this method
 * @param imageServiceResponse        image service result, converted into classified images per page
 * @param tableServiceResponse        table service result, converted into CV-parsed table cells per page
 * @param visualLayoutParsingResponse visual layout parsing result, used to extract signatures per page
 * @param identifier                  request identifier used for logging; a "debug" key activates the layout debug layer
 * @return the classified document including its section tree
 */
@SneakyThrows
@Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout")
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
                                          File originFile,
                                          ImageServiceResponse imageServiceResponse,
                                          TableServiceResponse tableServiceResponse,
                                          VisualLayoutParsingResponse visualLayoutParsingResponse,
                                          Map<String, String> identifier) {

    ClassificationDocument classificationDocument = new ClassificationDocument();
    PDDocument originDocument = openDocument(originFile);
    // try/finally guarantees the PDF is released even when page processing throws; previously the document
    // stayed open on every failure path.
    try {
        addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));

        Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
        Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
        Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);

        if (settings.isDebug() || identifier.containsKey("debug")) {
            classificationDocument.getLayoutDebugLayer().setActive(true);
        }

        List<ClassificationPage> classificationPages = new ArrayList<>();

        classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));

        long pageCount = originDocument.getNumberOfPages();

        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {

            if (pageNumber % 100 == 0) {
                // re-open document every once in a while to save on RAM. This has no significant performance impact.
                // This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
                originDocument.close();
                originDocument = openDocument(originFile);
            }

            if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
                log.info("Extracting text on Page {} for {}", pageNumber, identifier);
            }

            classificationDocument.setPages(classificationPages);
            PDFLinesTextStripper stripper = new PDFLinesTextStripper();
            PDPage pdPage = originDocument.getPage(pageNumber - 1);
            stripper.setPageNumber(pageNumber);
            stripper.setStartPage(pageNumber);
            stripper.setEndPage(pageNumber);
            stripper.setPdpage(pdPage);
            stripper.getText(originDocument);
            List<Word> words = stripper.getWords();

            // rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now

            if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
                var lines = TextPositionOperations.groupByLine(new HashSet<>(words));
                classificationDocument.getLayoutDebugLayer().addLineVisualizationsFromNestedTextPosition(lines, pageNumber);
                words = TextPositionOperations.sortWords(lines);
            }
            classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);

            PDRectangle pdr = pdPage.getMediaBox();

            List<Ruling> rulings = stripper.getRulings();
            classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
            CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);

            PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
            List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
            classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);

            TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);

            List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);

            // Path-element graphics are added to the page's images as GRAPHIC entries. The lambda reads the page
            // number from the stripper because the loop variable is not effectively final.
            pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
                    .addAll(graphics.stream()
                                    .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
                                                                  ImageType.GRAPHIC,
                                                                  false,
                                                                  stripper.getPageNumber(),
                                                                  ""))
                                    .toList());

            ClassificationPage classificationPage = switch (layoutParsingType) {
                case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
                case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
                case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
                        docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
                case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
                        docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
            };

            updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);

            blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);

            classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
            // MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
            classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));

            // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
            if (pdfImages.containsKey(pageNumber)) {
                classificationPage.setImages(pdfImages.get(pageNumber));
                imageServiceResponseAdapter.findOcr(classificationPage);
            }

            if (signatures.containsKey(pageNumber)) {
                if (classificationPage.getImages() == null || classificationPage.getImages().isEmpty()) {
                    classificationPage.setImages(signatures.get(pageNumber));
                } else {
                    classificationPage.getImages().addAll(signatures.get(pageNumber));
                }
            }

            tableExtractionService.extractTables(emptyTableCells, classificationPage);

            buildPageStatistics(classificationPage);
            increaseDocumentStatistics(classificationPage, classificationDocument);

            classificationPages.add(classificationPage);
        }
    } finally {
        // Closes whichever instance is current (the document is re-opened every 100 pages).
        // NOTE(review): relies on PDDocument.close() tolerating an already closed document when a re-open failed - confirm for the PDFBox version in use.
        originDocument.close();
    }

    // The PDF is already closed here, as before: classification and section building work on the collected pages only.
    classificationService.classify(classificationDocument, layoutParsingType, identifier);

    SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
    classificationDocument.setSectionTree(sectionTree);

    log.info("Building Sections for {}", identifier);

    switch (layoutParsingType) {
        case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
        default -> sectionTreeEnhancementService.assignSectionBlocksAndImages(classificationDocument);
    }

    return classificationDocument;
}
/**
 * Copies page-level metadata (rulings, rotation, orientation, page number and dimensions) onto the
 * classification page.
 * <p>
 * A page counts as landscape when its media box is wider than tall at a rotation of 0 or 180 degrees, or
 * taller than wide at a rotation of 90 or 270 degrees.
 */
private static void updateClassificationPage(PDPage pdPage,
                                             PDRectangle pdr,
                                             ClassificationPage classificationPage,
                                             CleanRulings cleanRulings,
                                             int pageNumber,
                                             PageInformation pageInformation) {

    int pageRotation = pdPage.getRotation();

    boolean uprightRotation = pageRotation == 0 || pageRotation == 180;
    boolean sidewaysRotation = pageRotation == 90 || pageRotation == 270;
    boolean widerThanTall = pdr.getWidth() > pdr.getHeight();
    boolean tallerThanWide = pdr.getHeight() > pdr.getWidth();
    boolean landscape = (widerThanTall && uprightRotation) || (tallerThanWide && sidewaysRotation);

    classificationPage.setCleanRulings(cleanRulings);
    classificationPage.setRotation(pageRotation);
    classificationPage.setLandscape(landscape);
    classificationPage.setPageNumber(pageNumber);

    float width = (float) pageInformation.width();
    float height = (float) pageInformation.height();
    classificationPage.setPageWidth(width);
    classificationPage.setPageHeight(height);
}
/**
 * For each text direction, averages the exact direction of all characters with that direction and, if the
 * average is not 0, transforms every word of that direction with a rotation by that average around the
 * center of the page's media box.
 */
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {

    for (TextDirection direction : TextDirection.values()) {

        double meanExactDir = words.stream()
                .flatMap(word -> word.getCharacters().stream())
                .map(Character::getTextPosition)
                .filter(position -> position.getDir().equals(direction))
                .mapToDouble(RedTextPosition::getExactDir)
                .average()
                .orElse(0);

        if (meanExactDir != 0) {
            AffineTransform rotation = AffineTransform.getRotateInstance(meanExactDir, pdPage.getMediaBox().getWidth() / 2, pdPage.getMediaBox().getHeight() / 2);

            for (Word candidate : words) {
                if (direction.equals(candidate.getDir())) {
                    candidate.transform(rotation);
                }
            }
        }
    }
}
/**
 * Tags the current observation, if there is one, with the page count and the file size as high-cardinality
 * key values.
 */
private void addNumberOfPagesToTrace(int numberOfPages, long size) {

    Observation currentObservation = observationRegistry.getCurrentObservation();
    if (currentObservation == null) {
        return;
    }
    currentObservation.highCardinalityKeyValue("numberOfPages", Integer.toString(numberOfPages));
    currentObservation.highCardinalityKeyValue("fileSize", Long.toString(size));
}
/**
 * Loads the given file as a PDF and marks all security to be removed on the loaded document.
 *
 * @param originFile the PDF file to load
 * @return the opened document; the caller is responsible for closing it
 */
@SneakyThrows
private PDDocument openDocument(File originFile) {

    PDDocument loadedPdf = Loader.loadPDF(originFile);
    loadedPdf.setAllSecurityToBeRemoved(true);
    return loadedPdf;
}
/**
 * Converts the marked contents of a page into bounding boxes per line, keyed by marked content type.
 * Only the header and footer types are extracted.
 */
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {

    Map<String, List<Rectangle2D>> bboxesPerType = new HashMap<>();
    for (String contentType : new String[]{MarkedContentUtils.HEADER, MarkedContentUtils.FOOTER}) {
        bboxesPerType.put(contentType, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, contentType));
    }
    return bboxesPerType;
}
/**
 * Adds the page's font size, font, text height and font style counts to the document-wide counters.
 */
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {

    var pageFontSizes = classificationPage.getFontSizeCounter().getCountPerValue();
    document.getFontSizeCounter().addAll(pageFontSizes);

    var pageFonts = classificationPage.getFontCounter().getCountPerValue();
    document.getFontCounter().addAll(pageFonts);

    var pageTextHeights = classificationPage.getTextHeightCounter().getCountPerValue();
    document.getTextHeightCounter().addAll(pageTextHeights);

    var pageFontStyles = classificationPage.getFontStyleCounter().getCountPerValue();
    document.getFontStyleCounter().addAll(pageFontStyles);
}
/**
 * Fills the page's text height, font, font size and font style counters from the words of every
 * {@link TextPageBlock} in the page's text blocks. Blocks of other types and text blocks without words are
 * skipped.
 */
private void buildPageStatistics(ClassificationPage classificationPage) {

    // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame.
    for (AbstractPageBlock block : classificationPage.getTextBlocks()) {
        if (!(block instanceof TextPageBlock textPageBlock)) {
            continue;
        }
        var blockWords = textPageBlock.getWords();
        if (blockWords == null) {
            continue;
        }
        for (Word blockWord : blockWords) {
            classificationPage.getTextHeightCounter().add(blockWord.getTextHeight());
            classificationPage.getFontCounter().add(blockWord.getFont());
            classificationPage.getFontSizeCounter().add(blockWord.getFontSize());
            classificationPage.getFontStyleCounter().add(blockWord.getFontStyle());
        }
    }
}
}

View File

@ -1,88 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph;
import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Spring service that parses the layout of a stored PDF into a {@link DocumentGraph} and stores the
 * resulting document data.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class LayoutParsingService {

    private final ImageServiceResponseAdapter imageServiceResponseAdapter;
    private final CvTableParsingAdapter cvTableParsingAdapter;
    private final LayoutParsingStorageService layoutParsingStorageService;
    private final PdfParsingService pdfParsingService;
    private final ClassificationService classificationService;
    private final SectionsBuilderService sectionsBuilderService;
    private final DocumentGraphFactory documentGraphFactory;


    /**
     * Loads the origin PDF and the optional image/table service results, parses the layout and stores the
     * document data.
     *
     * @param layoutParsingRequest the request carrying the storage ids of all input and output files
     * @return the finished event with page count, duration and a message listing the stored file ids
     * @throws IOException if reading an input file or closing the origin document fails
     */
    public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {

        long start = System.currentTimeMillis();

        DocumentGraph documentGraph;
        int numberOfPages;
        // try-with-resources closes the origin document even when loading the side files or parsing fails.
        try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {

            ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
            if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
                // Load the file whose presence was just checked; the page file id used here before pointed at an output file of this service.
                imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
            }

            TableServiceResponse tableServiceResponse = new TableServiceResponse();
            if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
                // Same correction as for the images file above.
                tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
            }

            documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
            numberOfPages = originDocument.getNumberOfPages();
        }

        layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));

        return LayoutParsingFinishedEvent.builder()
                .identifier(layoutParsingRequest.identifier())
                .numberOfPages(numberOfPages)
                .duration(System.currentTimeMillis() - start)
                .message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
                                layoutParsingRequest.structureFileStorageId(),
                                layoutParsingRequest.textBlockFileStorageId(),
                                layoutParsingRequest.positionBlockFileStorageId(),
                                layoutParsingRequest.pageFileStorageId()))
                .build();
    }


    /**
     * Parses the PDF into a classification document, classifies it, builds its sections and converts the
     * result into a document graph.
     *
     * @param originDocument       the opened PDF; it is not closed by this method
     * @param imageServiceResponse image service result, converted into classified images per page
     * @param tableServiceResponse table service result, converted into CV-parsed tables per page
     * @return the document graph built from the classified document
     */
    public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {

        ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
                                                                                        cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
                                                                                        imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));

        classificationService.classifyDocument(classificationDocument);

        sectionsBuilderService.buildSections(classificationDocument);

        return documentGraphFactory.buildDocumentGraph(classificationDocument);
    }

}

View File

@ -1,10 +1,23 @@
package com.knecon.fforesight.service.layoutparser.processor; package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
@Configuration @Configuration
@ComponentScan @ComponentScan
public class LayoutParsingServiceProcessorConfiguration { public class LayoutParsingServiceProcessorConfiguration {
/**
 * Registers a {@link PDFTronViewerDocumentService} bean that is constructed with the given observation registry.
 *
 * @param registry the observation registry passed to the service's constructor
 * @return the viewer document service instance
 */
@Bean
@Autowired
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
return new PDFTronViewerDocumentService(registry);
}
} }

View File

@ -1,30 +1,36 @@
package com.knecon.fforesight.service.layoutparser.processor; package com.knecon.fforesight.service.layoutparser.processor;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.apache.commons.io.IOUtils; import org.springframework.core.task.TaskExecutor;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.server.data.DocumentData;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
import com.knecon.fforesight.tenantcommons.TenantContext;
import io.micrometer.observation.annotation.Observed;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@ -35,67 +41,104 @@ public class LayoutParsingStorageService {
private final StorageService storageService; private final StorageService storageService;
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
private final TaskExecutor taskExecutor;
public PDDocument getOriginFile(String storageId) throws IOException {
try (var originDocumentInputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) { @Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
File tempFile = createTempFile("document", ".pdf"); public File getOriginFile(String storageId) throws IOException {
try (var tempFileOutputStream = new FileOutputStream(tempFile)) {
IOUtils.copy(originDocumentInputStream, tempFileOutputStream); File tempFile = createTempFile("document", ".pdf");
} storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
return Loader.loadPDF(tempFile, MemoryUsageSetting.setupMixed(67108864L)); return tempFile;
}
/**
 * Downloads the viewer document with the given storage id into a temporary file.
 *
 * @param storageId storage id of the viewer document
 * @return the downloaded file, or an empty Optional when no object exists for the id or the downloaded
 *         file is not of the current viewer document version
 * @throws IOException if the temporary file cannot be created or the download fails
 */
@Observed(name = "LayoutParsingStorageService", contextualName = "get-viewer-doc-file")
public Optional<File> getViewerDocFile(String storageId) throws IOException {

    if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
        return Optional.empty();
    }

    File tempFile = createTempFile("viewerDocument", ".pdf");
    storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);

    if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
        // The delete must not live inside an `assert`: with assertions disabled (the JVM default) the
        // statement is skipped and the outdated temp file would never be removed.
        if (!tempFile.delete()) {
            log.warn("Could not delete outdated viewer document temp file {}", tempFile);
        }
        return Optional.empty();
    }
    return Optional.of(tempFile);
}
@SneakyThrows
public ImageServiceResponse getImagesFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {
ImageServiceResponse imageServiceResponse = objectMapper.readValue(inputStream, ImageServiceResponse.class);
inputStream.close();
return imageServiceResponse;
} }
} }
public ImageServiceResponse getImagesFile(String storageId) throws IOException { @SneakyThrows
public TableServiceResponse getTablesFile(String storageId) {
try (InputStream inputStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) { try (var tableClassificationStream = getObject(storageId)) {
return objectMapper.readValue(inputStream, ImageServiceResponse.class); TableServiceResponse tableServiceResponse = objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
tableClassificationStream.close();
return tableServiceResponse;
} }
} }
public TableServiceResponse getTablesFile(String storageId) throws IOException { @SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
try (var tableClassificationStream = storageService.getObject(TenantContext.getTenantId(), storageId).getInputStream()) {
return objectMapper.readValue(tableClassificationStream, TableServiceResponse.class);
try (InputStream inputStream = getObject(storageId)) {
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
} }
} }
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException { @SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data")
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getTableOfContents()); Runnable storeDocumentStructureRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks()); layoutParsingRequest.structureFileStorageId(),
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks()); documentData.getDocumentStructure());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
CompletableFuture<Void> storeDocumentStructureFuture = CompletableFuture.runAsync(storeDocumentStructureRunnable, taskExecutor);
Runnable storeDocumentTextDataRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.textBlockFileStorageId(),
documentData.getDocumentTextData());
CompletableFuture<Void> storeDocumentTextDataFuture = CompletableFuture.runAsync(storeDocumentTextDataRunnable, taskExecutor);
Runnable storeDocumentPositionsRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
documentData.getDocumentPositionData());
CompletableFuture<Void> storeDocumentPositionsFuture = CompletableFuture.runAsync(storeDocumentPositionsRunnable, taskExecutor);
Runnable storeDocumentPagesRunnable = () -> storageService.storeProtoObject(TenantContext.getTenantId(),
layoutParsingRequest.pageFileStorageId(),
documentData.getDocumentPages());
CompletableFuture<Void> storeDocumentPagesFuture = CompletableFuture.runAsync(storeDocumentPagesRunnable, taskExecutor);
CompletableFuture.allOf(storeDocumentStructureFuture, storeDocumentTextDataFuture, storeDocumentPositionsFuture, storeDocumentPagesFuture).join();
} }
public DocumentData readDocumentData(LayoutParsingRequest layoutParsingRequest) throws IOException { public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
PageData[] pageData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), PageData[].class); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
AtomicTextBlockData[] atomicTextBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.textBlockFileStorageId(),
AtomicTextBlockData[].class);
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
AtomicPositionBlockData[].class);
TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.structureFileStorageId(),
TableOfContentsData.class);
return DocumentData.builder()
.tableOfContents(tableOfContentsData)
.atomicPositionBlocks(atomicPositionBlockData)
.atomicTextBlocks(atomicTextBlockData)
.pages(pageData)
.build();
} }
@ -123,4 +166,43 @@ public class LayoutParsingStorageService {
} }
} }
/**
 * Stores the simplified text as a JSON object for the current tenant, using the
 * simplified-text storage id carried by the request.
 *
 * @param layoutParsingRequest request that supplies the target storage id
 * @param simplifiedText       payload handed to the storage service
 */
@Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text")
public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) {

    var tenantId = TenantContext.getTenantId();
    var targetStorageId = layoutParsingRequest.simplifiedTextStorageId();
    storageService.storeJSONObject(tenantId, targetStorageId, simplifiedText);
}
/**
 * Downloads the stored object for the current tenant into a temporary file and returns a
 * stream over that file.
 * <p>
 * The stream is opened with {@link StandardOpenOption#DELETE_ON_CLOSE}, so the temporary file is
 * removed when the caller closes the stream. If the download or opening the stream fails, no
 * stream exists to trigger that deletion, so the temporary file is removed here before the
 * failure is propagated.
 *
 * @param storageId id of the object to download
 * @return stream over the downloaded content; the caller must close it
 */
@SneakyThrows
private InputStream getObject(String storageId) {

    File tempFile = File.createTempFile("temp", ".data");
    try {
        storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
        return Files.newInputStream(tempFile.toPath(), StandardOpenOption.DELETE_ON_CLOSE);
    } catch (Exception e) {
        // Best-effort cleanup: the result is deliberately ignored so the original failure is what the caller sees.
        tempFile.delete();
        throw e;
    }
}
/**
 * Uploads the given viewer document file for the current tenant under the request's
 * viewer-document storage id. The file stream is closed once the upload call returns.
 *
 * @param layoutParsingRequest request that supplies the target storage id
 * @param out                  file whose content is uploaded
 */
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document")
public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) {

    try (var viewerDocumentStream = new FileInputStream(out)) {
        var tenantId = TenantContext.getTenantId();
        var targetStorageId = layoutParsingRequest.viewerDocumentStorageId();
        storageService.storeObject(tenantId, targetStorageId, viewerDocumentStream);
    }
}
/**
 * Encodes the markdown content as UTF-8 and stores it for the current tenant under the given
 * storage id.
 *
 * @param markdownFileStorageId target storage id
 * @param markdownContent       markdown text to store
 */
@SneakyThrows
@Observed(name = "LayoutParsingStorageService", contextualName = "store-markdown-file")
public void storeMarkdownFile(String markdownFileStorageId, String markdownContent) {

    byte[] utf8Bytes = markdownContent.getBytes(StandardCharsets.UTF_8);
    try (InputStream markdownStream = new ByteArrayInputStream(utf8Bytes)) {
        storageService.storeObject(TenantContext.getTenantId(), markdownFileStorageId, markdownStream);
    }
}
} }

View File

@ -1,49 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Converts the table service response model into the classification DTO cells, grouped by the
 * page number reported for each table.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class CvTableParsingAdapter {

    /**
     * Groups all converted table cells of the response by page number.
     *
     * @param tableServiceResponse response whose {@code data} entries are converted
     * @return mutable map from page number to the cells of every table on that page, in response order
     */
    public Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) {

        Map<Integer, List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell>> cellsByPage = new HashMap<>();
        for (var tableData : tableServiceResponse.getData()) {
            List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> cellsOnPage = cellsByPage.computeIfAbsent(tableData.getPageInfo()
                                                                                                                                                                              .getNumber(),
                                                                                                                                                                      pageNumber -> new ArrayList<>());
            cellsOnPage.addAll(convertTableCells(tableData.getTableCells()));
        }
        return cellsByPage;
    }


    /**
     * Copies the coordinates and dimensions of each response cell into a classification DTO cell.
     *
     * @param tableCells cells as delivered by the table service
     * @return newly built DTO cells in the same order
     */
    private Collection<? extends com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> convertTableCells(List<CvParsedTableCell> tableCells) {

        List<com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell> converted = new ArrayList<>();
        for (CvParsedTableCell source : tableCells) {
            com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell target = com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder()
                    .y0(source.getY0())
                    .x1(source.getX1())
                    .y1(source.getY1())
                    .x0(source.getX0())
                    .width(source.getWidth())
                    .height(source.getHeight())
                    .build();
            converted.add(target);
        }
        return converted;
    }

}

View File

@ -1,17 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import java.util.HashMap;
import java.util.Map;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Classification part of an image {@code Metadata} entry: a label plus a map of probabilities.
 * Accessors come from Lombok's {@code @Data}; the class is also marked for dsl-json via {@code @CompiledJson}.
 */
@Data
@CompiledJson
public class Classification {
// Probabilities keyed by string; presumably one entry per candidate label — TODO confirm against the image service schema.
private Map<String, Float> probabilities = new HashMap<>();
// Label string of this classification.
private String label;
}

View File

@ -1,14 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Geometry section of the image {@code Filters}: groups the image-size and image-format results.
 * Accessors come from Lombok's {@code @Data}; marked for dsl-json via {@code @CompiledJson}.
 */
@Data
@CompiledJson
public class FilterGeometry {
// Size-related result (see ImageSize).
private ImageSize imageSize;
// Format-related result (see Format).
private Format imageFormat;
}

View File

@ -1,15 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Filter results attached to an image {@code Metadata} entry.
 * Accessors come from Lombok's {@code @Data}; marked for dsl-json via {@code @CompiledJson}.
 */
@Data
@CompiledJson
public class Filters {
// Geometry-based filter results.
private FilterGeometry geometry;
// Probability-based filter result.
private Probability probability;
// Presumably true when the image passed every filter — TODO confirm with the image service contract.
private boolean allPassed;
}

View File

@ -1,15 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Format {
private float quotient;
private boolean tooTall;
private boolean tooWide;
}

View File

@ -1,14 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Geometry {
private float width;
private float height;
}

View File

@ -1,33 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
/**
 * Response model of the image service: identifiers plus two lists of image {@code Metadata}.
 * Accessors come from Lombok's {@code @Data} (except the explicit {@code setData} below); the class
 * is annotated for both Jackson and dsl-json.
 */
@Data
@CompiledJson
public class ImageServiceResponse {
private String dossierId;
private String fileId;
// Jackson: property name "imageMetadata", with "data" accepted as an alias.
// dsl-json: "imageMetadata" registered as an alternative name for this attribute.
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
private List<Metadata> data = new ArrayList<>();
// Second metadata list; NOTE(review): presumably results from a CV-based detector — confirm with the producer.
private List<Metadata> dataCV = new ArrayList<>();
// Explicit setter carrying the same naming annotations as the field, so the mapping also applies on the setter.
@JsonProperty(value = "imageMetadata")
@JsonAlias("data")
@JsonAttribute(alternativeNames = {"imageMetadata"})
public void setData(List<Metadata> data) {this.data = data;}
}

View File

@ -1,15 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class ImageSize {
private float quotient;
private boolean tooLarge;
private boolean tooSmall;
}

View File

@ -1,17 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Metadata {
private Classification classification;
private Position position;
private Geometry geometry;
private Filters filters;
private boolean alpha;
}

View File

@ -1,17 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Position {
private float x1;
private float x2;
private float y1;
private float y2;
private int pageNumber;
}

View File

@ -1,13 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class Probability {
private boolean unconfident;
}

View File

@ -1,16 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class CvParsedPageInfo {
private int number;
private int rotation;
private float width;
private float height;
}

View File

@ -1,18 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
@Data
@CompiledJson
public class CvParsedTableCell {
private float x0;
private float y0;
private float x1;
private float y1;
private float width;
private float height;
}

View File

@ -1,17 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * One {@code data} entry of a {@code TableServiceResponse}: page information plus the cells on it.
 * Accessors come from Lombok's {@code @Data}; marked for dsl-json via {@code @CompiledJson}.
 */
@Data
@CompiledJson
public class CvParsedTableModel {
private CvParsedPageInfo pageInfo;
// Defaults to an empty list rather than null.
private List<CvParsedTableCell> tableCells = new ArrayList<>();
}

View File

@ -1,22 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import lombok.Data;
/**
 * Response model of the table service: identifiers, operation/extension strings and the list of
 * parsed table models.
 * Accessors come from Lombok's {@code @Data}; marked for dsl-json via {@code @CompiledJson}.
 */
@Data
@CompiledJson
public class TableServiceResponse {
private String dossierId;
private String fileId;
private String operation;
private String targetFileExtension;
private String responseFileExtension;
// Defaults to an empty list rather than null.
private List<CvParsedTableModel> data = new ArrayList<>();
}

View File

@ -1,71 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.awt.geom.Rectangle2D;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Base class for page elements described by an axis-aligned box (minX/maxX/minY/maxY), a
 * classification string, a page number and a text orientation. Subclasses supply the text.
 * Accessors and constructors come from Lombok ({@code @Data}, {@code @AllArgsConstructor},
 * {@code @NoArgsConstructor}).
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public abstract class AbstractTextContainer {
protected float minX;
protected float maxX;
protected float minY;
protected float maxY;
protected String classification;
protected int page;
// Defaults to NONE.
private TextBlockOrientation orientation = TextBlockOrientation.NONE;
/** Returns the text of this container; defined by subclasses. */
public abstract String getText();
/**
 * Compares this box against the PDF coordinates of a text block.
 * On X this box must span the block (minX <= block min, maxX >= block max).
 * NOTE(review): on Y the comparison runs the other way round (minY >= block min, maxY <= block max);
 * confirm whether this asymmetry is intentional (e.g. a flipped Y axis) before relying on it.
 */
public boolean containsBlock(ClassificationTextBlock other) {
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
/**
 * Same comparison as {@link #containsBlock} but against another container's min/max fields;
 * the X and Y directions differ in the same way (see NOTE(review) there).
 */
public boolean contains(AbstractTextContainer other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
}
/**
 * Returns the result of {@code other.contains(...)} for this container's box
 * (x = minX, y = minY, width, height).
 * NOTE(review): as written this answers "does {@code other} contain this box", which is the
 * reverse of what the method name suggests — confirm against callers.
 */
public boolean contains(Rectangle2D other) {
return other.contains(minX, minY, getWidth(), getHeight());
}
/** Returns {@code maxY - minY}; excluded from Jackson and dsl-json serialization. */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getHeight() {
return maxY - minY;
}
/** Returns {@code maxX - minX}; excluded from Jackson and dsl-json serialization. */
@JsonIgnore
@JsonAttribute(ignore = true)
public float getWidth() {
return maxX - minX;
}
/** Returns true when the Y ranges of this container and {@code atc} overlap or touch (inclusive bounds). */
public boolean intersectsY(AbstractTextContainer atc) {
return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY();
}
}

View File

@ -1,16 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Holder for the text blocks of a footer. Accessors and the all-args constructor come from
 * Lombok ({@code @Data}, {@code @AllArgsConstructor}).
 */
@Data
@AllArgsConstructor
public class ClassificationFooter {
// Stored as passed in; no copy is made by this class.
private List<ClassificationTextBlock> textBlocks;
}

View File

@ -1,16 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Holder for the text blocks of a header. Accessors and the all-args constructor come from
 * Lombok ({@code @Data}, {@code @AllArgsConstructor}).
 */
@Data
@AllArgsConstructor
public class ClassificationHeader {
// Stored as passed in; no copy is made by this class.
private List<ClassificationTextBlock> textBlocks;
}

View File

@ -1,38 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A section made of page blocks and images with an optional headline. Accessors come from
 * Lombok's {@code @Data}.
 */
@Data
@NoArgsConstructor
public class ClassificationSection implements Comparable {

    private List<AbstractTextContainer> pageBlocks = new ArrayList<>();
    private List<ClassifiedImage> images = new ArrayList<>();
    private String headline;


    /**
     * Collects every page block that is a {@link Table}.
     *
     * @return a new mutable list of the tables, in page-block order
     */
    public List<Table> getTables() {

        List<Table> tablesInSection = new ArrayList<>();
        for (AbstractTextContainer pageBlock : pageBlocks) {
            if (pageBlock instanceof Table) {
                tablesInSection.add((Table) pageBlock);
            }
        }
        return tablesInSection;
    }


    /**
     * Always returns 0, i.e. every section compares as equal to any other object.
     */
    @Override
    public int compareTo(Object o) {

        return 0;
    }

}

View File

@ -1,77 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import lombok.Getter;
/**
 * Counts how often each float value was added and answers simple questions about the
 * resulting distribution.
 */
public class FloatFrequencyCounter {

    Map<Float, Integer> countPerValue = new HashMap<>();


    /**
     * @return the live map from value to its count (not a copy)
     */
    public Map<Float, Integer> getCountPerValue() {

        return countPerValue;
    }


    /**
     * Increments the count of {@code value}, starting at 1 for a value not seen before.
     */
    public void add(float value) {

        int updatedCount = countPerValue.containsKey(value) ? countPerValue.get(value) + 1 : 1;
        countPerValue.put(value, updatedCount);
    }


    /**
     * Adds the counts of another counter's map to this one, value by value.
     */
    public void addAll(Map<Float, Integer> otherCounter) {

        otherCounter.forEach((value, count) -> {
            if (!countPerValue.containsKey(value)) {
                countPerValue.put(value, count);
                return;
            }
            countPerValue.put(value, countPerValue.get(value) + count);
        });
    }


    /**
     * @return the value with the highest count, or {@code null} when nothing was counted; on equal
     *         counts the entry visited later in the map's iteration order wins
     */
    public Float getMostPopular() {

        Map.Entry<Float, Integer> leader = null;
        for (Map.Entry<Float, Integer> candidate : countPerValue.entrySet()) {
            boolean takesLead = leader == null || candidate.getValue() >= leader.getValue();
            if (takesLead) {
                leader = candidate;
            }
        }
        if (leader == null) {
            return null;
        }
        return leader.getKey();
    }


    /**
     * @return all counted values strictly greater than the most popular value, sorted descending
     */
    public List<Float> getHighterThanMostPopular() {

        Float threshold = getMostPopular();
        return countPerValue.keySet()
                .stream()
                .filter(value -> value > threshold)
                .sorted(Collections.reverseOrder())
                .collect(Collectors.toList());
    }


    /**
     * @return the largest counted value, or {@code null} when nothing was counted
     */
    public Float getHighest() {

        Float maximum = null;
        for (Float candidate : countPerValue.keySet()) {
            if (maximum == null || candidate > maximum) {
                maximum = candidate;
            }
        }
        return maximum;
    }

}

View File

@ -1,218 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
/**
* Ill-defined comparator, from when Rectangle was Comparable.
* <p>
* see https://github.com/tabulapdf/tabula-java/issues/116
*
* @deprecated with no replacement
*/
@Deprecated
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
@Override
public int compare(Rectangle o1, Rectangle o2) {
if (o1.equals(o2)) {
return 0;
}
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
} else {
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
}
}
};
public Rectangle() {
super();
}
public Rectangle(float top, float left, float width, float height) {
super();
this.setRect(left, top, width, height);
}
/**
* @param rectangles
* @return minimum bounding box that contains all the rectangles
*/
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
float minx = java.lang.Float.MAX_VALUE;
float miny = java.lang.Float.MAX_VALUE;
float maxx = java.lang.Float.MIN_VALUE;
float maxy = java.lang.Float.MIN_VALUE;
for (Rectangle r : rectangles) {
minx = (float) Math.min(r.getMinX(), minx);
miny = (float) Math.min(r.getMinY(), miny);
maxx = (float) Math.max(r.getMaxX(), maxx);
maxy = (float) Math.max(r.getMaxY(), maxy);
}
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
}
public int compareTo(Rectangle other) {
return ILL_DEFINED_ORDER.compare(this, other);
}
// I'm bad at Java and need this for fancy sorting in
// technology.tabula.TextChunk.
public int isLtrDominant() {
return 0;
}
public float getArea() {
return this.width * this.height;
}
public float verticalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
}
public boolean verticallyOverlaps(Rectangle other) {
return verticalOverlap(other) > 0;
}
public float horizontalOverlap(Rectangle other) {
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
}
public boolean horizontallyOverlaps(Rectangle other) {
return horizontalOverlap(other) > 0;
}
public float verticalOverlapRatio(Rectangle other) {
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - this.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - other.getTop()) / delta;
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
rv = (other.getBottom() - other.getTop()) / delta;
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
rv = (this.getBottom() - this.getTop()) / delta;
}
return rv;
}
public float overlapRatio(Rectangle other) {
double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
double unionArea = this.getArea() + other.getArea() - intersectionArea;
return (float) (intersectionArea / unionArea);
}
public Rectangle merge(Rectangle other) {
this.setRect(this.createUnion(other));
return this;
}
public float getTop() {
return (float) this.getMinY();
}
public void setTop(float top) {
float deltaHeight = top - this.y;
this.setRect(this.x, top, this.width, this.height - deltaHeight);
}
public float getRight() {
return (float) this.getMaxX();
}
public void setRight(float right) {
this.setRect(this.x, this.y, right - this.x, this.height);
}
public float getLeft() {
return (float) this.getMinX();
}
public void setLeft(float left) {
float deltaWidth = left - this.x;
this.setRect(left, this.y, this.width - deltaWidth, this.height);
}
public float getBottom() {
return (float) this.getMaxY();
}
public void setBottom(float bottom) {
this.setRect(this.x, this.y, this.width, bottom - this.y);
}
public Point2D[] getPoints() {
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
sb.append(s.substring(0, s.length() - 1));
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
return sb.toString();
}
}

View File

@ -1,25 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
/**
 * An image with its position rectangle, image type, transparency flag and page number.
 * Accessors come from Lombok's {@code @Data}; the constructor comes from
 * {@code @RequiredArgsConstructor}.
 */
@Data
@RequiredArgsConstructor
public class ClassifiedImage {
@NonNull
private Rectangle2D position;
@NonNull
private ImageType imageType;
// Not annotated, so presumably not part of the generated constructor and false by default — confirm.
private boolean isAppendedToSection;
// NOTE(review): @NonNull on a primitive cannot guard against null; presumably it is used here only
// to make Lombok include the field in the required-args constructor — confirm.
@NonNull
private boolean hasTransparency;
@NonNull
private int page;
}

View File

@ -1,15 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.util.List;
import lombok.Builder;
import lombok.Data;
/**
 * Pair of ruling lists, one for horizontal and one for vertical rulings. Accessors come from
 * Lombok's {@code @Data}; instances are created through the {@code @Builder}.
 */
@Data
@Builder
public class CleanRulings {
// Fields have no explicit access modifier in the source (package-private unless Lombok config changes it).
List<Ruling> horizontal;
List<Ruling> vertical;
}

View File

@ -1,437 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import lombok.extern.slf4j.Slf4j;
/**
 * Line segment used as a table ruling, with helpers for horizontal/vertical rulings and for
 * finding intersections between them.
 * <p>
 * "Top"/"left" are the coordinates of the first endpoint (y1/x1) and "bottom"/"right" those of the
 * second endpoint (y2/x2); no min/max normalisation is applied by the accessors.
 * <p>
 * Inside this class the simple name {@code Float} refers to {@code Line2D.Float}, which is why
 * {@code java.lang.Double} is written fully qualified.
 */
@Slf4j
@SuppressWarnings("all")
public class Ruling extends Line2D.Float {

    private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;


    public Ruling(Point2D p1, Point2D p2) {

        super(p1, p2);
    }


    /**
     * Returns the rulings that intersect {@code area}, each clipped to it.
     */
    public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {

        ArrayList<Ruling> rv = new ArrayList<>();
        for (Ruling r : rulings) {
            if (r.intersects(area)) {
                rv.add(r.intersect(area));
            }
        }
        return rv;
    }


    // log(n) implementation of find_intersections
    // based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf

    /**
     * Sweeps over the rulings by X position and collects the intersection points between
     * horizontal and vertical rulings.
     *
     * @return map, ordered by Y then X, from intersection point to the expanded
     *         {horizontal, vertical} ruling pair
     */
    public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {

        // Sweep event: a horizontal ruling's (expanded) left or right end, or a vertical ruling.
        class SortObject {

            protected SOType type;
            protected float position;
            protected Ruling ruling;


            public SortObject(SOType type, float position, Ruling ruling) {

                this.type = type;
                this.position = position;
                this.ruling = ruling;
            }

        }

        List<SortObject> sos = new ArrayList<>();

        // Horizontal rulings currently crossed by the sweep line, keyed by their top coordinate.
        // Rulings with the same top compare as equal keys in this map.
        TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
            @Override
            public int compare(Ruling o1, Ruling o2) {

                return java.lang.Double.compare(o1.getTop(), o2.getTop());
            }
        });

        TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
            @Override
            public int compare(Point2D o1, Point2D o2) {

                if (o1.getY() > o2.getY()) {
                    return 1;
                }
                if (o1.getY() < o2.getY()) {
                    return -1;
                }
                if (o1.getX() > o2.getX()) {
                    return 1;
                }
                if (o1.getX() < o2.getX()) {
                    return -1;
                }
                return 0;
            }
        });

        for (Ruling h : horizontals) {
            sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
            sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_PIXEL_EXPAND_AMOUNT, h));
        }

        for (Ruling v : verticals) {
            sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
        }

        // At (nearly) equal positions: HLEFT before VERTICAL before HRIGHT, so a vertical ruling
        // still sees horizontals that start or end at its position.
        Collections.sort(sos, new Comparator<SortObject>() {
            @Override
            public int compare(SortObject a, SortObject b) {

                int rv;
                if (DoubleComparisons.feq(a.position, b.position)) {
                    if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
                        rv = 1;
                    } else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
                        rv = -1;
                    } else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
                        rv = -1;
                    } else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
                        rv = 1;
                    } else {
                        rv = java.lang.Double.compare(a.position, b.position);
                    }
                } else {
                    return java.lang.Double.compare(a.position, b.position);
                }
                return rv;
            }
        });

        for (SortObject so : sos) {
            switch (so.type) {
                case VERTICAL:
                    for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
                        try {
                            Point2D i = h.getKey().intersectionPoint(so.ruling);
                            if (i == null) {
                                continue;
                            }
                            rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT)});
                        } catch (UnsupportedOperationException e) {
                            log.info("Some line are oblique, ignoring...");
                            continue;
                        }
                    }
                    break;
                case HRIGHT:
                    tree.remove(so.ruling);
                    break;
                case HLEFT:
                    tree.put(so.ruling, true);
                    break;
            }
        }

        return rv;

    }


    /** True for a non-degenerate ruling whose X coordinates are equal per {@code DoubleComparisons.feq}. */
    public boolean vertical() {

        return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }


    /** True for a non-degenerate ruling whose Y coordinates are equal per {@code DoubleComparisons.feq}. */
    public boolean horizontal() {

        return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
    }

    // attributes that make sense only for non-oblique lines
    // these are used to have a single collapse method (in page, currently)


    /** True when the ruling is neither vertical nor horizontal (this includes zero-length rulings). */
    public boolean oblique() {

        return !(this.vertical() || this.horizontal());
    }


    /**
     * @return the left coordinate of a vertical ruling, the top coordinate of a horizontal one
     * @throws UnsupportedOperationException if the ruling is oblique
     */
    public float getPosition() {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getLeft() : this.getTop();
    }


    /**
     * @return the top coordinate of a vertical ruling, the left coordinate of a horizontal one
     * @throws UnsupportedOperationException if the ruling is oblique
     */
    public float getStart() {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getTop() : this.getLeft();
    }


    /**
     * @throws UnsupportedOperationException if the ruling is oblique
     */
    public void setStart(float v) {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(v);
        } else {
            this.setLeft(v);
        }
    }


    /**
     * @return the bottom coordinate of a vertical ruling, the right coordinate of a horizontal one
     * @throws UnsupportedOperationException if the ruling is oblique
     */
    public float getEnd() {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        return this.vertical() ? this.getBottom() : this.getRight();
    }


    /**
     * @throws UnsupportedOperationException if the ruling is oblique
     */
    public void setEnd(float v) {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setBottom(v);
        } else {
            this.setRight(v);
        }
    }


    /**
     * @throws UnsupportedOperationException if the ruling is oblique
     */
    public void setStartEnd(float start, float end) {

        if (this.oblique()) {
            throw new UnsupportedOperationException();
        }
        if (this.vertical()) {
            this.setTop(start);
            this.setBottom(end);
        } else {
            this.setLeft(start);
            this.setRight(end);
        }
    }


    /**
     * Compares {@code this.vertical()} with {@code other.horizontal()}.
     * NOTE(review): two rulings for which both calls return false also compare as
     * "perpendicular" here; confirm callers only pass non-oblique rulings.
     */
    public boolean perpendicularTo(Ruling other) {

        return this.vertical() == other.horizontal();
    }


    /**
     * True when the rulings intersect, or intersect after expansion: a perpendicular pair expands
     * only this ruling by the fixed perpendicular amount, otherwise both are expanded by
     * {@code colinearOrParallelExpandAmount}.
     */
    public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {

        if (this.intersectsLine(another)) {
            return true;
        }

        boolean rv = false;

        if (this.perpendicularTo(another)) {
            rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
        } else {
            rv = this.expand(colinearOrParallelExpandAmount).intersectsLine(another.expand(colinearOrParallelExpandAmount));
        }

        return rv;
    }


    /** @return Euclidean distance between the two endpoints */
    public double length() {

        return Math.sqrt(Math.pow(this.x1 - this.x2, 2) + Math.pow(this.y1 - this.y2, 2));
    }


    /**
     * Clips a copy of this ruling to {@code clip}.
     *
     * @return a new ruling for the clipped segment, or this instance when the clipper reports no result
     */
    public Ruling intersect(Rectangle2D clip) {

        Float clipee = (Float) this.clone();
        boolean clipped = new CohenSutherlandClipping(clip).clip(clipee);

        if (clipped) {
            return new Ruling(clipee.getP1(), clipee.getP2());
        } else {
            return this;
        }
    }


    /**
     * Returns a copy whose start is moved back and whose end is moved forward by {@code amount}.
     * For an oblique ruling the copy is returned unchanged and a warning is logged.
     */
    public Ruling expand(float amount) {

        Ruling r = (Ruling) this.clone();
        try {
            r.setStart(this.getStart() - amount);
            r.setEnd(this.getEnd() + amount);
        } catch (UnsupportedOperationException e) {
            log.warn("Could not expand ruling!");
        }
        return r;
    }


    /**
     * Intersection of a horizontal and a vertical ruling, both expanded by the perpendicular amount.
     *
     * @return the point (vertical's left, horizontal's top), or {@code null} when the expanded
     *         rulings do not intersect or are not a horizontal/vertical pair
     */
    public Point2D intersectionPoint(Ruling other) {

        Ruling this_l = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling other_l = other.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT);
        Ruling horizontal, vertical;

        if (!this_l.intersectsLine(other_l)) {
            return null;
        }

        if (this_l.horizontal() && other_l.vertical()) {
            horizontal = this_l;
            vertical = other_l;
        } else if (this_l.vertical() && other_l.horizontal()) {
            vertical = this_l;
            horizontal = other_l;
        } else {
            log.warn("lines must be orthogonal, vertical and horizontal");
            return null;
        }
        return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
    }


    /** Two rulings are equal when both endpoints are equal. */
    @Override
    public boolean equals(Object other) {

        if (this == other) {
            return true;
        }

        if (!(other instanceof Ruling)) {
            return false;
        }

        Ruling o = (Ruling) other;
        return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
    }


    /**
     * Hash derived from the same two endpoints that {@link #equals(Object)} compares, so equal
     * rulings hash alike. Delegating to the inherited implementation did not guarantee that.
     * The value changes when the ruling is mutated, so do not mutate a ruling while it is stored
     * in a hash-based collection.
     */
    @Override
    public int hashCode() {

        return 31 * this.getP1().hashCode() + this.getP2().hashCode();
    }


    public float getTop() {

        return this.y1;
    }


    public void setTop(float v) {

        setLine(this.getLeft(), v, this.getRight(), this.getBottom());
    }


    public float getLeft() {

        return this.x1;
    }


    public void setLeft(float v) {

        setLine(v, this.getTop(), this.getRight(), this.getBottom());
    }


    public float getBottom() {

        return this.y2;
    }


    public void setBottom(float v) {

        setLine(this.getLeft(), this.getTop(), this.getRight(), v);
    }


    public float getRight() {

        return this.x2;
    }


    public void setRight(float v) {

        setLine(this.getLeft(), this.getTop(), v, this.getBottom());
    }


    /** @return right minus left (x2 - x1); negative when the endpoints are ordered right-to-left */
    public float getWidth() {

        return this.getRight() - this.getLeft();
    }


    /** @return bottom minus top (y2 - y1); negative when the endpoints are ordered bottom-to-top */
    public float getHeight() {

        return this.getBottom() - this.getTop();
    }


    /** @return direction from the first to the second endpoint in degrees, normalised to be non-negative */
    public double getAngle() {

        double angle = Math.toDegrees(Math.atan2(this.getP2().getY() - this.getP1().getY(), this.getP2().getX() - this.getP1().getX()));

        if (angle < 0) {
            angle += 360;
        }
        return angle;
    }


    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        Formatter formatter = new Formatter(sb);
        String rv = formatter.format("%s[minX=%f minY=%f maxX=%f maxY=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
        formatter.close();
        return rv;
    }


    // Event types of the sweep in findIntersections.
    private enum SOType {
        VERTICAL,
        HRIGHT,
        HLEFT
    }

}

View File

@ -1,350 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
/**
 * A table made of {@link TableCell}s arranged in a regular grid.
 *
 * <p>The grid is derived once from the cells handed to the constructor (see
 * {@link #calculateStructure(List)}) and stored un-rotated. {@link #getRows()} lazily produces the
 * rows in reading order, taking a rotation of 90 or 270 into account, and marks header cells.
 */
@Slf4j
public class Table extends AbstractTextContainer {

    /** Grid cells keyed by their un-rotated (row, column) position. */
    private final TreeMap<TableCellPosition, TableCell> cells = new TreeMap<>();

    /** Rotation used when rows are computed; only 90 and 270 change the traversal order. */
    private final int rotation;

    @Getter
    @Setter
    private String headline;

    private int unrotatedRowCount;
    private int unrotatedColCount;

    /** Cached row count, {@code -1} until first requested. */
    private int rowCount = -1;

    /** Cached column count, {@code -1} until first requested. */
    private int colCount = -1;

    /** Lazily computed rows, see {@link #getRows()}. */
    private List<List<TableCell>> rows;


    /**
     * Creates a table from the detected cells and the surrounding area.
     *
     * @param cells    the detected cells; the list itself is not modified
     * @param area     supplies the bounds of this container
     * @param rotation the rotation to apply when rows are computed
     */
    public Table(List<TableCell> cells, Rectangle area, int rotation) {

        addCells(cells);
        minX = area.getLeft();
        minY = area.getBottom();
        maxX = area.getRight();
        maxY = area.getTop();
        classification = "Table";
        this.rotation = rotation;
    }


    /**
     * Returns the rows of this table, computing and caching them on first access.
     * Rows consisting of a single cell without text blocks are dropped, then header cells are detected.
     *
     * @return the cached, mutable list of rows
     */
    public List<List<TableCell>> getRows() {

        if (rows == null) {
            rows = computeRows();
            // Ignore rows that consist of one single cell without any text.
            rows.removeIf(row -> row.size() == 1 && row.get(0).getTextBlocks().isEmpty());
            computeHeaders();
        }
        return rows;
    }


    /**
     * @return the number of rows returned by {@link #getRows()}, cached after the first call
     */
    public int getRowCount() {

        if (rowCount == -1) {
            rowCount = getRows().size();
        }
        return rowCount;
    }


    /**
     * @return the size of the longest row, or 0 if there are no rows; cached after the first call
     */
    public int getColCount() {

        if (colCount == -1) {
            colCount = getRows().stream().mapToInt(List::size).max().orElse(0);
        }
        return colCount;
    }


    /**
     * Marks header cells and links every cell to its header cells.
     *
     * <p>Cells are visited row by row, left to right. Rows with exactly one cell are skipped.
     * For each cell, the last cell of the unbroken run of header cells starting at the left end of
     * its row and the last cell of the unbroken run of header cells starting at the top of its
     * column are added to its header cells. Afterwards the cell itself is flagged as header if the
     * most popular word style of its first text block is {@code "bold"}.
     */
    private void computeHeaders() {

        if (rows == null) {
            rows = computeRows();
        }
        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
            List<TableCell> rowCells = rows.get(rowIndex);
            if (rowCells.size() == 1) {
                continue;
            }
            for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
                TableCell cell = rowCells.get(colIndex);

                TableCell leftHeader = lastLeadingHeaderCell(rowCells.subList(0, colIndex));
                if (leftHeader != null) {
                    cell.getHeaderCells().add(leftHeader);
                }

                List<TableCell> cellsAbove = new ArrayList<>();
                for (int i = 0; i < rowIndex; i++) {
                    List<TableCell> upperRow = rows.get(i);
                    if (colIndex < upperRow.size()) {
                        cellsAbove.add(upperRow.get(colIndex));
                    } else {
                        // Rows may have different lengths; a missing cell is simply skipped.
                        log.debug("No cell {} in row {}, ignoring.", colIndex, i);
                    }
                }
                // Tracked separately from the left header so that the left header is not added twice
                // when the column has no header cell.
                TableCell topHeader = lastLeadingHeaderCell(cellsAbove);
                if (topHeader != null) {
                    cell.getHeaderCells().add(topHeader);
                }

                // Constant-first comparison tolerates text blocks without a computed style.
                if (!cell.getTextBlocks().isEmpty() && "bold".equals(cell.getTextBlocks().get(0).getMostPopularWordStyle())) {
                    cell.setHeaderCell(true);
                }
            }
        }
    }


    /**
     * Returns the last cell of the unbroken run of header cells at the start of the given list.
     *
     * @param candidates cells ordered from the table border towards the cell being inspected
     * @return the last header cell of the leading run, or {@code null} if the first cell is no header
     */
    private static TableCell lastLeadingHeaderCell(List<TableCell> candidates) {

        TableCell lastHeader = null;
        for (TableCell candidate : candidates) {
            if (!candidate.isHeaderCell()) {
                break;
            }
            lastHeader = candidate;
        }
        return lastHeader;
    }


    /**
     * Builds the rows from the un-rotated grid.
     * For rotation 90 and 270 the grid columns become rows (traversed in opposite directions);
     * any other rotation keeps the grid as is. Missing grid positions are skipped.
     *
     * @return a new list of rows
     */
    private List<List<TableCell>> computeRows() {

        List<List<TableCell>> result = new ArrayList<>();
        if (rotation == 90) {
            for (int col = 0; col < unrotatedColCount; col++) {
                List<TableCell> rotatedRow = new ArrayList<>();
                for (int row = unrotatedRowCount - 1; row >= 0; row--) {
                    addCellIfPresent(rotatedRow, row, col);
                }
                result.add(rotatedRow);
            }
        } else if (rotation == 270) {
            for (int col = unrotatedColCount - 1; col >= 0; col--) {
                List<TableCell> rotatedRow = new ArrayList<>();
                for (int row = 0; row < unrotatedRowCount; row++) {
                    addCellIfPresent(rotatedRow, row, col);
                }
                result.add(rotatedRow);
            }
        } else {
            for (int row = 0; row < unrotatedRowCount; row++) {
                List<TableCell> plainRow = new ArrayList<>();
                for (int col = 0; col < unrotatedColCount; col++) {
                    addCellIfPresent(plainRow, row, col);
                }
                result.add(plainRow);
            }
        }
        return result;
    }


    /**
     * Appends the grid cell at the given un-rotated position to {@code target} if such a cell exists.
     */
    private void addCellIfPresent(List<TableCell> target, int row, int col) {

        TableCell cell = cells.get(new TableCellPosition(row, col));
        if (cell != null) {
            target.add(cell);
        }
    }


    /**
     * Stores a cell at the given un-rotated position and grows the grid dimensions if needed.
     */
    private void add(TableCell chunk, int row, int col) {

        unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
        unrotatedColCount = Math.max(unrotatedColCount, col + 1);
        cells.put(new TableCellPosition(row, col), chunk);
    }


    /**
     * Derives the grid from the given cells and stores it.
     * Cells narrower or lower than 1.1 are ignored. The filtering works on a copy, so the list
     * passed by the caller is left untouched and may be unmodifiable.
     *
     * @param cells the detected cells
     */
    private void addCells(List<TableCell> cells) {

        if (cells.isEmpty()) {
            return;
        }

        List<TableCell> usableCells = cells.stream()
                .filter(cell -> !(cell.getWidth() < 1.1 || cell.getHeight() < 1.1))
                .collect(Collectors.toList());

        List<List<TableCell>> rowsOfCells = calculateStructure(usableCells);
        for (int i = 0; i < rowsOfCells.size(); i++) {
            List<TableCell> rowOfCells = rowsOfCells.get(i);
            for (int j = 0; j < rowOfCells.size(); j++) {
                add(rowOfCells.get(j), i, j);
            }
        }
    }


    /**
     * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
     *
     * <p>All distinct x and y borders of the relevant cells (cells with text, or cells higher and
     * wider than 3) span a grid. For every grid field a new cell is created that receives the text
     * blocks of the first given cell that intersects it with an overlap ratio above 0.1.
     * The resulting rows are returned in reverse order of ascending y.
     *
     * @param cells The found cells
     * @return Table Structure
     */
    private List<List<TableCell>> calculateStructure(List<TableCell> cells) {

        List<List<TableCell>> matrix = new ArrayList<>();
        if (cells.isEmpty()) {
            return matrix;
        }

        Set<Float> xBorders = new HashSet<>();
        Set<Float> yBorders = new HashSet<>();
        for (TableCell candidate : cells) {
            boolean relevant = !candidate.getTextBlocks().isEmpty() || (candidate.getHeight() > 3 && candidate.getWidth() > 3);
            if (relevant) {
                xBorders.add(candidate.getLeft());
                xBorders.add(candidate.getRight());
                yBorders.add(candidate.getBottom());
                yBorders.add(candidate.getTop());
            }
        }
        List<Float> xs = xBorders.stream().sorted().collect(Collectors.toList());
        List<Float> ys = yBorders.stream().sorted().collect(Collectors.toList());

        for (int yi = 1; yi < ys.size(); yi++) {
            List<TableCell> row = new ArrayList<>();
            for (int xi = 1; xi < xs.size(); xi++) {
                TableCell gridCell = new TableCell(new Point2D.Float(xs.get(xi - 1), ys.get(yi - 1)), new Point2D.Float(xs.get(xi), ys.get(yi)));
                cells.stream()
                        .filter(c -> gridCell.intersects(c) && gridCell.overlapRatio(c) > 0.1f)
                        .findFirst()
                        .ifPresent(match -> gridCell.getTextBlocks().addAll(match.getTextBlocks()));
                row.add(gridCell);
            }
            if (!xs.isEmpty()) {
                matrix.add(row);
            }
        }
        Collections.reverse(matrix);
        return matrix;
    }


    /**
     * Renders the table as comma separated text: one line per row, cells separated by {@code ,},
     * every text block wrapped in double quotes and multiple text blocks of a cell separated by a
     * line break. Double quotes inside a text block are escaped with a backslash.
     *
     * @return the textual representation of all rows
     */
    @Override
    public String getText() {

        StringBuilder sb = new StringBuilder();
        boolean firstRow = true;
        for (List<TableCell> row : getRows()) {
            if (!firstRow) {
                sb.append("\n");
            }
            firstRow = false;

            boolean firstColumn = true;
            for (TableCell column : row) {
                if (!firstColumn) {
                    sb.append(",");
                }
                firstColumn = false;

                if (column != null && column.getTextBlocks() != null) {
                    boolean firstBlock = true;
                    for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
                        if (!firstBlock) {
                            sb.append("\n");
                        }
                        firstBlock = false;
                        // Literal replace: with replaceAll the backslash in the replacement is itself an
                        // escape character, which turned the previous call into a no-op.
                        sb.append('\"').append(textBlock.getText().replace("\"", "\\\"")).append('\"');
                    }
                }
            }
        }
        return sb.toString();
    }


    /**
     * Renders the table as an HTML {@code <table>}. Cells of the first row are written as
     * {@code <th>}, all others as {@code <td>}. Text is HTML-escaped; line breaks inside a text
     * block and boundaries between text blocks become {@code <br />}.
     *
     * @return the HTML markup of this table
     */
    public String getTextAsHtml() {

        StringBuilder sb = new StringBuilder();
        sb.append("<table border=\"1\">");
        boolean headerRow = true;
        for (List<TableCell> row : getRows()) {
            sb.append("\n<tr>");
            for (TableCell column : row) {
                sb.append(headerRow ? "\n<th>" : "\n<td>");
                if (column != null && column.getTextBlocks() != null) {
                    boolean firstBlock = true;
                    for (ClassificationTextBlock textBlock : column.getTextBlocks()) {
                        if (!firstBlock) {
                            sb.append("<br />");
                        }
                        firstBlock = false;
                        sb.append(escapeHtml(textBlock.getText()).replace("\n", "<br />"));
                    }
                }
                sb.append(headerRow ? "</th>" : "</td>");
            }
            sb.append("</tr>");
            headerRow = false;
        }
        sb.append("</table>");
        return sb.toString();
    }


    /**
     * Escapes the characters that would otherwise be interpreted as HTML markup.
     */
    private static String escapeHtml(String text) {

        // The ampersand has to be replaced first, otherwise the entities added below would be escaped again.
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;");
    }

}

View File

@ -1,38 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
/**
 * A rectangular table cell together with the text blocks it contains and the header cells
 * that apply to it.
 */
@SuppressWarnings("serial")
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class TableCell extends Rectangle {

    /** Text blocks located inside this cell. */
    private List<ClassificationTextBlock> textBlocks = new ArrayList<>();

    /** Header cells that were associated with this cell. */
    private List<TableCell> headerCells = new ArrayList<>();

    /** Whether this cell itself is a header cell. */
    private boolean isHeaderCell;


    /**
     * Creates a cell spanning the two given corner points.
     * The first point supplies the origin passed to the super constructor, the size is the
     * coordinate difference between the second and the first point.
     *
     * @param topLeft     the first corner
     * @param bottomRight the opposite corner
     */
    public TableCell(Point2D topLeft, Point2D bottomRight) {

        super((float) topLeft.getY(),
              (float) topLeft.getX(),
              (float) (bottomRight.getX() - topLeft.getX()),
              (float) (bottomRight.getY() - topLeft.getY()));
    }


    /**
     * Appends a text block to this cell.
     *
     * @param textBlock the text block to add
     */
    public void addTextBlock(ClassificationTextBlock textBlock) {

        this.textBlocks.add(textBlock);
    }

}

View File

@ -1,286 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
/**
 * A block of text made of {@link TextPositionSequence}s together with its bounding box and the
 * dominant font characteristics of its words.
 *
 * <p>The bounding box ({@code minX}, {@code maxX}, {@code minY}, {@code maxY}) is inherited from
 * {@link AbstractTextContainer}; the {@code getPdf*} methods convert it depending on the text
 * direction of the first sequence.
 */
@EqualsAndHashCode(callSuper = true)
@AllArgsConstructor
@Builder
@Data
@NoArgsConstructor
public class ClassificationTextBlock extends AbstractTextContainer {

    @Builder.Default
    private List<TextPositionSequence> sequences = new ArrayList<>();

    private int rotation;

    private int indexOnPage;

    private String mostPopularWordFont;

    private String mostPopularWordStyle;

    private float mostPopularWordFontSize;

    private float mostPopularWordHeight;

    private float mostPopularWordSpaceWidth;

    private float highestFontSize;

    // NOTE(review): Table assigns an inherited field of the same name, so this declaration
    // presumably hides AbstractTextContainer.classification - confirm that this is intended.
    private String classification;


    /**
     * @return the text direction of the first sequence
     * @throws IndexOutOfBoundsException if this block has no sequences
     */
    public TextDirection getDir() {

        return sequences.get(0).getDir();
    }


    /** Page height as reported by the first sequence. */
    private float getPageHeight() {

        return sequences.get(0).getPageHeight();
    }


    /** Page width as reported by the first sequence. */
    private float getPageWidth() {

        return sequences.get(0).getPageWidth();
    }


    /**
     * Returns the minX value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the minX value in pdf coordinate system
     */
    public float getPdfMinX() {

        var degrees = getDir().getDegrees();
        if (degrees == 90) {
            return minY;
        }
        if (degrees == 180) {
            return getPageWidth() - maxX;
        }
        if (degrees == 270) {
            return getPageWidth() - maxY;
        }
        return minX;
    }


    /**
     * Returns the maxX value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the maxX value in pdf coordinate system
     */
    public float getPdfMaxX() {

        var degrees = getDir().getDegrees();
        if (degrees == 90) {
            return maxY;
        }
        if (degrees == 180) {
            return getPageWidth() - minX;
        }
        if (degrees == 270) {
            return getPageWidth() - minY;
        }
        return maxX;
    }


    /**
     * Returns the minY value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the minY value in pdf coordinate system
     */
    public float getPdfMinY() {

        var degrees = getDir().getDegrees();
        if (degrees == 90) {
            return minX;
        }
        if (degrees == 180) {
            // NOTE(review): for 180 degrees this returns maxY while getPdfMaxY returns minY, so the
            // "min" value is the larger one - looks swapped, confirm against the callers.
            return maxY;
        }
        if (degrees == 270) {
            return getPageHeight() - maxX;
        }
        return getPageHeight() - maxY;
    }


    /**
     * Returns the maxY value in pdf coordinate system.
     * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
     * 0 -> LowerLeft
     * 90 -> UpperLeft
     * 180 -> UpperRight
     * 270 -> LowerRight
     *
     * @return the maxY value in pdf coordinate system
     */
    public float getPdfMaxY() {

        var degrees = getDir().getDegrees();
        if (degrees == 90) {
            return maxX;
        }
        if (degrees == 180) {
            // NOTE(review): see getPdfMinY - the 180 degree branches of both methods look swapped.
            return minY;
        }
        if (degrees == 270) {
            return getPageHeight() - minX;
        }
        return getPageHeight() - minY;
    }


    /**
     * Creates a text block with the given bounds and sequences. The font related fields keep
     * their default values. The given list is stored as is, not copied.
     */
    public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation, int indexOnPage) {

        super();
        this.indexOnPage = indexOnPage;
        super.minX = minX;
        super.maxX = maxX;
        super.minY = minY;
        super.maxY = maxY;
        this.sequences = sequences;
        this.rotation = rotation;
    }


    /**
     * Returns a copy of this block whose bounds additionally enclose the given sequence.
     *
     * @param r the sequence to enclose
     * @return a new text block; this block is not modified
     */
    public ClassificationTextBlock union(TextPositionSequence r) {

        ClassificationTextBlock union = this.copy();
        union.add(r);
        return union;
    }


    /**
     * Returns a copy of this block that additionally covers the given block (bounds and sequences).
     *
     * @param r the block to merge
     * @return a new text block; this block is not modified
     */
    public ClassificationTextBlock union(ClassificationTextBlock r) {

        ClassificationTextBlock union = this.copy();
        union.add(r);
        return union;
    }


    /**
     * Grows the bounds of this block to enclose the given block and appends its sequences.
     *
     * @param r the block to merge into this one
     */
    public void add(ClassificationTextBlock r) {

        float otherMinX = r.getMinX();
        if (otherMinX < minX) {
            minX = otherMinX;
        }
        float otherMaxX = r.getMaxX();
        if (otherMaxX > maxX) {
            maxX = otherMaxX;
        }
        float otherMinY = r.getMinY();
        if (otherMinY < minY) {
            minY = otherMinY;
        }
        float otherMaxY = r.getMaxY();
        if (otherMaxY > maxY) {
            maxY = otherMaxY;
        }
        sequences.addAll(r.getSequences());
    }


    /**
     * Grows the bounds of this block to enclose the given sequence.
     * NOTE(review): unlike {@link #add(ClassificationTextBlock)} the sequence itself is not appended
     * to {@code sequences} - confirm that callers add it separately where needed.
     *
     * @param r the sequence to enclose
     */
    public void add(TextPositionSequence r) {

        float otherMinX = r.getMinXDirAdj();
        if (otherMinX < minX) {
            minX = otherMinX;
        }
        float otherMaxX = r.getMaxXDirAdj();
        if (otherMaxX > maxX) {
            maxX = otherMaxX;
        }
        float otherMinY = r.getMinYDirAdj();
        if (otherMinY < minY) {
            minY = otherMinY;
        }
        float otherMaxY = r.getMaxYDirAdj();
        if (otherMaxY > maxY) {
            maxY = otherMaxY;
        }
    }


    /**
     * Creates a new block with the same bounds, rotation and page index and an independent list
     * holding the same sequences. The font related fields and the classification are not copied.
     *
     * @return the copy
     */
    public ClassificationTextBlock copy() {

        // The list is copied so that adding to the copy (e.g. in union) does not change this block.
        List<TextPositionSequence> sequencesCopy = sequences == null ? null : new ArrayList<>(sequences);
        return new ClassificationTextBlock(minX, maxX, minY, maxY, sequencesCopy, rotation, indexOnPage);
    }


    /**
     * Sets the bounds from an origin and a size.
     */
    public void resize(float x1, float y1, float width, float height) {

        set(x1, y1, x1 + width, y1 + height);
    }


    /**
     * Sets the bounds from two arbitrary corner points; the coordinates are ordered internally.
     */
    public void set(float x1, float y1, float x2, float y2) {

        this.minX = Math.min(x1, x2);
        this.maxX = Math.max(x1, x2);
        this.minY = Math.min(y1, y2);
        this.maxY = Math.max(y1, y2);
    }


    /**
     * Concatenates all sequences, inserting a blank between two sequences if neither the text
     * written so far ends with a blank nor the next sequence starts with one.
     * Empty sequences are tolerated and contribute nothing.
     */
    @Override
    public String toString() {

        StringBuilder builder = new StringBuilder();
        for (TextPositionSequence sequence : sequences) {
            String sequenceAsString = sequence.toString();
            // Fix for missing Whitespace. This is recognized in getSequences method. See PDFTextStripper Line 1730.
            boolean textBefore = builder.length() > 0;
            if (textBefore && !sequenceAsString.isEmpty() && builder.charAt(builder.length() - 1) != ' ' && sequenceAsString.charAt(0) != ' ') {
                builder.append(' ');
            }
            builder.append(sequenceAsString);
        }
        return builder.toString();
    }


    /**
     * Joins the sequences word by word. Two consecutive words are separated by a line break if
     * their maxY values differ by more than the text height of the second word, otherwise by a
     * blank. The result is passed through {@code TextNormalizationUtilities.removeHyphenLineBreaks}.
     */
    @Override
    public String getText() {

        StringBuilder sb = new StringBuilder();
        TextPositionSequence previous = null;
        for (TextPositionSequence word : sequences) {
            if (previous != null) {
                boolean onNewLine = Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight();
                sb.append(onNewLine ? '\n' : ' ');
            }
            sb.append(word.toString());
            previous = word;
        }
        return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString());
    }

}

View File

@ -1,106 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
/**
 * Lightweight copy of the data of a PDFBox {@link TextPosition}.
 * The direction adjusted x, y, width and height are kept in the {@code position} array
 * at the indices 0 to 3.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@CompiledJson
public class RedTextPosition {

    private String textMatrix;

    /** {@code [xDirAdj, yDirAdj, widthDirAdj, heightDir]} */
    private float[] position;

    @JsonIgnore
    private int rotation;

    @JsonIgnore
    private float pageHeight;

    @JsonIgnore
    private float pageWidth;

    private String unicode;

    @JsonIgnore
    private float dir;

    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private float widthOfSpace;

    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private float fontSizeInPt;

    // not used in reanalysis
    @JsonIgnore
    @JsonAttribute(ignore = true)
    private String fontName;


    /**
     * Creates a {@link RedTextPosition} from a PDFBox text position.
     * Bean properties are copied via {@code BeanUtils.copyProperties}; font name, font size,
     * text matrix and the position array are then set explicitly.
     *
     * @param textPosition the PDFBox text position to convert
     * @return the converted text position
     */
    @SneakyThrows
    public static RedTextPosition fromTextPosition(TextPosition textPosition) {

        RedTextPosition converted = new RedTextPosition();
        BeanUtils.copyProperties(textPosition, converted);
        converted.setFontName(textPosition.getFont().getName());
        converted.setFontSizeInPt(textPosition.getFontSizeInPt());
        converted.setTextMatrix(textPosition.getTextMatrix().toString());
        converted.setPosition(new float[]{ //
                textPosition.getXDirAdj(), //
                textPosition.getYDirAdj(), //
                textPosition.getWidthDirAdj(), //
                textPosition.getHeightDir()});
        return converted;
    }


    /** @return the direction adjusted x value, {@code position[0]} */
    @JsonIgnore
    public float getXDirAdj() {

        return position[0];
    }


    /** @return the direction adjusted y value, {@code position[1]} */
    @JsonIgnore
    public float getYDirAdj() {

        return position[1];
    }


    /** @return the direction adjusted width, {@code position[2]} */
    @JsonIgnore
    public float getWidthDirAdj() {

        return position[2];
    }


    /** @return the height, {@code position[3]} */
    @JsonIgnore
    public float getHeightDir() {

        return position[3];
    }

}

View File

@ -1,8 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
/**
 * Orientation of a text block.
 * NOTE(review): no usage is visible in this file; the per-constant meanings below are inferred
 * from the names only and should be confirmed against the code that uses them.
 */
public enum TextBlockOrientation {
// presumably: no particular orientation
NONE,
// presumably: oriented / aligned to the left
LEFT,
// presumably: oriented / aligned to the right
RIGHT
}

View File

@ -1,298 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TextPositionSequence implements CharSequence {
public static final int HEIGHT_PADDING = 2;
private int page;
private List<RedTextPosition> textPositions = new ArrayList<>();
private TextDirection dir;
private int rotation;
private float pageHeight;
private float pageWidth;
public TextPositionSequence(List<TextPosition> textPositions, int page) {
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
this.page = page;
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
}
@Override
public int length() {
return textPositions.size();
}
@Override
public char charAt(int index) {
RedTextPosition textPosition = textPositionAt(index);
String text = textPosition.getUnicode();
return text.charAt(0);
}
@Override
public TextPositionSequence subSequence(int start, int end) {
var textPositionSequence = new TextPositionSequence();
textPositionSequence.textPositions = textPositions.subList(start, end);
textPositionSequence.page = page;
textPositionSequence.dir = dir;
textPositionSequence.rotation = rotation;
textPositionSequence.pageHeight = pageHeight;
textPositionSequence.pageWidth = pageWidth;
return textPositionSequence;
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder(length());
for (int i = 0; i < length(); i++) {
builder.append(charAt(i));
}
return builder.toString();
}
public RedTextPosition textPositionAt(int index) {
return textPositions.get(index);
}
public void add(TextPositionSequence textPositionSequence, RedTextPosition textPosition) {
this.textPositions.add(textPosition);
this.page = textPositionSequence.getPage();
this.dir = textPositionSequence.getDir();
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
}
public void add(TextPosition textPosition) {
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minX value
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMinXDirAdj() {
return textPositions.get(0).getXDirAdj();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxX value
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMaxXDirAdj() {
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMinYDirAdj() {
return textPositions.get(0).getYDirAdj() - getTextHeight();
}
/**
* This value is adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This method ignores the page rotation but takes the text rotation and adjusts the coordinates to awt.
*
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
*/
@JsonIgnore
@JsonAttribute(ignore = true)
public float getMaxYDirAdj() {
return textPositions.get(0).getYDirAdj();
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getTextHeight() {
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getHeight() {
return getMaxYDirAdj() - getMinYDirAdj();
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getWidth() {
return getMaxXDirAdj() - getMinXDirAdj();
}
@JsonIgnore
@JsonAttribute(ignore = true)
public String getFont() {
return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
}
@JsonIgnore
@JsonAttribute(ignore = true)
public String getFontStyle() {
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
if (lowercaseFontName.contains("bold") && lowercaseFontName.contains("italic")) {
return "bold, italic";
} else if (lowercaseFontName.contains("bold")) {
return "bold";
} else if (lowercaseFontName.contains("italic")) {
return "italic";
} else {
return "standard";
}
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getFontSize() {
return textPositions.get(0).getFontSizeInPt();
}
@JsonIgnore
@JsonAttribute(ignore = true)
public float getSpaceWidth() {
return textPositions.get(0).getWidthOfSpace();
}
/**
* This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
*
* @return bounding box of the word in Pdf Coordinate System
*/
@JsonIgnore
@JsonAttribute(ignore = true)
@SneakyThrows
public Rectangle getRectangle() {
log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
float textHeight = getTextHeight();
RedTextPosition firstTextPos = textPositions.get(0);
RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
AffineTransform transform = new AffineTransform();
if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
transform.translate(0f, pageHeight + textHeight);
transform.scale(1., -1.);
} else if (dir == TextDirection.QUARTER_CIRCLE) {
transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
transform.translate(0f, pageWidth + textHeight);
transform.scale(1., -1.);
} else {
transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
transform.translate(0f, pageWidth + textHeight);
transform.scale(1., -1.);
}
bottomLeft = transform.transform(bottomLeft, null);
topRight = transform.transform(topRight, null);
return new Rectangle( //
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
(float) (topRight.getX() - bottomLeft.getX()),
(float) (topRight.getY() - bottomLeft.getY()),
page);
}
}

View File

@ -1,82 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;
/**
 * Area based text stripper that additionally collects the written text as
 * {@link TextPositionSequence}s, split into word-like chunks.
 */
public class PDFAreaTextStripper extends PDFTextStripperByArea {

    @Getter
    private List<TextPositionSequence> textPositionSequences = new ArrayList<>();

    /** Page number that is assigned to every collected sequence. */
    @Setter
    private int pageNumber;


    public PDFAreaTextStripper() throws IOException {

    }


    /**
     * Splits the given text positions into sequences and collects them, then delegates to
     * {@code super.writeString(text)}.
     *
     * <p>A new sequence is started when a glyph lies left of its predecessor, when an unrotated
     * glyph starts more than 1 unit right of the end of its predecessor, or at a blank /
     * non-breaking space that is not the last glyph. A leading and a trailing blank are dropped.
     */
    @Override
    public void writeString(String text, List<TextPosition> textPositions) throws IOException {

        int wordStart = 0;
        for (int i = 0; i <= textPositions.size() - 1; i++) {
            TextPosition current = textPositions.get(i);

            if (i == 0) {
                // The first glyph can only be skipped, all split rules need a predecessor.
                if (isSpaceGlyph(current)) {
                    wordStart++;
                }
                continue;
            }
            TextPosition previous = textPositions.get(i - 1);

            // Strange but sometimes this is happening, for example: Metolachlor2.pdf
            if (current.getX() < previous.getX()) {
                collectIfMeaningful(textPositions.subList(wordStart, i));
                wordStart = i;
            }

            if (current.getRotation() == 0 && current.getX() > previous.getEndX() + 1) {
                collectIfMeaningful(textPositions.subList(wordStart, i));
                wordStart = i;
            }

            if (isSpaceGlyph(current) && i <= textPositions.size() - 2) {
                collectIfMeaningful(textPositions.subList(wordStart, i));
                wordStart = i + 1;
            }
        }

        List<TextPosition> remainder = textPositions.subList(wordStart, textPositions.size());
        if (!remainder.isEmpty() && isSpaceGlyph(remainder.get(remainder.size() - 1))) {
            remainder = remainder.subList(0, remainder.size() - 1);
        }
        collectIfMeaningful(remainder);

        super.writeString(text);
    }


    /**
     * Discards all sequences collected so far.
     */
    public void clearPositions() {

        textPositionSequences = new ArrayList<>();
    }


    /**
     * Tells whether the glyph is a blank or a non-breaking space.
     */
    private static boolean isSpaceGlyph(TextPosition glyph) {

        return glyph.getUnicode().equals(" ") || glyph.getUnicode().equals("\u00A0");
    }


    /**
     * Collects the glyphs as a new sequence unless they are empty or a single space glyph.
     */
    private void collectIfMeaningful(List<TextPosition> glyphs) {

        boolean meaningless = glyphs.isEmpty() || glyphs.size() == 1 && isSpaceGlyph(glyphs.get(0));
        if (!meaningless) {
            textPositionSequences.add(new TextPositionSequence(glyphs, pageNumber));
        }
    }

}

Some files were not shown because too many files have changed in this diff Show More