Compare commits

...

214 Commits

Author SHA1 Message Date
Kilian Schüttler
5fca39728b Merge branch 'RED-10365' into 'master'
RED-10365: update pdftron logic commons to avoid crash for specific file

See merge request fforesight/ocr-service!59
2024-11-06 09:25:26 +01:00
Kilian Schuettler
cd6390fde1 RED-10365: update pdftron logic commons to avoid crash for specific file 2024-11-06 09:09:50 +01:00
Kilian Schüttler
bc459ee966 Merge branch 'RED-9864' into 'master'
RED-9864: sped up invisible element removal, fixed crash

See merge request fforesight/ocr-service!58
2024-08-26 15:27:07 +02:00
Kilian Schuettler
47e7f8b297 RED-9864: sped up invisible element removal, fixed crash 2024-08-26 15:23:11 +02:00
Kilian Schüttler
22392e083d Merge branch 'RED-9746' into 'master'
RED-9746: update pdftron-logic-commons version

See merge request fforesight/ocr-service!57
2024-08-20 09:43:58 +02:00
Kilian Schuettler
52a1fb4a05 RED-9746: update pdftron-logic-commons version
* fix build
2024-08-19 13:41:46 +02:00
Kilian Schüttler
378436cb2f Merge branch 'RED-8800' into 'master'
RED-8800: adjust coords to cropbox

See merge request fforesight/ocr-service!55
2024-07-15 17:51:06 +02:00
Kilian Schuettler
f1204acc60 RED-8800: adjust coords to cropbox 2024-07-15 17:46:50 +02:00
Andrei Isvoran
998755c3e3 Merge branch 'RED-9496' into 'master'
RED-9496 - Implement graceful shutdown

See merge request fforesight/ocr-service!54
2024-07-04 12:35:01 +02:00
Andrei Isvoran
c598f62633 RED-9496 - Implement graceful shutdown 2024-07-04 12:17:12 +03:00
Corina Olariu
2e25ee2155 Merge branch 'RED-8701-deletefile' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!53
2024-05-17 09:56:29 +02:00
Corina Olariu
7f04fb3c6f RED-8701 - Move files to customer data repositories
- remove one customer file (single page)
2024-05-17 10:48:10 +03:00
Andrei Isvoran
ff32f016eb Merge branch 'RED-9157-tracing' into 'master'
RED-9157 - Update tracing

See merge request fforesight/ocr-service!52
2024-05-15 09:59:00 +02:00
Andrei Isvoran
821ef265fe RED-9157 - Update tracing 2024-05-15 10:40:31 +03:00
Kilian Schüttler
7fcb6652ef Merge branch 'RED-7669' into 'master'
RED-7669: improve ocr

See merge request fforesight/ocr-service!51
2024-05-13 15:03:06 +02:00
Kilian Schuettler
61b1010e24 RED-7669: improve ocr
* fix pmd
2024-05-13 12:59:40 +02:00
Kilian Schuettler
7b5a175440 RED-7669: improve ocr
* fix pmd
2024-05-13 11:35:57 +02:00
Kilian Schuettler
18ba1daaef RED-7669: improve ocr
* decrease otsu-scorefract slightly for thin lines
* don't write text that is overlapped with existing text
2024-05-08 10:55:38 +02:00
Kilian Schuettler
c61f71871e RED-7669: improve ocr
* decrease otsu-scorefract slightly for thin lines
* don't write text that is overlapped with existing text
2024-05-08 10:54:25 +02:00
Timo Bejan
cc2937d0d2 Merge branch 'RED-8701-upgrade2' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!50
2024-04-29 08:56:26 +02:00
Corina Olariu
71255d9fc9 RED-8701 - Move files to customer data repositories
- update springBootStarterVersion and org.springframework.cloud:spring-cloud-starter-openfeign
2024-04-26 15:01:48 +03:00
Kilian Schüttler
1f9dac17e3 Merge branch 'RED-8701-upgrade' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!49
2024-04-25 12:56:28 +02:00
Corina Olariu
5712292698 RED-8701 - Move files to customer data repositories
- update fagiani_apt buildpack
2024-04-25 13:37:59 +03:00
Kevin Tumma
1395318e18 Merge branch 'RED-8701-spring-version' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!48
2024-04-25 11:11:53 +02:00
Corina Olariu
842b794153 RED-8701 - Move files to customer data repositories
- update "org.springframework.boot" version to 3.2.3
2024-04-25 12:00:00 +03:00
Yannik Hampe
4b3ccc28e2 Merge branch 'RED-8701' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!47
2024-04-25 09:06:53 +02:00
Corina Olariu
b469ea4174 RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 14:56:57 +03:00
Corina Olariu
253bb70519 RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 10:44:48 +03:00
Corina Olariu
d55f245c5e RED-8701 - Move files to customer data repositories
- update unit tests with the new path to submodules for customer files
- remove customer files from project
2024-04-22 14:06:56 +03:00
Corina Olariu
7ed1632c6f RED-8701 - Move files to customer data repositories
- use git lfs to store customer files
2024-04-18 20:58:35 +03:00
Kilian Schüttler
6be5dcf305 Merge branch 'RED-8800' into 'master'
RED-8800: fix text location for weird mediaboxes

See merge request fforesight/ocr-service!46
2024-04-04 18:10:02 +02:00
Kilian Schuettler
7f0fb149a9 RED-8800: fix text location for weird mediaboxes 2024-04-04 17:03:37 +02:00
Ali Oezyetimoglu
6cab77c5c1 Merge branch 'RED-5966-memory-optimization' into 'master'
RED-5966 - Excessive RAM usage at report download due to watermarks

See merge request fforesight/ocr-service!45
2024-03-18 16:15:40 +01:00
Corina Olariu
1e3dc3df24 RED-5966 - Excessive RAM usage at report download due to watermarks
- update pdftron-logic-commons version for memory optimization for removal of invisible elements
2024-03-15 10:33:54 +02:00
Timo Bejan
f54f526f44 Merge branch 'clari-30' into 'master'
Clari-30 ocr service compatibility

See merge request fforesight/ocr-service!44
2024-03-08 13:48:38 +01:00
Timo Bejan
d8011bdba5 Clari-30 ocr service compatibility 2024-03-08 14:44:48 +02:00
Timo Bejan
ea11013132 Merge branch 'clari-30' into 'master'
Clari 30

See merge request fforesight/ocr-service!43
2024-03-06 17:11:39 +01:00
Timo Bejan
6d69b783f1 wrong conditional 2024-03-06 18:09:16 +02:00
Timo Bejan
5b3261d229 wrong conditional 2024-03-06 18:02:04 +02:00
Timo Bejan
b5a78a4396 Merge branch 'clari-30' into 'master'
orchestrator/persistence service should control queues

See merge request fforesight/ocr-service!42
2024-03-06 15:53:55 +01:00
Timo Bejan
3b320bfb00 orchestrator/persistence service should control queues 2024-03-06 16:50:41 +02:00
Timo Bejan
021b18ada3 Merge branch 'clari-30' into 'master'
CLARI-30 - reworked ocr service to use queues for request/response, moved DLQ...

See merge request fforesight/ocr-service!41
2024-03-05 14:11:15 +01:00
Timo Bejan
23bc84bd98 redelivery protection 2024-03-05 10:37:25 +02:00
Timo Bejan
2e37b8eec9 CLARI-30 - reworked ocr service to use queues for request/response, moved DLQ listener to consumer of this service. Removed rest API calls 2024-03-04 11:42:30 +02:00
Kilian Schüttler
c4c20d15ae Merge branch 'RED-8156' into 'master'
RED-8156: add debug layers to viewer document

See merge request fforesight/ocr-service!40
2024-02-08 11:43:15 +01:00
Kilian Schuettler
d2f2def1c2 RED-8156: add ocr debug layers to viewer document
* fix pmd
* disable tests again
2024-02-07 11:36:42 +01:00
Kilian Schuettler
2bbc3775c5 RED-8156: add ocr debug layers to viewer document 2024-02-07 11:31:40 +01:00
Kilian Schuettler
2aaa53f441 RED-8156: add debug layers to viewer document
* wip, fonts need to be created in the original document
2024-02-05 18:28:19 +01:00
Dominique Eifländer
724bb58969 Merge branch 'RED-8171' into 'master'
RED-8171: Traces do not stop at @Async

See merge request fforesight/ocr-service!39
2024-02-02 13:57:50 +01:00
Dominique Eifländer
1e08405082 RED-8171: Traces do not stop at @Async 2024-02-02 13:53:31 +01:00
Timo Bejan
150f2153c0 Merge branch 'RED-8085' into 'master'
Red 8085

See merge request fforesight/ocr-service!38
2024-01-30 07:34:09 +01:00
Timo Bejan
ca9e22b190 Red 8085 2024-01-30 07:34:09 +01:00
Timo Bejan
0c6ce2d77b Merge branch 'RED-8085' into 'master'
PMD fix for ocr service RED-8085

See merge request fforesight/ocr-service!37
2024-01-30 07:17:37 +01:00
Timo Bejan
b48db538fd PMD fix for ocr service RED-8085 2024-01-30 07:17:37 +01:00
Timo Bejan
14cc9941ff Merge branch 'RED-8085' into 'master'
RED-8085  pmd checkup/cleanup

See merge request fforesight/ocr-service!36
2024-01-29 10:31:26 +01:00
Timo Bejan
fdb3f3476b RED-8085 pmd checkup/cleanup 2024-01-29 00:19:42 +08:00
Kilian Schüttler
75bd2142ec Merge branch 'RED-8212' into 'master'
RED-8212: Pageborders from scanned documents are used for tables

See merge request fforesight/ocr-service!35
2024-01-24 13:40:17 +01:00
Kilian Schüttler
9010ee8691 RED-8212: Pageborders from scanned documents are used for tables 2024-01-24 13:40:17 +01:00
Dominique Eifländer
eaa6973a1f Merge branch 'RED-8155' into 'master'
RED-8155: bold-detection in ocr-service

See merge request fforesight/ocr-service!34
2024-01-17 13:54:00 +01:00
Kilian Schüttler
74d5f8d8e0 RED-8155: bold-detection in ocr-service 2024-01-17 13:54:00 +01:00
Kilian Schüttler
fb1fe35bc1 Merge branch 'RED-8155' into 'master'
RED-8155: bold-detection in ocr-service

Closes RED-8155

See merge request redactmanager/ocr-service!33
2024-01-08 13:53:30 +01:00
Kilian Schuettler
912f00aa84 RED-8155: bold-detection in ocr-service
* fix application.yml
2024-01-08 13:49:58 +01:00
Kilian Schüttler
bab16ad9b2 Merge branch 'RED-7669-fontstyle' into 'master'
RED-8155: integrate bold-detection into ocr-service

Closes RED-7669

See merge request redactmanager/ocr-service!31
2024-01-05 16:05:53 +01:00
Kilian Schüttler
be4656189b RED-8155: integrate bold-detection into ocr-service 2024-01-05 16:05:53 +01:00
Dominique Eifländer
8944b57344 Merge branch 'RED-7669' into 'master'
RED-7669: optimize OCR-module performance

Closes RED-7669

See merge request redactmanager/ocr-service!30
2023-12-22 15:14:42 +01:00
Kilian Schuettler
67540950b8 RED-7669: optimize OCR-module performance
* fix thread handling for PDFs without any images
2023-12-22 15:11:29 +01:00
Kilian Schuettler
6f29270e66 RED-7669: optimize OCR-module performance
* fix thread handling for PDFs without any images
2023-12-22 15:04:52 +01:00
Dominique Eifländer
b961c9e324 Merge branch 'RED-1137' into 'master'
RED-1137: Do not observe actuator endpoints

Closes RED-1137

See merge request redactmanager/ocr-service!29
2023-12-20 14:24:49 +01:00
Dominique Eifländer
4b6411161e RED-1137: Do not observe actuator endpoints 2023-12-20 14:17:09 +01:00
Dominique Eifländer
14982eae7c Merge branch 'RED-5223' into 'master'
RED-5223: Use tracing-commons from fforesight

Closes RED-5223

See merge request redactmanager/ocr-service!28
2023-12-13 16:16:29 +01:00
Dominique Eifländer
99fc16130b RED-5223: Use tracing-commons from fforesight 2023-12-13 16:10:10 +01:00
Dominique Eifländer
80d38fb785 Merge branch 'RED-7669' into 'master'
RED-7669: optimize OCR-module performance

Closes RED-7669

See merge request redactmanager/ocr-service!27
2023-12-12 15:27:00 +01:00
Kilian Schüttler
c06974ce69 RED-7669: optimize OCR-module performance 2023-12-12 15:27:00 +01:00
Dominique Eifländer
591c7d7fab Merge branch 'RED-5223' into 'master'
RED-5223: Enabled tracing, upgrade spring, use logstash-logback-encoder for json logs

Closes RED-5223

See merge request redactmanager/ocr-service!26
2023-12-12 12:36:01 +01:00
Dominique Eifländer
0300a087d4 RED-5223: Enabled tracing, upgrade spring, use logstash-logback-encoder for json logs 2023-12-12 11:55:01 +01:00
Andrei Isvoran
98752ff1d1 Merge branch 'RED-7714' into 'master'
RED-7715 - Add log4j config to enable switching between json/line logs

Closes RED-7714

See merge request redactmanager/ocr-service!25
2023-12-06 12:42:33 +01:00
Andrei Isvoran
ae09a59a7c RED-7715 - Add log4j config to enable switching between json/line logs 2023-12-06 11:52:01 +02:00
Kilian Schüttler
65d818200f Merge branch 'RED-7669' into 'master'
RED-7669: optimize OCR-module performance

Closes RED-7669

See merge request redactmanager/ocr-service!24
2023-11-28 12:35:22 +01:00
Kilian Schuettler
6fe95c6940 RED-7669: optimize OCR-module performance
* dont interrupt threads, use boolean flag instead
2023-11-28 10:04:56 +01:00
Kilian Schüttler
202132e14c Merge branch 'RED-7668' into 'master'
RED-7669: optimize OCR-module performance

Closes RED-7668

See merge request redactmanager/ocr-service!23
2023-11-24 10:57:45 +01:00
Kilian Schuettler
0264e28cc2 RED-7669: optimize OCR-module performance
* enable caches
2023-11-24 10:21:55 +01:00
Dominique Eifländer
a50f54676e Merge branch 'RED-7668' into 'master'
RED-7669: optimize OCR-module performance

Closes RED-7668

See merge request redactmanager/ocr-service!22
2023-11-23 16:04:43 +01:00
Kilian Schuettler
1926707ae1 RED-7669: optimize OCR-module performance
* move all critical stuff to its own singleton thread
* make gs process queue any image once the file has been written
2023-11-23 16:00:53 +01:00
Kilian Schuettler
d3190844a3 RED-7669: optimize OCR-module performance
* move all critical stuff to its own singleton thread
* make gs process queue any image once the file has been written
2023-11-23 16:00:31 +01:00
Kilian Schuettler
c7ccbae6ff RED-7669: optimize OCR-module performance
* move all critical stuff to its own singleton thread
* make gs process queue any image once the file has been written
2023-11-23 16:00:31 +01:00
Kilian Schuettler
880bebcafc RED-7669: optimize OCR-module performance
* move all critical stuff to its own singleton thread
* make gs process queue any image once the file has been written
2023-11-23 16:00:31 +01:00
Kilian Schuettler
955ff6281d RED-7669: optimize OCR-module performance
* move all critical stuff to its own singleton thread
* make gs process queue any image once the file has been written
2023-11-23 16:00:31 +01:00
Kilian Schuettler
efd3a1d952 RED-7669: optimize OCR-module performance
* move all non thread safe stuff to separate thread in the middle
2023-11-23 16:00:29 +01:00
Kilian Schuettler
bb5b4a2fd8 RED-7669: optimize OCR-module performance
* binarize images after reading
2023-11-23 16:00:22 +01:00
Kilian Schuettler
6f99664906 RED-7669: optimize OCR-module performance
* try and synchronize all malloc calls
2023-11-23 16:00:19 +01:00
Kilian Schuettler
574f7ac25e RED-7669: optimize OCR-module performance
* moar sigsegv
2023-11-23 16:00:01 +01:00
Kilian Schuettler
12217f2459 RED-7669: optimize OCR-module performance
* moar sigsegv
2023-11-23 16:00:01 +01:00
Kilian Schuettler
19747cbca5 RED-7669: optimize OCR-module performance
* moar sigsegv
2023-11-23 15:59:59 +01:00
Kilian Schuettler
2632d2023d RED-7669: optimize OCR-module performance
* reset test and settings
2023-11-23 15:59:16 +01:00
Kilian Schuettler
4c225c2219 RED-7669: optimize OCR-module performance
* cleanup Code
2023-11-23 15:59:16 +01:00
Kilian Schuettler
3d09f46844 RED-7669: optimize OCR-module performance
* don't despeckle small images
2023-11-23 15:59:16 +01:00
Kilian Schuettler
77355b5367 RED-7669: optimize OCR-module performance
* second attempt at thread safety
2023-11-23 15:59:16 +01:00
Kilian Schuettler
57e194fcd0 RED-7669: optimize OCR-module performance
* attempt at thread safety
2023-11-23 15:59:14 +01:00
Kilian Schüttler
c556687499 Merge branch 'RED-7669' into 'master'
RED-7669: optimize OCR-module performance

Closes RED-7669

See merge request redactmanager/ocr-service!21
2023-11-20 09:55:48 +01:00
Kilian Schüttler
759bae6499 RED-7669: optimize OCR-module performance 2023-11-20 09:55:48 +01:00
Kilian Schüttler
aa45fa84bb Merge branch 'CYB-001' into 'master'
CYB-001: Improve OCR-Module performance

Closes CYB-001

See merge request redactmanager/ocr-service!20
2023-11-14 09:17:46 +01:00
Kilian Schüttler
a82676c36b CYB-001: Improve OCR-Module performance 2023-11-14 09:17:46 +01:00
Kilian Schüttler
948c4bed79 Merge branch 'RED-7686' into 'master'
RED-7686 - Specific hidden text in specific file is not removed

Closes RED-7686

See merge request redactmanager/ocr-service!19
2023-10-16 09:27:34 +02:00
Corina Olariu
6d3ec8a9db RED-7686 - Specific hidden text in specific file is not removed
- upgraded storage-commons, tenant-commons to the newest windows compatible versions
2023-10-13 14:50:31 +03:00
Corina Olariu
6533501ffc RED-7686 - Specific hidden text in specific file is not removed
- update pdftron-logic-commons dependency
2023-10-13 13:45:44 +03:00
Raphael Arnold
607b9be6f5 Merge branch 'RED-7075' into 'master'
RED-7075: Watermark recognition improvement

Closes RED-7075

See merge request redactmanager/ocr-service!18
2023-09-01 12:22:46 +02:00
RaphaelArnold
acba4cb103 RED-7075: Watermark recognition improvement 2023-09-01 12:13:22 +02:00
Andrei Isvoran
bf3fe7f13d Merge branch 'RED-6864' into 'master'
RED-6864 - Fix storage update

Closes RED-6864

See merge request redactmanager/ocr-service!17
2023-08-18 09:22:27 +02:00
Andrei Isvoran
ede443a47a RED-6864 - Fix storage update 2023-08-18 10:19:33 +03:00
Andrei Isvoran
f5f1f70ffd Merge branch 'RED-6864' into 'master'
RED-6864 - Update ocr-service to new storage

Closes RED-6864

See merge request redactmanager/ocr-service!16
2023-08-16 11:43:53 +02:00
Andrei Isvoran
1c62c5ddf4 RED-6864 - Update ocr-service to new storage 2023-08-16 11:32:49 +03:00
deiflaender
506b888424 RED-7080: Fixed NPE in OCGWatermark removal 2023-08-14 16:35:31 +02:00
deiflaender
37ff2b982a RED-7080: Remove all watermarks that are named as watermarks in OCG 2023-08-14 16:11:48 +02:00
deiflaender
06c49cc412 RED-7080: Remove watermarks that are named as watermarks in OCG 2023-08-14 13:26:03 +02:00
deiflaender
262204bcca hotfix: Fixed npe for inline image where getXObject() returns null 2023-08-11 15:16:28 +02:00
Raphael Arnold
04a0925a6c Merge branch 'RED-7075' into 'master'
RED-7075: WIP

Closes RED-7075

See merge request redactmanager/ocr-service!15
2023-08-07 13:33:10 +02:00
RaphaelArnold
57ef7da5b3 RED-7075: WIP 2023-08-07 13:29:17 +02:00
Kilian Schuettler
7e20541d73 hotfix: fix OCRServiceIntegrationTest 2023-08-07 12:39:49 +02:00
Andrei Isvoran
33412589c0 RED-7290 - Update platform-common-dependency version 2023-08-03 18:14:46 +03:00
Andrei Isvoran
0ff07979ee Merge branch 'RED-7080' into 'master'
RED-7080 - Add removeWatermark flag for dossier template

Closes RED-7080

See merge request redactmanager/ocr-service!14
2023-08-01 08:27:00 +02:00
Andrei Isvoran
0ad4682571 RED-7080 - Add removeWatermark flag for dossier template 2023-07-31 15:51:22 +03:00
Timo Bejan
74f9f123f4 Update pom.xml 2023-07-27 08:49:44 +02:00
Ali Oezyetimoglu
7525a54341 Merge branch 'RED-7012' into 'master'
RED-7012: upgraded pdftron-logic-commons version

Closes RED-7012

See merge request redactmanager/ocr-service!13
2023-07-19 15:11:34 +02:00
Ali Oezyetimoglu
7209d47862 RED-7012: upgraded pdftron-logic-commons version 2023-07-19 15:06:22 +02:00
Kilian Schüttler
82db83936d Merge branch 'DM-326' into 'master'
DM-326: extend removeInvisibleElements

Closes DM-326

See merge request redactmanager/ocr-service!10
2023-07-14 12:48:49 +02:00
Kilian Schuettler
856d52951c DM-326: extend removeInvisibleElements 2023-07-14 12:46:24 +02:00
Kilian Schuettler
95af0faecd added watermark removal test 2023-07-13 11:25:16 +02:00
Kilian Schüttler
e1f9d178b0 Merge branch 'DM-305' into 'master'
DM-305: port rules to new schema

Closes DM-305

See merge request redactmanager/ocr-service!8
2023-07-11 17:25:51 +02:00
Kilian Schuettler
098bfcfce3 DM-305: port rules to new schema
* implement remove invisible text, when color equal to background
2023-07-11 17:23:03 +02:00
Dominique Eifländer
a4f6b2c0d2 Merge branch 'DM-307' into 'master'
DM-307: Added non-production-ready code to remove watermarks from SCM Flora prototype files

Closes DM-307

See merge request redactmanager/ocr-service!7
2023-07-03 12:44:20 +02:00
deiflaender
4df80612ab DM-307: Added non-production-ready code to remove watermarks from SCM Flora prototype files 2023-07-03 12:37:42 +02:00
Timo Bejan
e36a3363ef Merge branch 'RED-6686-2' into 'master'
RED-6686 tenant commons update

Closes RED-6686

See merge request redactmanager/ocr-service!6
2023-06-27 22:46:32 +02:00
Timo Bejan
11dc276dfd RED-6686 tenant commons update 2023-06-27 23:44:14 +03:00
Timo Bejan
4fe2b2ae6b Merge branch 'RED-6686' into 'master'
Resolve RED-6686

Closes RED-6686

See merge request redactmanager/ocr-service!4
2023-06-26 23:40:58 +02:00
Timo Bejan
61f6e2ebf6 Resolve RED-6686 2023-06-26 23:40:58 +02:00
Timo Bejan
038cc22f28 Merge branch 'RED-6686' into 'master'
RED-6686 Extract Tenant and user-management code into a separate service.

See merge request redactmanager/ocr-service!1
2023-06-26 21:58:50 +02:00
Timo Bejan
06a504cf2f RED-6686 Extract Tenant and user-management code into a separate service. 2023-06-26 21:58:49 +02:00
Raphael Arnold
eb91574b0a Merge branch 'RED-6965' into 'master'
RED-6965: Null pointer removal

Closes RED-6965

See merge request redactmanager/ocr-service!3
2023-06-21 16:15:17 +02:00
RaphaelArnold
b28cc721c2 RED-6965: Null pointer removal 2023-06-21 16:11:07 +02:00
Dominique Eifländer
7dcc59c750 Merge branch 'RED-6072-timestamp' into 'master'
RED-6072 - As Operation I want to see why files are in an ERROR state

Closes RED-6072

See merge request redactmanager/ocr-service!2
2023-06-15 14:08:39 +02:00
Corina Olariu
b15bcfa372 RED-6072 - As Operation I want to see why files are in an ERROR state
- add timestamp to error info
2023-06-15 14:37:32 +03:00
Christoph Schabert
d2f38d7cf0 Update 2 files
- /.gitlab-ci.yml
- /pom.xml
2023-05-31 15:44:02 +02:00
Christoph Schabert
df59fd83ed Update 6 files
- /bamboo-specs/src/main/java/buildjob/PlanSpec.java
- /bamboo-specs/src/main/resources/scripts/build-java.sh
- /bamboo-specs/src/main/resources/scripts/sonar-java.sh
- /bamboo-specs/src/test/java/buildjob/PlanSpecTest.java
- /bamboo-specs/pom.xml
- /.gitlab-ci.yml
2023-05-31 15:10:37 +02:00
Corina Olariu
e400a04644 Pull request #22: RED-6072
Merge in RED/ocr-service from RED-6072 to master

* commit '961129b917e92b4dda2085215d3f15e205ec35a3':
  RED-6072 - As Operation I want to see why files are in an ERROR state - update the ocr failed with error information - in case of exception the reason is caught and added to the message received from the queue
  RED-6072 - As Operation I want to see why files are in an ERROR state update from iqser to knecon
2023-05-25 14:59:04 +02:00
Dominique Eiflaender
933b4bd3de Pull request #23: RED-5567: Upgraded to latest pdftron version
Merge in RED/ocr-service from RED-5567-exptest to master

* commit 'ee94f1c711c923e865e559499674c1658c87fbb4':
  RED-5567: Upgraded to latest pdftron version
2023-05-25 14:19:18 +02:00
deiflaender
ee94f1c711 RED-5567: Upgraded to latest pdftron version 2023-05-25 14:14:59 +02:00
devplant
961129b917 RED-6072 - As Operation I want to see why files are in an ERROR state
- update the ocr failed with error information
- in case of exception the reason is caught and added to the message received from the queue
2023-05-25 14:29:40 +03:00
devplant
0307da9093 RED-6072 - As Operation I want to see why files are in an ERROR state
update from iqser to knecon
2023-05-25 14:20:14 +03:00
Corina Olariu
aac56cc2ed Pull request #21: RED-5694 - Upgrade spring-boot to 3.0
Merge in RED/ocr-service from RED-5694-storage to master

* commit '828b4f53c837a77b860dc3df20a8e65b1330eaa8':
  RED-5694 - Upgrade spring-boot to 3.0 - add -DknownExploitedEnabled=false - import StorageAutoConfiguration to Application
2023-04-21 14:08:04 +02:00
devplant
828b4f53c8 RED-5694 - Upgrade spring-boot to 3.0
- add -DknownExploitedEnabled=false
- import StorageAutoConfiguration to Application
2023-04-20 16:14:28 +03:00
Corina Olariu
062cf46e30 Pull request #20: RED-5694
Merge in RED/ocr-service from RED-5694 to master

* commit 'f72001c4ac0431c482cc467e948bdc972643d785':
  RED-5694 - Upgrade spring-boot to 3.0 - remove commented code
  RED-5694 - Upgrade spring-boot to 3.0 - remove versions already defined in spring boot
  RED-5694 - Upgrade spring-boot to 3.0 - remove unused import
  RED-5694 - Upgrade spring-boot to 3.0 - update platform-dependency - update other dependencies versions to the latest
2023-04-06 12:42:23 +02:00
devplant
f72001c4ac RED-5694 - Upgrade spring-boot to 3.0
- remove commented code
2023-04-06 13:36:43 +03:00
devplant
81fd35f9c0 RED-5694 - Upgrade spring-boot to 3.0
- remove versions already defined in spring boot
2023-04-06 13:35:42 +03:00
devplant
17b1932926 RED-5694 - Upgrade spring-boot to 3.0
- remove unused import
2023-04-06 11:57:36 +03:00
devplant
c00e9290bc RED-5694 - Upgrade spring-boot to 3.0
- update platform-dependency
- update other dependencies versions to the latest
2023-04-06 11:52:44 +03:00
Thomas Beyer
d9e4f79099 Pull request #19: RED-4875 1
Merge in RED/ocr-service from RED-4875_1 to master

* commit '036203c24a1f5eb1588e945eb020a666cac5dba2':
  RED-4875 - delete commented out classes
  RED-4875 - set version of pdftron-common-logics to newest (release)
  RED-4875 - removed duration log
  RED-4875 - update version of pdftron-commons to newest
  RED-4875 - update version of pdftron logic commons to newest
  RED-4875 - set version of common pdftron logics to newest and move PdfTextExtraction to this new repo
  RED-4875 - update version of pdftron-logic-commons to newest
  RED-4875 - call logic of new repo pdftron-logic-commons instead of local one
2023-03-22 12:07:24 +01:00
Thomas Beyer
036203c24a RED-4875 - delete commented out classes 2023-03-21 18:20:26 +01:00
Thomas Beyer
6ac71f6d94 RED-4875 - set version of pdftron-common-logics to newest (release) 2023-03-21 17:00:41 +01:00
Thomas Beyer
dd12611fdc RED-4875 - removed duration log 2023-03-21 13:18:41 +01:00
Thomas Beyer
b0c4c25bec RED-4875 - update version of pdftron-commons to newest 2023-03-21 12:28:57 +01:00
Thomas Beyer
5efa0e96a8 RED-4875 - update version of pdftron logic commons to newest 2023-03-20 11:25:38 +01:00
Thomas Beyer
fd92419895 RED-4875 - set version of common pdftron logics to newest and move PdfTextExtraction to this new repo 2023-03-20 10:01:33 +01:00
Thomas Beyer
142e8cf957 RED-4875 - update version of pdftron-logic-commons to newest 2023-03-17 17:25:52 +01:00
Thomas Beyer
143538fa40 RED-4875 - call logic of new repo pdftron-logic-commons instead of local one 2023-03-17 10:33:48 +01:00
Christoph Schabert
74a094b42d Change Tagging 2023-03-13 09:41:59 +01:00
Timo Bejan
8ab4092046 Pull request #18: RED-6162 - bumped version
Merge in RED/ocr-service from RED-6162 to master

* commit 'c86cabf70a9ec90687c3f123768f2293da49c4ff':
  RED-6162 - bumped version
  RED-6162 - bumped version
2023-03-10 21:45:38 +01:00
Timo Bejan
c86cabf70a RED-6162 - bumped version 2023-03-10 22:43:35 +02:00
Dominique Eiflaender
936a795797 Pull request #17: RED-4645: Multitenancy for storage
Merge in RED/ocr-service from RED-4645 to master

* commit 'e5df0ec65864667134affad2af2a5118126355ce':
  RED-4645: Multitenancy for storage
2023-03-10 16:42:43 +01:00
deiflaender
e5df0ec658 RED-4645: Multitenancy for storage 2023-03-10 16:31:34 +01:00
Timo Bejan
0d9365d020 RED-6162 - bumped version 2023-03-10 15:59:25 +02:00
Timo Bejan
5003970fe1 Pull request #16: RED-6162 - bumped version
Merge in RED/ocr-service from RED-6162 to master

* commit '9c6fa4384a7ea43dfd36c6fcb6639e5127ab92b9':
  RED-6162 - bumped version
2023-03-10 14:56:05 +01:00
Timo Bejan
9c6fa4384a RED-6162 - bumped version 2023-03-10 15:52:37 +02:00
Timo Bejan
444b823cce Pull request #15: RED-6162 Redaction Gateway - Persistence Service Merge Updates
Merge in RED/ocr-service from RED-6162 to master

* commit '6208ff029252884ced189806acd4118cd21530e1':
  RED-6182 - version bump
  RED-6162 - test issue on bamboo
  RED-6162 - test issue on bamboo
  RED-6162 - persistence update - identity test
  RED-6162 - persistence update - reverse dependency cleanup
2023-03-10 11:27:13 +01:00
Timo Bejan
6208ff0292 RED-6182 - version bump 2023-03-10 08:45:48 +02:00
Timo Bejan
5156c560f2 RED-6162 - test issue on bamboo 2023-03-10 08:45:48 +02:00
Timo Bejan
c2a7e32789 RED-6162 - test issue on bamboo 2023-03-10 08:45:48 +02:00
Timo Bejan
c1cd4d1cab RED-6162 - persistence update - identity test 2023-03-10 08:45:48 +02:00
Timo Bejan
da5a1cc042 RED-6162 - persistence update - reverse dependency cleanup 2023-03-10 08:45:48 +02:00
Kilian Schuettler
d91fb737cb Pull request #14: RED-6321: OCR not working correctly with 3.6.0
Merge in RED/ocr-service from RED-6321 to master

* commit '2d503c74a6fba0b02ac4c92af82a493165e45761':
  RED-6321: OCR not working correctly with 3.6.0 *added end() statements to formWriters to write their changes to the PDF contentStream *moved replaceOriginalPageWithOcrPage outside the main OCR Loop since it caused exponential RAM Usage in some cases
2023-03-06 10:16:02 +01:00
Kilian Schuettler
2d503c74a6 RED-6321: OCR not working correctly with 3.6.0
*added end() statements to formWriters to write their changes to the PDF contentStream
*moved replaceOriginalPageWithOcrPage outside the main OCR Loop since it caused exponential RAM Usage in some cases
2023-03-06 10:08:32 +01:00
Corina Olariu
bdcba7cb6d Pull request #11: RED-4988 Check jacoco version in poms and update to a current compatible version
Merge in RED/ocr-service from RED-4988 to master

* commit '334409e4db326945a1eeb86004d79e0b0faeb99c':
  RED-4988 Check jacoco version in poms and update to a current compatible version - update platform-dependency, platform-commons-dependency
2023-03-02 09:28:11 +01:00
Shamel Hussain
23622d4a85 Pull request #13: RED-5718: Update base image version to allow using random user ids
Merge in RED/ocr-service from shussain/Dockerfile-1677589647499 to master

* commit '4266184456ea4d97fa19b52ff37d5d9fc09dd170':
  RED-5718: Update base image version to allow using random user ids
2023-02-28 14:56:37 +01:00
Shamel Hussain
4266184456 RED-5718: Update base image version to allow using random user ids 2023-02-28 14:07:44 +01:00
Kilian Schuettler
a4ca2db37d Pull request #12: RED-6280: Performance Test Issue with OCR-Service
Merge in RED/ocr-service from RED-6280 to master

* commit '742725834933ad74ad582366b2b62015524bedb3':
  RED-6280:  Performance Test Issue with OCR-Service *removed init/terminate calls again *manual memory cleanup at every opportunity
2023-02-28 10:30:58 +01:00
Kilian Schuettler
7427258349 RED-6280: Performance Test Issue with OCR-Service
*removed init/terminate calls again
*manual memory cleanup at every opportunity
2023-02-28 10:21:21 +01:00
devplant
334409e4db RED-4988 Check jacoco version in poms and update to a current compatible version
- update platform-dependency, platform-commons-dependency
2023-02-27 17:41:15 +02:00
Kilian Schuettler
bc661b7ea4 Pull request #9: RED-6126: In the OCRService, OCR Text is not applied to Document
Merge in RED/ocr-service from RED-6126 to master

* commit 'caff5580dda644451433921a6a66dafe1cfa5dca':
  RED-6126:  In the OCRService, OCR Text is not applied to Document *refactored Tests with inheritance *called PDFNet init/terminate in tests *don't call init on startup
  RED-6126:  In the OCRService, OCR Text is not applied to Document *called PDFNet.initialize and terminate before and after message receive *updated comments *renamed some variables
2023-02-22 13:33:49 +01:00
Kilian Schuettler
caff5580dd RED-6126: In the OCRService, OCR Text is not applied to Document
*refactored Tests with inheritance
*called PDFNet init/terminate in tests
*don't call init on startup
2023-02-22 11:58:07 +01:00
Kilian Schuettler
430ad45a67 RED-6126: In the OCRService, OCR Text is not applied to Document
*called PDFNet.initialize and terminate before and after message receive
*updated comments
*renamed some variables
2023-02-22 10:32:55 +01:00
Kilian Schuettler
a6d99f5916 Pull request #8: RED-6126: In the OCRService, OCR Text is not applied to Document
Merge in RED/ocr-service from RED-6126 to master

* commit '0bc4fea2a52c92efaaaf8cf93c2ae02766168a80':
  RED-6126: In the OCRService, OCR Text is not applied to Document *removed unnecessary getXObject() call, since it fails for inline_images
2023-02-14 09:57:28 +01:00
Kilian Schuettler
0bc4fea2a5 RED-6126: In the OCRService, OCR Text is not applied to Document
*removed unnecessary getXObject() call, since it fails for inline_images
2023-02-13 17:55:02 +01:00
Kilian Schuettler
001719a34c Pull request #7: RED-6126 performance test
Merge in RED/ocr-service from RED-6126-performance-test to master

* commit '37f1e03ebcd5356e0f0b403a5c0cdd20fc133997':
  RED-6126: performance-test *refactor to improve cleanness *closed inputStream
  RED-6126: performance-test *fixed NullPointerException *fixed StackOverFlowError by ignoring very small images and moving to while loop instead of recursion
  RED-6126: performance-test *fixed time calculation
  RED-6126: performance-test *improved error logging
  RED-6126: performance-test *re-enabled overlap detection *re-creating helper document for every page instead of reusing and adding/removing pages
  RED-6126: Performance Tests *moved to streams for pdf file transfer *disabled overlap detection
2023-02-10 15:00:55 +01:00
Kilian Schuettler
37f1e03ebc RED-6126: performance-test
*refactor to improve cleanness
*closed inputStream
2023-02-10 14:49:10 +01:00
Kilian Schuettler
b3fa14b342 RED-6126: performance-test
*fixed NullPointerException
*fixed StackOverFlowError by ignoring very small images and moving to while loop instead of recursion
2023-02-10 12:27:16 +01:00
Kilian Schuettler
7065d098f3 RED-6126: performance-test
*fixed time calculation
2023-02-09 16:31:42 +01:00
Kilian Schuettler
8db0b712f7 RED-6126: performance-test
*improved error logging
2023-02-09 13:57:21 +01:00
Kilian Schuettler
6ccf3f80fc RED-6126: performance-test
*re-enabled overlap detection
*re-creating helper document for every page instead of reusing and adding/removing pages
2023-02-09 11:22:39 +01:00
Kilian Schuettler
e705f869fd RED-6126: Performance Tests
*moved to streams for pdf file transfer
*disabled overlap detection
2023-02-09 11:09:52 +01:00
Timo Bejan
efaa291e43 Pull request #6: RED-4609 - added ocr metric, enabled prometheus, added test for metric
Merge in RED/ocr-service from RED-4609 to master

* commit '7c71d8ad041f839c21ec26023ee8eaef670a4924':
  RED-4609 - added ocr metric, enabled prometheus, added test for metric
2023-02-09 10:57:37 +01:00
Timo Bejan
7c71d8ad04 RED-4609 - added ocr metric, enabled prometheus, added test for metric 2023-02-08 16:46:51 +02:00
Kilian Schuettler
b0a658213d Pull request #5: RED-6126
Merge in RED/ocr-service from RED-6126 to master

* commit '00cfe9e44948c153857ad59442dbc9349e1d4555':
  RED-6126: In the OCRService, OCR Text is not applied to Document *reformatted InvisibleElementRemovalService with new Code Style
  RED-6126: In the OCRService, OCR Text is not applied to Document *updated some comments *very slight refactor
  RED-6126: In the OCRService, OCR Text is not applied to Document *complete refactor of the OCRService *moved image position retrieval to new class instead of image service *added new tests for image rotation
  RED-6126: In the OCRService, OCR Text is not applied to Document *removed private configuration
  RED-6126: In the OCRService, OCR Text is not applied to Document *formatted one line
  RED-6126: In the OCRService, OCR Text is not applied to Document *reverted application of OCR Text to Document to old state *refactored OCR Service slightly *added meaningful test cases
2023-02-07 13:35:32 +01:00
Kilian Schuettler
00cfe9e449 RED-6126: In the OCRService, OCR Text is not applied to Document
*reformatted InvisibleElementRemovalService with new Code Style
2023-02-07 12:52:09 +01:00
Kilian Schuettler
d0d6bf70a4 RED-6126: In the OCRService, OCR Text is not applied to Document
*updated some comments
*very slight refactor
2023-02-07 12:09:04 +01:00
Kilian Schuettler
a415224db5 RED-6126: In the OCRService, OCR Text is not applied to Document
*complete refactor of the OCRService
*moved image position retrieval to new class instead of image service
*added new tests for image rotation
2023-02-07 12:05:24 +01:00
Kilian Schuettler
355887c865 RED-6126: In the OCRService, OCR Text is not applied to Document
*removed private configuration
2023-02-03 13:16:56 +01:00
Kilian Schuettler
ab566a11a9 RED-6126: In the OCRService, OCR Text is not applied to Document
*formatted one line
2023-02-03 13:03:47 +01:00
Kilian Schuettler
edd044395e RED-6126: In the OCRService, OCR Text is not applied to Document
*reverted application of OCR Text to Document to old state
*refactored OCR Service slightly
*added meaningful test cases
2023-02-03 13:01:01 +01:00
Kilian Schuettler
b37ec5afc9 Pull request #4: RED-6019: Remove hidden text when processing OCR
Merge in RED/ocr-service from RED-6019 to master

* commit 'a96260f77fd5b546a5d27d84f34861742f13ddff':
  RED-6019: Remove hidden text when processing OCR *moved InvisibleElementRemovalDto to private inner record of InvisibleElementRemovalService *added comments for color choices
  RED-6019: Remove hidden text when processing OCR *moved to release version of platform-dependencies *restored annotationProcessors
  RED-6019: Remove hidden text when processing OCR *code refactor *upgrade to java 17
  RED-6019: Remove hidden text when processing OCR handled cases:      Text which is transparent or is set to not render      Elements outside of clipping path      Elements that have been painted over by visible and filled Paths unhandled cases:      Elements covered by widely stroked path      Elements same color as background      Any Text set to clipping with its many interactions with other elements
2023-02-02 13:05:03 +01:00
Kilian Schuettler
a96260f77f RED-6019: Remove hidden text when processing OCR
*moved InvisibleElementRemovalDto to private inner record of InvisibleElementRemovalService
*added comments for color choices
2023-02-02 13:01:58 +01:00
Kilian Schuettler
12fbdbee50 RED-6019: Remove hidden text when processing OCR
*moved to release version of platform-dependencies
*restored annotationProcessors
2023-02-02 10:53:19 +01:00
Kilian Schuettler
99a0cb51d0 RED-6019: Remove hidden text when processing OCR
*code refactor
*upgrade to java 17
2023-02-02 10:27:01 +01:00
Kilian Schuettler
fd7ec6e7aa RED-6019: Remove hidden text when processing OCR
handled cases:
     Text which is transparent or is set to not render
     Elements outside of clipping path
     Elements that have been painted over by visible and filled Paths
unhandled cases:
     Elements covered by widely stroked path
     Elements same color as background
     Any Text set to clipping with its many interactions with other elements
2023-01-30 16:13:51 +01:00
Dominique Eiflaender
265fac8099 Pull request #3: RED-5911: Reverted to old ocr logic that uses ContentReplacer/TextExtractor to remove text behind images
Merge in RED/ocr-service from RED-5911 to master

* commit '7a4c5c2f898e83623a66ef29ab9ed696e2057e24':
  RED-5911: Reverted to old ocr logic that uses ContentReplacer/TextExtractor to remove text behind images
2023-01-17 12:42:10 +01:00
deiflaender
7a4c5c2f89 RED-5911: Reverted to old ocr logic that uses ContentReplacer/TextExtractor to remove text behind images 2023-01-17 12:15:34 +01:00
146 changed files with 5141 additions and 1857 deletions

47
.gitignore vendored
View File

@ -9,6 +9,49 @@
**/tmp/
**/.apt_generated/
HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**/target/
!**/src/test/**/target/
### maven build ###
*.class
/out/
/build/
/target/
**/out/
**/build/
**/target/
### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
.gradle
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/
!**/src/main/**/build/
!**/src/test/**/build/
### VS Code ###
.vscode/
.factorypath
.springBeans
@ -26,3 +69,7 @@
**/.DS_Store
**/classpath-data.json
**/dependencies-and-licenses-overview.txt
gradle.properties
gradlew
gradlew.bat
gradle/

25
.gitlab-ci.yml Normal file
View File

@ -0,0 +1,25 @@
variables:
# SONAR_PROJECT_KEY: 'ocr-service:ocr-service-server'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${BUILDVERSION}
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

8
.gitmodules vendored Normal file
View File

@ -0,0 +1,8 @@
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
update = merge
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/basf"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/basf
url = https://gitlab.knecon.com/fforesight/documents/basf.git
update = merge

87
README.md Normal file
View File

@ -0,0 +1,87 @@
# OCR Service
## Overview
The OCR service is a tool designed for extracting text content from PDF files. It utilizes Tesseract, Leptonica, PDFTron, PDFBox, and Ghostscript to perform various tasks, including removing invisible elements and watermarks, extracting images, stitching striped images, binarizing images, running OCR on the processed images, and writing the recognized text back to the original PDF. This service is particularly useful for obtaining machine-readable text from PDF documents.
## Dependencies
[Tesseract](https://github.com/tesseract-ocr/tesseract)
[Leptonica](http://leptonica.org/)
[PDFTron](https://apryse.com/)
[PDFBox](https://pdfbox.apache.org/)
[Ghostscript](https://www.ghostscript.com/)
## Functionality
1. Invisible Element and Watermark Removal
The service uses PDFTron to attempt the removal of invisible elements and watermarks from the PDF.
2. Image Extraction
Extracts all images from the PDF using PDFBox
3. Striped Image Detection and Stitching
Detects if images are striped and stitches them together using Ghostscript.
4. Image Processing
- Convert to grayscale
- Upscale to target DPI
- Filter using Gauss kernel
- Binarizes the resulting images using Leptonica and the Otsu thresholding algorithm.
- Despeckle using various morphological operations
5. OCR Processing
Runs Tesseract on the images to extract text.
6. Font style detection
Detection of bold text using stroke width estimation
7. Text Integration
Draws the resulting text onto the original PDF using PDFBox.
Steps 2.-5. happen in parallel and communicate via a blocking queue to limit RAM usage.
Therefore, choosing your thread counts carefully leads to the best performance.
For example with 18 available cores, I achieved the highest performance with 2 Image extraction threads, 2 ghostscript processes and 16 OCR threads.
Setting all threads to basically unlimited (1000+) leads to comparable performance without laborious thread tuning, but at the cost of (potentially a lot) more RAM.
## Installation
To run the OCR service, ensure that the following dependencies are installed:
1. Ghostscript: Install using apt.
```bash
sudo apt install ghostscript
```
2. Tesseract and Leptonica: Install using [vcpkg](https://github.com/microsoft/vcpkg) with the commands below, then set the environment variable `VCPKG_DYNAMIC_LIB` to your vcpkg lib folder (e.g. ~/vcpkg/installed/x64-linux-dynamic/lib).
```bash
vcpkg install tesseract --triplet x64-linux-dynamic
```
```bash
vcpkg install leptonica --triplet x64-linux-dynamic
```
3. Other dependencies are handled by Gradle build
```bash
gradle build
```
## Configuration
Configuration settings are available in the OcrServiceSettings class.
These settings can be overridden using environment variables. e.g.
`OCR_SERVICE_OCR_THREAD_COUNT=16`
Possible configurations and their defaults include:
```java
int ocrThreadCount = 4; // Number of OCR threads
int imageExtractThreadCount = 4; // Number of image extraction threads
int gsProcessCount = 4; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if > 0
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
boolean debug = false; // If true, overlays OCR images with a grid and draws word bounding boxes
boolean removeWatermark; // If false, watermarks will not be removed
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
```
## Integration
The OCR-service communicates via RabbitMQ and uses the queues `ocr_request_queue`, `ocr_response_queue`,
`ocr_dead_letter_queue`, and `ocr_status_update_response_queue`.
### ocr_request_queue
This queue is used to start the OCR process, a DocumentRequest must be passed as a message. The service will then download the PDF from the provided cloud storage.
### ocr_response_queue
This queue is used to signal the end of processing.
### ocr_dead_letter_queue
This queue is used to signal an error has occurred during processing.
### ocr_status_update_response_queue
This queue is used by the OCR service to give updates about the progress of the ongoing OCR on an image-by-image basis. The total amount may change when fewer images are found than initially assumed.

View File

@ -1,37 +0,0 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>8.1.3</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

View File

@ -1,125 +0,0 @@
package buildjob;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
import java.time.LocalTime;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
import com.atlassian.bamboo.specs.util.BambooServer;
/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 *
 * <p>Defines two plans for the service: the regular build plan ({@link #createPlan()}) and a
 * nightly security analysis plan ({@link #createSecBuild()}), plus the permissions applied to both.
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "ocr-service";

    // Locale.ROOT keeps the key identical on every machine; the default-locale variant would
    // produce a dotted capital I under a Turkish locale. replace() is used because "-" is a
    // literal, not a regular expression.
    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase(java.util.Locale.ROOT).replace("-", "");

    // Inline script run as the first task of every job to wipe the working directory.
    private static final String CLEAN_WORKSPACE_SCRIPT = "#!/bin/bash\n" + "set -e\n" + "rm -rf ./*";


    /**
     * Run main to publish plan on Bamboo
     *
     * @param args unused
     * @throws Exception if publishing to the Bamboo server fails
     */
    public static void main(final String[] args) throws Exception {
        //By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");
        PlanSpec spec = new PlanSpec();

        Plan plan = spec.createPlan();
        bambooServer.publish(plan);
        bambooServer.publish(spec.createPlanPermission(plan.getIdentifier()));

        Plan secPlan = spec.createSecBuild();
        bambooServer.publish(secPlan);
        bambooServer.publish(spec.createPlanPermission(secPlan.getIdentifier()));
    }


    /**
     * Builds the permission set that is applied to the plan with the given identifier.
     *
     * @param planIdentifier identifier of the plan the permissions belong to
     * @return permissions for the build user, the developer groups, logged-in and anonymous users
     */
    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
                .userPermissions("atlbamboo",
                        PermissionType.EDIT,
                        PermissionType.VIEW,
                        PermissionType.ADMIN,
                        PermissionType.CLONE,
                        PermissionType.BUILD)
                .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("devplant", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .loggedInUserPermissions(PermissionType.VIEW)
                .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }


    /** Returns the Bamboo project both plans are created in. */
    private Project project() {
        return new Project().name("RED").key(new BambooKey("RED"));
    }


    /** Returns the task that empties the working directory before checkout. */
    private static ScriptTask cleanWorkspaceTask() {
        return new ScriptTask().description("Clean").inlineBody(CLEAN_WORKSPACE_SCRIPT);
    }


    /**
     * Creates the main build plan: clean, checkout, run build-java.sh, parse test results,
     * and tag the repository with the value the build script wrote to {@code git.tag}.
     *
     * @return the build plan, triggered by Bitbucket Server and built for every VCS branch
     */
    public Plan createPlan() {
        Job buildJob = new Job("Default Job", new BambooKey("JOB1"))
                .tasks(cleanWorkspaceTask(),
                        new VcsCheckoutTask().description("Checkout Default Repository").cleanCheckout(true).checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask().description("Build").location(Location.FILE).fileFromPath("bamboo-specs/src/main/resources/scripts/build-java.sh").argument(SERVICE_NAME),
                        createJUnitParserTask().description("Resultparser")
                                .resultDirectories("**/test-reports/*.xml, **/target/surefire-reports/*.xml, **/target/failsafe-reports/*.xml")
                                .enabled(true),
                        new InjectVariablesTask().description("Inject git Tag").path("git.tag").namespace("g").scope(InjectVariablesScope.LOCAL),
                        new VcsTagTask().description("${bamboo.g.gitTag}").tagName("${bamboo.g.gitTag}").defaultRepository())
                .dockerConfiguration(new DockerConfiguration().image("nexus.iqser.com:5001/infra/maven:3.8.4-openjdk-17-slim")
                        .volume("/etc/maven/settings.xml", "/usr/share/maven/conf/settings.xml")
                        .volume("/var/run/docker.sock", "/var/run/docker.sock"));

        return new Plan(project(), SERVICE_NAME, new BambooKey(SERVICE_KEY)).description("Plan created from (enter repository url of your plan)")
                .variables(new Variable("maven_add_param", ""))
                .stages(new Stage("Default Stage").jobs(buildJob))
                .linkedRepositories("RED / " + SERVICE_NAME)
                .triggers(new BitbucketServerTrigger())
                .planBranchManagement(new PlanBranchManagement().createForVcsBranch()
                        .delete(new BranchCleanup().whenInactiveInRepositoryAfterDays(14))
                        .notificationForCommitters());
    }


    /**
     * Creates the security analysis plan: clean, checkout, run sonar-java.sh.
     *
     * @return the security plan, scheduled once daily at 23:00 and built for branches matching {@code release.*}
     */
    public Plan createSecBuild() {
        // NOTE(review): this job uses a JDK 13 Maven image and mounts settings.xml under
        // /usr/share/maven/ref, while the build plan uses a JDK 17 image and /usr/share/maven/conf.
        // Confirm both differences are intentional.
        Job sonarJob = new Job("Default Job", new BambooKey("JOB1"))
                .tasks(cleanWorkspaceTask(),
                        new VcsCheckoutTask().description("Checkout Default Repository").checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask().description("Sonar").location(Location.FILE).fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-java.sh").argument(SERVICE_NAME))
                .dockerConfiguration(new DockerConfiguration().image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                        .dockerRunArguments("--net=host")
                        .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                        .volume("/var/run/docker.sock", "/var/run/docker.sock"));

        return new Plan(project(), SERVICE_NAME + "-Sec", new BambooKey(SERVICE_KEY + "SEC")).description("Security Analysis Plan")
                .stages(new Stage("Default Stage").jobs(sonarJob))
                .linkedRepositories("RED / " + SERVICE_NAME)
                .triggers(new ScheduledTrigger().scheduleOnceDaily(LocalTime.of(23, 0)))
                .planBranchManagement(new PlanBranchManagement().createForVcsBranchMatching("release.*").notificationForCommitters());
    }

}

View File

@ -1,62 +0,0 @@
#!/bin/bash
# Bamboo build script for a service.
#
# Usage: build-java.sh <service-name>
#
# On master and release branches (or when bamboo_version_tag is not "dev") a new
# version is determined, set in the poms, the service is deployed with Maven and
# the Docker image is packaged and pushed. On any other branch the service is
# only built locally. In every case the tag name is written to ./git.tag as
# "gitTag=<value>".
set -e

SERVICE_NAME=${1:?usage: build-java.sh <service-name>}

if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
    echo "building on master branch"
    # Keep everything before the last dot of each <version> value found in pom.xml.
    branchVersion=$(cat pom.xml | grep -Eo "<version>.*" | sed -s 's|<version>\(.*\)\..*\(-*.*\)</version>|\1|')
    echo "branch version is : $branchVersion"
    # Word splitting of the tag list is intentional: each tag is a separate semver argument.
    latestVersion=$(semver $( git tag -l $branchVersion.* ) | tail -n1)
    echo "latestVersion is : $latestVersion"
    newVersion="$(semver $latestVersion -p -i minor)"
    echo "newVersion is : $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
    # release/<major>.<minor>.x -> <major>.<minor>
    branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
    latestVersion=$(semver $( git tag -l $branchVersion.* ) | tail -n1)
    newVersion="$(semver $latestVersion -p -i patch)"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
    newVersion="${bamboo_version_tag}"
else
    # Development build: build and test only, nothing is deployed.
    # ${bamboo_maven_add_param} is left unquoted so that it may carry several arguments.
    mvn -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
        --no-transfer-progress \
        ${bamboo_maven_add_param} \
        clean install \
        -Djava.security.egd=file:/dev/./urandom
    echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
    exit 0
fi

echo "gitTag=${newVersion}" > git.tag

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
    versions:set \
    -DnewVersion=${newVersion}

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-image-v1/pom.xml" \
    versions:set \
    -DnewVersion=${newVersion}

mvn -f "${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml" \
    --no-transfer-progress \
    clean deploy \
    ${bamboo_maven_add_param} \
    -e \
    -DdeployAtEnd=true \
    -Dmaven.wagon.http.ssl.insecure=true \
    -Dmaven.wagon.http.ssl.allowall=true \
    -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
    -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/red-platform-releases

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-image-v1/pom.xml" \
    package

mvn --no-transfer-progress \
    -f "${bamboo_build_working_directory}/$SERVICE_NAME-image-v1/pom.xml" \
    docker:push

View File

@ -1,44 +0,0 @@
#!/bin/bash
# Bamboo security analysis script for a service.
#
# Usage: sonar-java.sh <service-name>
#
# Builds the service, runs the OWASP dependency check and uploads a Sonar
# analysis. The analysis is reported for the branch unless
# bamboo_repository_pr_key is set, in which case it is reported for that
# pull request.
set -e

SERVICE_NAME=${1:?usage: sonar-java.sh <service-name>}
POM="${bamboo_build_working_directory}/$SERVICE_NAME-v1/pom.xml"

echo "build jar binaries"
mvn -f "$POM" \
    --no-transfer-progress \
    clean install \
    -Djava.security.egd=file:/dev/./urandom

echo "dependency-check:aggregate"
mvn --no-transfer-progress \
    -f "$POM" \
    org.owasp:dependency-check-maven:aggregate

# Arguments that differ between a branch scan and a pull-request scan.
if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    scope_args=(
        "-Dsonar.branch.name=${bamboo_planRepository_1_branch}"
    )
else
    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
    scope_args=(
        "-Dsonar.pullrequest.key=${bamboo_repository_pr_key}"
        "-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch}"
        "-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch}"
    )
fi

mvn --no-transfer-progress \
    -f "$POM" \
    sonar:sonar \
    -Dsonar.projectKey=RED_$SERVICE_NAME \
    -Dsonar.host.url=https://sonarqube.iqser.com \
    -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
    "${scope_args[@]}" \
    -Dsonar.dependencyCheck.jsonReportPath=target/dependency-check-report.json \
    -Dsonar.dependencyCheck.xmlReportPath=target/dependency-check-report.xml \
    -Dsonar.dependencyCheck.htmlReportPath=target/dependency-check-report.html

View File

@ -1,22 +0,0 @@
package buildjob;
import org.junit.Test;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
public class PlanSpecTest {

    /**
     * Builds the entity properties of the build plan and then of the security plan,
     * which validates both specs without contacting a Bamboo server.
     *
     * @throws PropertiesValidationException if either plan definition is invalid
     */
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        EntityPropertiesBuilders.build(new PlanSpec().createPlan());
        EntityPropertiesBuilders.build(new PlanSpec().createSecBuild());
    }

}

15
buildSrc/build.gradle.kts Normal file
View File

@ -0,0 +1,15 @@
/*
* This file was generated by the Gradle 'init' task.
*
* This project uses @Incubating APIs which are subject to change.
*/
plugins {
// Support convention plugins written in Kotlin. Convention plugins are build scripts in 'src/main' that automatically become available as plugins in the main build.
`kotlin-dsl`
}
repositories {
// Use the plugin portal to apply community plugins in convention plugins.
gradlePluginPortal()
}

View File

@ -0,0 +1,70 @@
plugins {
`java-library`
pmd
checkstyle
jacoco
}
group = "com.knecon.fforesight.service"
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
pmd {
isConsoleOutput = true
}
// Assign the rule set on each Pmd task itself. Writing `pmd.ruleSetFiles` here would set the
// shared `pmd` extension instead, so main and test sources could end up checked against
// whichever rule set happened to be assigned last.
tasks.pmdMain {
    ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
tasks.pmdTest {
    ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
tasks.named<Test>("test") {
useJUnitPlatform()
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
minHeapSize = "512m"
maxHeapSize = "8192m"
}
tasks.test {
finalizedBy(tasks.jacocoTestReport) // report is always generated after tests run
}
tasks.jacocoTestReport {
dependsOn(tasks.test) // tests are required to run before generating the report
reports {
xml.required.set(true)
csv.required.set(false)
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
}
}
java {
withJavadocJar()
}
repositories {
mavenLocal()
maven {
url = uri("https://nexus.knecon.com/repository/gindev/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull()
password = providers.gradleProperty("mavenPassword").getOrNull()
}
}
maven {
url = uri("https://nexus.knecon.com/repository/PDFTron/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull()
password = providers.gradleProperty("mavenPassword").getOrNull()
}
}
mavenCentral()
}

View File

@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="JavadocStyle"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

20
config/pmd/pmd.xml Normal file
View File

@ -0,0 +1,20 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

22
config/pmd/test_pmd.xml Normal file
View File

@ -0,0 +1,22 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

1
gradle.properties.kts Normal file
View File

@ -0,0 +1 @@
version = 4.0-SNAPSHOT

View File

@ -1,118 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<groupId>com.iqser.red</groupId>
<artifactId>platform-docker-dependency</artifactId>
<version>1.2.0</version>
<relativePath/>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>ocr-service-image-v1</artifactId>
<groupId>com.iqser.red.service</groupId>
<version>1.0-SNAPSHOT</version>
<packaging>pom</packaging>
<properties>
<service.server>ocr-service-server-v1</service.server>
<platform.jar>${service.server}.jar</platform.jar>
<docker.skip.push>false</docker.skip.push>
<docker.image.name>${docker.image.prefix}/${service.server}</docker.image.name>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>download-platform-jar</id>
<phase>prepare-package</phase>
<goals>
<goal>copy</goal>
</goals>
<configuration>
<artifactItems>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>${service.server}</artifactId>
<version>${version}</version>
<type>jar</type>
<overWrite>true</overWrite>
<destFileName>${platform.jar}</destFileName>
</dependency>
</artifactItems>
<outputDirectory>${docker.build.directory}</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>io.fabric8</groupId>
<artifactId>docker-maven-plugin</artifactId>
<configuration>
<images>
<image>
<name>${docker.image.name}</name>
<build>
<dockerFileDir>${docker.build.directory}</dockerFileDir>
<args>
<PLATFORM_JAR>${platform.jar}</PLATFORM_JAR>
</args>
<tags>
<tag>${docker.image.version}</tag>
<tag>latest</tag>
</tags>
</build>
</image>
</images>
</configuration>
</plugin>
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<executions>
<execution>
<id>copy-resources</id>
<phase>prepare-package</phase>
<goals>
<goal>copy-resources</goal>
</goals>
<configuration>
<outputDirectory>${basedir}/target/build/libs/</outputDirectory>
<resources>
<resource>
<directory>libs</directory>
<filtering>false</filtering>
</resource>
</resources>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -1,18 +0,0 @@
# Image for the OCR service: base image plus the bundled OCR module, Ghostscript and the service jar.
FROM red/base-image:2.0.0
# Unpack the bundled OCR module archive and move the contents of its Lib directory to /OCRModule.
COPY "libs/pdftron/OCRModuleLinux.tar.gz" .
RUN tar xvzf OCRModuleLinux.tar.gz
RUN mkdir /OCRModule
RUN mv Lib/* /OCRModule/
RUN apt-get -y update
# Ghostscript somehow improves OCR quality when using pdftron, do not remove!
RUN apt-get -y install ghostscript
# File name of the service jar: received as a build argument, re-exported as an environment variable,
# and copied into the root of the image.
ARG PLATFORM_JAR
ENV PLATFORM_JAR ${PLATFORM_JAR}
ENV USES_ELASTICSEARCH false
COPY ["${PLATFORM_JAR}", "/"]

View File

@ -1,68 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module ocr-service-api-v1: the API artifact of the OCR service, child of ocr-service-v1. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.iqser.red.service</groupId>
<artifactId>ocr-service-v1</artifactId>
<version>1.0-SNAPSHOT</version>
</parent>
<artifactId>ocr-service-api-v1</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- Versions of the dependencies that are pinned explicitly in this module. -->
<properties>
<persistence-service.version>1.269.0</persistence-service.version>
<redaction-service.version>3.155.0</redaction-service.version>
<dsljson.version>1.9.9</dsljson.version>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/com.dslplatform/dsl-json-java8 -->
<dependency>
<groupId>com.dslplatform</groupId>
<artifactId>dsl-json-java8</artifactId>
<version>${dsljson.version}</version>
</dependency>
<dependency>
<!-- This dependency contains annotations that are used in specifying REST endpoints. -->
<!-- It is optional since not all users of this API might use Feign. -->
<groupId>io.github.openfeign</groupId>
<artifactId>feign-core</artifactId>
<optional>true</optional>
</dependency>
<!-- spring -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-web</artifactId>
</dependency>
<!-- persistence-service-api-v1 and redaction-service-api-v1 are both declared directly with pinned versions; -->
<!-- each declaration excludes the other artifact as a transitive dependency. -->
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-api-v1</artifactId>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
</exclusion>
</exclusions>
<version>${persistence-service.version}</version>
</dependency>
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>redaction-service-api-v1</artifactId>
<exclusions>
<exclusion>
<groupId>com.iqser.red.service</groupId>
<artifactId>persistence-service-api-v1</artifactId>
</exclusion>
</exclusions>
<version>${redaction-service.version}</version>
</dependency>
</dependencies>
</project>

View File

@ -1,16 +0,0 @@
package com.iqser.red.service.ocr.v1.api.model;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
/**
 * Request model that names a document by its dossier id and file id.
 * Accessors, equals/hashCode/toString and the constructors are generated by Lombok.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class DocumentRequest {

    /** Identifier of the dossier. */
    protected String dossierId;

    /** Identifier of the file. */
    protected String fileId;

}

View File

@ -0,0 +1,22 @@
// Plugins for this module: Maven publishing, the shared java-conventions plugin and Lombok support.
plugins {
`maven-publish`
id("com.iqser.red.service.java-conventions")
id("io.freefair.lombok") version "8.4"
}
// Publishes the module's "java" component as a Maven publication to the red-platform-releases repository.
publishing {
    publications {
        create<MavenPublication>(name) {
            from(components["java"])
        }
    }
    repositories {
        maven {
            url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
            credentials {
                // Read from the Gradle properties mavenUser / mavenPassword; null when a property is absent.
                username = providers.gradleProperty("mavenUser").orNull
                password = providers.gradleProperty("mavenPassword").orNull
            }
        }
    }
}

View File

@ -0,0 +1,24 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Request model that names a document by its dossier id and file id and carries a
 * {@code removeWatermark} flag. Lombok generates accessors, a builder, and the no-args and
 * all-args constructors.
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class DocumentRequest {

    /** Identifier of the dossier. */
    protected String dossierId;

    /** Identifier of the file. */
    protected String fileId;

    /** Watermark-removal flag; stays {@code false} unless set explicitly. */
    protected boolean removeWatermark;


    /**
     * Creates a request with only the two identifiers set; {@code removeWatermark} keeps its
     * default value {@code false}.
     *
     * @param dossierId identifier of the dossier
     * @param fileId    identifier of the file
     */
    public DocumentRequest(String dossierId, String fileId) {

        this.fileId = fileId;
        this.dossierId = dossierId;
    }

}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.ocr.v1.api.model;
package com.knecon.fforesight.service.ocr.v1.api.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -6,14 +6,15 @@ import lombok.Data;
import lombok.NoArgsConstructor;
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class OCRStatusUpdateResponse {
private String fileId;
private int numberOfPagesToOCR;
private int numberOfOCRedPages;
private boolean ocrFinished;
private boolean ocrStarted;
}

View File

@ -0,0 +1,31 @@
// Plugins for this module: the shared java-conventions plugin and Lombok support.
plugins {
id("com.iqser.red.service.java-conventions")
id("io.freefair.lombok") version "8.4"
}
// Exclude spring-boot-starter-logging from every configuration of this module.
configurations {
all {
exclude(group = "org.springframework.boot", module = "spring-boot-starter-logging")
}
}
// All production dependencies are declared in the api configuration with pinned versions;
// only JUnit is test-scoped.
dependencies {
api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
api("net.sourceforge.tess4j:tess4j:5.8.0")
api("com.iqser.red.commons:metric-commons:2.1.0")
api("com.iqser.red.commons:storage-commons:2.45.0")
api("com.knecon.fforesight:tenant-commons:0.21.0")
api("com.knecon.fforesight:lifecycle-commons:0.6.0")
api("com.pdftron:PDFNet:10.5.0")
api("org.apache.pdfbox:pdfbox:3.0.0")
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
api("com.github.jai-imageio:jai-imageio-core:1.4.0")
api("com.github.jai-imageio:jai-imageio-jpeg2000:1.4.0")
api("org.apache.commons:commons-math3:3.6.1")
api("io.github.karols:hocr4j:0.2.0")
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
api("com.knecon.fforesight:viewer-doc-processor:0.125.0")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
}

View File

@ -0,0 +1,26 @@
package com.knecon.fforesight.service.ocr.processor;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
/**
 * Spring configuration of the OCR processor module: enables component scanning from this
 * package, binds {@link OcrServiceSettings} and exposes a {@link ViewerDocumentService} bean.
 */
@Configuration
@ComponentScan
@EnableConfigurationProperties(OcrServiceSettings.class)
public class OcrServiceProcessorConfiguration {

    /**
     * Creates the {@link ViewerDocumentService} bean.
     *
     * @param registry observation registry handed to the service's constructor
     * @return a new {@link ViewerDocumentService}
     */
    @Bean
    @Autowired
    public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {

        ViewerDocumentService service = new ViewerDocumentService(registry);
        return service;
    }

}

View File

@ -0,0 +1,49 @@
package com.knecon.fforesight.service.ocr.processor.initializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.pdftron.pdf.PDFNet;
import com.sun.jna.NativeLibrary;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Initializes PDFNet and verifies that the Leptonica and Tesseract native libraries can be loaded
 * as soon as this bean has been constructed.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class NativeLibrariesInitializer {

    /** Name of the environment variable whose value is used as {@code jna.library.path}. */
    private static final String JNA_LIBRARY_PATH_ENV = "VCPKG_DYNAMIC_LIB";

    @Value("${pdftron.license:}")
    private String pdftronLicense;


    /**
     * Initializes PDFNet with the configured license, points JNA at the directory named by the
     * {@code VCPKG_DYNAMIC_LIB} environment variable (if set) and loads Leptonica and Tesseract once.
     * A library that cannot be resolved makes {@code NativeLibrary.getInstance} throw, which aborts startup.
     */
    @SneakyThrows
    @PostConstruct
    // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
    public void init() {

        log.info("Initializing Native Libraries");

        // The license key is a secret: only log whether one is configured, never its value.
        boolean licenseConfigured = pdftronLicense != null && !pdftronLicense.isBlank();
        log.info("Setting pdftron license (configured: {})", licenseConfigured);
        PDFNet.setTempPath("/tmp/pdftron");
        PDFNet.initialize(pdftronLicense);

        String jnaLibraryPath = System.getenv(JNA_LIBRARY_PATH_ENV);
        if (jnaLibraryPath == null) {
            // System.setProperty rejects null values with a bare NullPointerException; keep JNA's current lookup path instead.
            log.warn("Environment variable {} is not set, leaving jna.library.path unchanged", JNA_LIBRARY_PATH_ENV);
        } else {
            log.info("Setting jna.library.path: {}", jnaLibraryPath);
            System.setProperty("jna.library.path", jnaLibraryPath);
        }

        log.info("Asserting Native Libraries loaded");

        // No null checks needed: getInstance throws UnsatisfiedLinkError when the library cannot be loaded.
        try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
            log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
        }

        try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) {
            log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath());
        }
    }

}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColorSpace;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
public record ExtractedImage(
int pageNumber, QuadPoint position, int height, int width, BufferedImage image, Matrix ctm, int numberOnPage, PDColorSpace colorSpace) implements UnprocessedImage {
@SneakyThrows
public Pix asPix() {
BufferedImage image = ImageProcessingUtils.convertToDeviceColorSpace(this);
ImageProcessingUtils.setAlphaChannelToWhite(image);
return LeptUtils.convertImageToPix(image);
}
public QuadPoint getImageCoordinatesInInitialUserSpace() {
return QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 1, 1)).getTransformed(ctm.createAffineTransform());
}
}

View File

@ -0,0 +1,61 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.Graphics;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.ITessAPI;
/**
 * {@link OcrImage} backed by an image that was extracted from a page, carrying the image's own CTM.
 * Field order defines the parameter order of the Lombok-generated constructor.
 */
@Slf4j
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ExtractedOcrImage implements OcrImage {

    int pageNumber;
    int numberOnPage;

    int originalHeight;
    int originalWidth;

    Matrix ctm;

    Pix pix;

    int height;
    int width;

    int rotationDegrees;


    /**
     * Builds the transform from image coordinates to the coordinate system of the CTM by concatenating,
     * in this order: the CTM, a scaling by 1/width and 1/height, the de-rotation and a vertical mirror.
     *
     * @return a new {@link AffineTransform}
     */
    @Override
    public AffineTransform getImageCTM() {

        AffineTransform imageTransform = ctm.createAffineTransform();
        imageTransform.scale((double) 1 / getWidth(), (double) 1 / getHeight());
        imageTransform.concatenate(createDeRotationMatrix());
        // flip the y-axis, image coordinates have their origin at the top
        imageTransform.concatenate(new AffineTransform(1, 0, 0, -1, 0, getHeight()));
        return imageTransform;
    }


    /**
     * Creates the matrix for a rotation by {@code 360 - rotationDegrees}; identity for anything but 90, 180 or 270.
     */
    private AffineTransform createDeRotationMatrix() {

        int deRotationDegrees = 360 - rotationDegrees;
        if (deRotationDegrees == 90) {
            return new AffineTransform(0, 1, -1, 0, getHeight(), 0);
        }
        if (deRotationDegrees == 180) {
            return new AffineTransform(-1, 0, 0, -1, getWidth(), getHeight());
        }
        if (deRotationDegrees == 270) {
            // results from 90 + 180 rotations
            return new AffineTransform(0, -1, 1, 0, getWidth() - getHeight(), getHeight());
        }
        return new AffineTransform();
    }

}

View File

@ -0,0 +1,17 @@
package com.knecon.fforesight.service.ocr.processor.model;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Immutable holder for the metrics used when placing text: descent, font size and height scaling.
 * Field order defines the parameter order of the Lombok-generated constructor.
 */
@Getter
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public final class FontMetrics {

    /**
     * Descent is the part of the text which is below the baseline, e.g. the lower curve of a 'g'.
     * See https://en.wikipedia.org/wiki/Body_height_(typography)
     */
    float descent;

    /** Font size. */
    float fontSize;

    /** Scaling factor applied to the height. */
    float heightScaling;

}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.ocr.processor.model;
/**
 * Pair of a height and a descent value.
 *
 * @param height  the height
 * @param descent the descent
 */
public record HeightAndDescent(float height, float descent) {}

View File

@ -0,0 +1,127 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.utils.PdfDpiCalculator;
import lombok.SneakyThrows;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
/**
 * An image that is handed to OCR, together with the geometry needed to map OCR results back into the PDF.
 */
public interface OcrImage {

    /**
     * Returns the 1-based number of the page the image is located on.
     *
     * @return the page number of this image
     */
    int getPageNumber();


    /**
     * Returns the number of this image on its page. Full page images always return 0.
     *
     * @return the number of this image on the page
     */
    int getNumberOnPage();


    /**
     * Returns the height of the original image (not necessarily in pdf coordinates).
     *
     * @return the image height
     */
    int getHeight();


    /**
     * Returns the width of the original image (not necessarily in pdf coordinates).
     *
     * @return the image width
     */
    int getWidth();


    /**
     * Returns the outer boundary of the image in image coordinates, where (0,0) is the upper left corner.
     * For rotations of 90 or 270 degrees, width and height are swapped.
     *
     * @return the QuadPoint spanning the image
     */
    default QuadPoint getImageBounds() {

        // cannot be solved with a nice rotation matrix. After rotating the text coordinates in the image will always start at (0,0) and will therefore always start at (0,0) in the PDF.
        // So in order to mimic this behavior we need to start with (0,0) coordinates always.
        boolean sideways = getRotationDegrees() == 90 || getRotationDegrees() == 270;
        double extentX = sideways ? getHeight() : getWidth();
        double extentY = sideways ? getWidth() : getHeight();
        return new QuadPoint(new Point2D.Double(0, 0), new Point2D.Double(0, extentY), new Point2D.Double(extentX, extentY), new Point2D.Double(extentX, 0));
    }


    /**
     * Transforms the image bounds with the image's current transformation matrix (CTM).
     *
     * @return the transformed image bounds
     */
    default QuadPoint getImageCoordinatesInInitialUserSpace() {

        QuadPoint bounds = getImageBounds();
        return bounds.getTransformed(getImageCTM());
    }


    /**
     * Returns the rotation of the OCR image in degrees.
     *
     * @return the rotation in degrees
     */
    int getRotationDegrees();


    /**
     * Chooses the Tesseract page segmentation mode: single block when either side is shorter than
     * 200 pixels, automatic segmentation otherwise.
     *
     * @return a {@code TessPageSegMode} constant
     */
    default int getOptimalPageSegmentationMode() {

        boolean smallImage = getWidth() < 200 || getHeight() < 200;
        return smallImage ? ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK : ITessAPI.TessPageSegMode.PSM_AUTO;
    } // TODO: evaluate if PSM can be dynamically chosen to increase performance


    /**
     * Returns the Leptonica {@link Pix} holding the image data.
     *
     * @return the Pix of this image
     */
    Pix getPix();


    /**
     * Calculates the DPI of the image via {@code PdfDpiCalculator} from the image bounds, the CTM and the width.
     *
     * @return the calculated DPI
     */
    default int getDpi() {

        QuadPoint bounds = getImageBounds();
        return PdfDpiCalculator.calculateDpi(bounds, getImageCTM(), getWidth());
    }


    /**
     * Returns the current transformation matrix (CTM), which may be used to transform image
     * coordinates to Initial User Space coordinates.
     *
     * @return the CTM as an AffineTransform
     */
    AffineTransform getImageCTM();


    /**
     * Disposes the {@link Pix} returned by {@link #getPix()}.
     */
    default void destroyPix() {

        Pix pix = getPix();
        LeptUtils.disposePix(pix);
    }

}

View File

@ -0,0 +1,23 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.util.List;
import com.knecon.fforesight.service.ocr.processor.service.HOcrPageParser;
import io.github.karols.hocr4j.Word;
/**
 * Couples an {@link OcrImage} with the path of the Tesseract output produced for it.
 *
 * @param image                   the image that was processed
 * @param tesseractOutputFilePath path that is handed to {@link HOcrPageParser#extractHocrPage}
 */
public record OcrResult(OcrImage image, String tesseractOutputFilePath) {

    /**
     * Factory equivalent to the canonical constructor.
     *
     * @param image           the image that was processed
     * @param tesseractResult path of the Tesseract output
     * @return a new {@link OcrResult}
     */
    public static OcrResult create(OcrImage image, String tesseractResult) {

        OcrResult result = new OcrResult(image, tesseractResult);
        return result;
    }


    /**
     * Parses the Tesseract output on every call and returns all words of the resulting page.
     *
     * @return all words found by {@link HOcrPageParser}
     */
    public List<Word> getAllWords() {

        var page = HOcrPageParser.extractHocrPage(tesseractOutputFilePath);
        return page.getAllWords();
    }

}

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
public record OcrResultToWrite(List<TextPositionInImage> textPositionInImage, QuadPoint imageBoundingBox) {
public static OcrResultToWrite fromFontStyleDetectionModel(FontStyleDetectionModel fontStyleDetectionModel) {
return new OcrResultToWrite(fontStyleDetectionModel.getTextPositionInImages(), fontStyleDetectionModel.getImageBounds());
}
public static Map<Integer, List<OcrResultToWrite>> buildOcrResultsToWrite(List<OcrResult> ocrResults, FontMetricsFactory fontMetricsFactory) {
return ocrResults.stream()
.collect(Collectors.groupingBy(ocrResult -> ocrResult.image().getPageNumber()))
.entrySet()
.stream()
.collect(Collectors.toMap(Map.Entry::getKey,
entry -> entry.getValue()
.stream()
.map(ocrResult -> new OcrResultToWrite(toTextPositionInImage(ocrResult, fontMetricsFactory), ocrResult.image().getImageCoordinatesInInitialUserSpace()))
.toList()));
}
private static List<TextPositionInImage> toTextPositionInImage(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory) {
return ocrResult.getAllWords()
.stream()
.filter(word -> !word.isBlank())
.map(word -> new TextPositionInImage(word, ocrResult.image().getImageCTM(), fontMetricsFactory, FontStyle.REGULAR))
.toList();
}
}

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
/**
 * Geometry of a page: its media box, page number and rotation.
 *
 * @param mediabox        the media box of the page
 * @param number          the page number
 * @param rotationDegrees the rotation of the page in degrees
 */
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {

    /**
     * Reads media box and rotation from a PDFBox page.
     *
     * @param pageNum the number to store for the page
     * @param page    the page to read from
     * @return the page information
     */
    public static PageInformation fromPDPage(int pageNum, PDPage page) {

        PDRectangle box = page.getMediaBox();
        Rectangle2D mediaBoxRectangle = new Rectangle2D.Double(box.getLowerLeftX(), box.getLowerLeftY(), box.getWidth(), box.getHeight());
        return new PageInformation(mediaBoxRectangle, pageNum, page.getRotation());
    }


    /** @return the height of the media box */
    public double height() {

        return mediabox.getHeight();
    }


    /** @return the width of the media box */
    public double width() {

        return mediabox.getWidth();
    }


    /** @return the x coordinate of the media box origin */
    public double minX() {

        return mediabox.getX();
    }


    /** @return the y coordinate of the media box origin */
    public double minY() {

        return mediabox.getY();
    }

}

View File

@ -0,0 +1,117 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.Rectangle;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.List;
import io.github.karols.hocr4j.Bounds;
/**
 * A quadrilateral described by its four corner points.
 *
 * @param a first corner
 * @param b second corner
 * @param c third corner
 * @param d fourth corner
 */
public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {

    /*
    B _____ C
     |     |
    A|_____|D
    */


    /**
     * Creates a QuadPoint from a rectangle: A = (x, y), B = (x, maxY), C = (maxX, maxY), D = (maxX, y).
     *
     * @param rectangle2D the rectangle to convert
     * @return the corners of the rectangle
     */
    public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {

        Point2D cornerA = new Point2D.Double(rectangle2D.getX(), rectangle2D.getY());
        Point2D cornerB = new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY());
        Point2D cornerC = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY());
        Point2D cornerD = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY());
        return new QuadPoint(cornerA, cornerB, cornerC, cornerD);
    }


    /**
     * Creates a QuadPoint from hOCR bounds: A = (left, bottom), B = (left, top), C = (right, top), D = (right, bottom).
     *
     * @param bounds the bounds to convert
     * @return the corners of the bounds
     */
    public static QuadPoint fromBounds(Bounds bounds) {

        Point2D cornerA = new Point2D.Double(bounds.getLeft(), bounds.getBottom());
        Point2D cornerB = new Point2D.Double(bounds.getLeft(), bounds.getTop());
        Point2D cornerC = new Point2D.Double(bounds.getRight(), bounds.getTop());
        Point2D cornerD = new Point2D.Double(bounds.getRight(), bounds.getBottom());
        return new QuadPoint(cornerA, cornerB, cornerC, cornerD);
    }


    /**
     * Computes the axis-aligned rectangle enclosing all four corners.
     *
     * @return the enclosing rectangle
     */
    public Rectangle2D getBounds2D() {

        double left = Math.min(Math.min(a.getX(), b.getX()), Math.min(c.getX(), d.getX()));
        double bottom = Math.min(Math.min(a.getY(), b.getY()), Math.min(c.getY(), d.getY()));
        double right = Math.max(Math.max(a.getX(), b.getX()), Math.max(c.getX(), d.getX()));
        double top = Math.max(Math.max(a.getY(), b.getY()), Math.max(c.getY(), d.getY()));

        return new Rectangle2D.Double(left, bottom, right - left, top - bottom);
    }


    /**
     * Applies the transform to every corner.
     *
     * @param at the transform to apply
     * @return a new QuadPoint holding the transformed corners
     */
    public QuadPoint getTransformed(AffineTransform at) {

        Point2D newA = at.transform(a, null);
        Point2D newB = at.transform(b, null);
        Point2D newC = at.transform(c, null);
        Point2D newD = at.transform(d, null);
        return new QuadPoint(newA, newB, newC, newD);
    }


    /**
     * Determines if the given QuadPoint aligns with this QuadPoint within a given threshold.
     * It does so by trying every possible combination of sides, starting with the most likely
     * combination of ab and cd.
     *
     * @param other     The QuadPoint to compare with.
     * @param threshold The maximum distance allowed for alignment.
     * @return True if any side of this QuadPoint aligns with any side of the other within the threshold, false otherwise.
     */
    public boolean aligns(QuadPoint other, double threshold) {

        List<Line2D> ownSides = List.of(new Line2D.Double(a, b), new Line2D.Double(c, d), new Line2D.Double(b, c), new Line2D.Double(d, a));
        List<Line2D> otherSides = List.of(new Line2D.Double(other.c, other.d),
                                          new Line2D.Double(other.a, other.b),
                                          new Line2D.Double(other.b, other.c),
                                          new Line2D.Double(other.d, other.a));

        for (Line2D ownSide : ownSides) {
            for (Line2D otherSide : otherSides) {
                if (aligns(ownSide, otherSide, threshold)) {
                    return true;
                }
            }
        }
        return false;
    }


    /** Two lines align if their end points align, in either direction. */
    private static boolean aligns(Line2D a, Line2D b, double threshold) {

        return aligns(a.getP1(), a.getP2(), b.getP1(), b.getP2(), threshold);
    }


    /** Checks whether the point pairs lie closer than the threshold, either in the same or in swapped order. */
    private static boolean aligns(Point2D a, Point2D b, Point2D a2, Point2D b2, double threshold) {

        boolean sameOrder = a.distance(a2) < threshold && b.distance(b2) < threshold;
        return sameOrder || (a.distance(b2) < threshold && b.distance(a2) < threshold);
    }


    @Override
    public String toString() {

        return "A:(%.2f, %.2f) | B:(%.2f, %.2f) | C:(%.2f, %.2f) | D:(%.2f, %.2f)".formatted(a().getX(), a().getY(), b().getX(), b().getY(), c().getX(), c().getY(), d().getX(), d().getY());
    }


    /**
     * Multiplies the lengths of the sides AB and AD.
     *
     * @return the product of both side lengths
     */
    public double size() {

        double sideAB = a().distance(b());
        double sideAD = a().distance(d());
        return sideAB * sideAD;
    }

}

View File

@ -0,0 +1,14 @@
package com.knecon.fforesight.service.ocr.processor.model;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * A rendered page image that is stored as a file.
 *
 * @param pageNumber       the number of the page
 * @param absoluteFilePath the path the image is read from
 */
public record RenderedPageImageFile(int pageNumber, String absoluteFilePath) implements UnprocessedImage {

    /**
     * Reads the image file with Leptonica.
     *
     * @return the result of {@code Leptonica1.pixRead} for the file path
     */
    @Override
    public Pix asPix() {

        Pix pagePix = Leptonica1.pixRead(absoluteFilePath);
        return pagePix;
    }

}

View File

@ -0,0 +1,82 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Pix;
/**
 * {@link OcrImage} backed by a rendering of a whole page.
 * Field order defines the parameter order of the Lombok-generated constructor.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class RenderedPageOcrImage implements OcrImage {

    int height;
    int width;
    PageInformation pageInformation;
    Pix pix;
    int rotationDegrees;


    /**
     * Builds the transform from image coordinates to page coordinates by applying a scaling,
     * then a vertical mirror, then a rotation.
     *
     * @return a new {@link AffineTransform}
     */
    @Override
    public AffineTransform getImageCTM() {

        double scale = calculateScalingFactor();

        AffineTransform imageToCropBoxScaling = new AffineTransform(scale, 0, 0, scale, 0, 0);
        AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
        AffineTransform rotationMatrix = createRotationMatrix(calculateTotalRotation());

        // matrix multiplication is performed from right to left, so the order is reversed.
        // scaling -> mirror -> rotation
        AffineTransform combined = new AffineTransform(rotationMatrix);
        combined.concatenate(mirrorMatrix);
        combined.concatenate(imageToCropBoxScaling);
        return combined;
    }


    /** Creates the rotation matrix for 90, 180 or 270 degrees; identity for every other value. */
    private AffineTransform createRotationMatrix(int totalRotation) {

        if (totalRotation == 90) {
            return new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
        }
        if (totalRotation == 180) {
            return new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
        }
        if (totalRotation == 270) {
            // results from 90 + 180 rotations
            return new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height());
        }
        return new AffineTransform();
    }


    /** Combines the page rotation with the reversal of the image rotation, normalized to [0, 360). */
    private int calculateTotalRotation() {

        int reversedImageRotation = 360 - rotationDegrees;
        return (pageInformation.rotationDegrees() + reversedImageRotation) % 360;
    }


    @Override
    public int getPageNumber() {

        return pageInformation.number();
    }


    /** Always 0 for this implementation. */
    @Override
    public int getNumberOnPage() {

        return 0;
    }


    /** Ratio of the page width (the page height for pages rotated by 90 or 270 degrees) to the image width. */
    private double calculateScalingFactor() {

        // PDFBox always returns page height and width based on rotation
        int pageRotation = pageInformation.rotationDegrees();
        boolean sideways = pageRotation == 90 || pageRotation == 270;
        double pageWidth = sideways ? pageInformation.height() : pageInformation.width();
        return pageWidth / width;
    }

}

View File

@ -0,0 +1,135 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import io.github.karols.hocr4j.Word;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
/**
 * A word found by OCR: its text, its bounds in image coordinates and the CTM of the image it was found in.
 */
@Getter
@FieldDefaults(level = AccessLevel.PRIVATE)
public class TextPositionInImage {

    final QuadPoint position;
    final String text;
    final AffineTransform imageCTM;

    @Setter
    FontMetricsFactory fontMetricsFactory;
    @Setter
    FontStyle fontStyle;


    /**
     * @param word               the word whose bounds and text are stored
     * @param imageCTM           the transform applied to the word's bounds
     * @param fontMetricsFactory the factory used for font, font size and metrics
     * @param fontStyle          the font style of the word
     */
    public TextPositionInImage(Word word, AffineTransform imageCTM, FontMetricsFactory fontMetricsFactory, FontStyle fontStyle) {

        this.position = QuadPoint.fromBounds(word.getBounds());
        this.text = word.getText();
        this.imageCTM = imageCTM;
        this.fontMetricsFactory = fontMetricsFactory;
        this.fontStyle = fontStyle;
    }


    /** @return the word's bounds transformed with the image CTM */
    public QuadPoint getTransformedTextBBox() {

        return position.getTransformed(imageCTM);
    }


    /** @return the font provided by the font metrics factory */
    public PDFont getFont() {

        return fontMetricsFactory.getFont();
    }


    /**
     * Builds the text matrix used to place the word.
     *
     * @return the text matrix
     */
    public Matrix getTextMatrix() {

        double transformedWidth = getTransformedWidth();
        double transformedHeight = getTransformedHeight();
        FontMetrics metrics = fontMetricsFactory.calculateMetrics(text, transformedWidth, transformedHeight);

        // Matrix multiplication is from right to left:
        // convert to image coords -> subtract descent -> scale height -> reverse imageCTM scaling -> translate to coordinates in image -> convert to pdf coords
        // width must not be set, since it is scaled with the fontsize attribute
        AffineTransform textTransform = new AffineTransform(imageCTM);
        Point2D anchor = position.a();
        textTransform.translate(anchor.getX(), anchor.getY());
        // scale with transformation coefficient, such that fontsize may be set with transformed width.
        textTransform.scale(getWidth() / transformedWidth, getHeight() / transformedHeight);
        textTransform.scale(1, metrics.getHeightScaling());
        textTransform.translate(0, metrics.getDescent());
        // start in image coordinates, with (0,0) being top left and negative height.
        textTransform.concatenate(new AffineTransform(1, 0, 0, -1, 0, 0));
        return new Matrix(textTransform);
    }


    /** @return the font size calculated by the factory for the text and its transformed width */
    public double getFontSize() {

        return fontMetricsFactory.calculateFontSize(text, getTransformedWidth());
    }


    /** @return the distance between the transformed corners A and D */
    public double getTransformedWidth() {

        Point2D from = transformedA();
        return from.distance(transformedD());
    }


    /** @return the distance between the transformed corners A and B */
    public double getTransformedHeight() {

        Point2D from = transformedA();
        return from.distance(transformedB());
    }


    /** @return the distance between the corners A and D in image coordinates */
    public double getWidth() {

        return position.a().distance(position.d());
    }


    /** @return the calculated font size multiplied by the height scaling of the calculated metrics */
    public double getTextHeight() {

        FontMetrics metrics = fontMetricsFactory.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
        double fontSize = fontMetricsFactory.calculateFontSize(text, getTransformedWidth());
        return fontSize * metrics.getHeightScaling();
    }


    /** @return the distance between the corners A and B in image coordinates */
    public double getHeight() {

        return position.a().distance(position.b());
    }


    /** @return corner A transformed with the image CTM */
    public Point2D transformedA() {

        return imageCTM.transform(position.a(), null);
    }


    /** @return corner B transformed with the image CTM */
    public Point2D transformedB() {

        return imageCTM.transform(position.b(), null);
    }


    /** @return corner C transformed with the image CTM */
    public Point2D transformedC() {

        return imageCTM.transform(position.c(), null);
    }


    /** @return corner D transformed with the image CTM */
    public Point2D transformedD() {

        return imageCTM.transform(position.d(), null);
    }

}

View File

@ -0,0 +1,9 @@
package com.knecon.fforesight.service.ocr.processor.model;
import net.sourceforge.lept4j.Pix;
/**
 * An image that can be converted to a Leptonica {@link Pix}.
 */
public interface UnprocessedImage {

    /**
     * Provides this image as a {@link Pix}.
     *
     * @return the image as a Pix
     */
    Pix asPix();

}

View File

@ -0,0 +1,58 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import java.util.List;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Bundles an image, its bounds and the words found in it (each with its clipped word image).
 * Field order defines the parameter order of the Lombok-generated constructor.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public final class FontStyleDetectionModel {

    QuadPoint imageBounds;
    Pix image;
    List<TextPositionAndWordImage> textPositionsAndWordImages;


    /**
     * Reads the image stored next to the Tesseract output (output path plus {@code ".tiff"}) and creates a
     * {@link TextPositionAndWordImage} for every non-blank word of the OCR result.
     *
     * @param ocrResult          the OCR result to build the model from
     * @param fontMetricsFactory factory handed to every created text position
     * @param settings           settings handed to every created word image
     * @return the model for the OCR result
     */
    public static FontStyleDetectionModel fromOcrResult(OcrResult ocrResult, FontMetricsFactory fontMetricsFactory, OcrServiceSettings settings) {

        String imagePath = ocrResult.tesseractOutputFilePath() + ".tiff";
        Pix pageImage = Leptonica1.pixRead(imagePath);

        List<TextPositionAndWordImage> words = ocrResult.getAllWords()
                .stream()
                .filter(word -> !word.isBlank())
                .map(word -> TextPositionAndWordImage.create(ocrResult.image().getImageCTM(), word, pageImage, settings, fontMetricsFactory))
                .toList();

        QuadPoint bounds = ocrResult.image().getImageCoordinatesInInitialUserSpace();
        return new FontStyleDetectionModel(bounds, pageImage, words);
    }


    /** @return the text position of every word, in word order */
    public List<TextPositionInImage> getTextPositionInImages() {

        return textPositionsAndWordImages.stream()
                .map(TextPositionAndWordImage::getTextPositionInImage)
                .toList();
    }


    /** @return the word image of every word, in word order */
    public List<WordImage> getWordImages() {

        return textPositionsAndWordImages.stream()
                .map(TextPositionAndWordImage::getWordImage)
                .toList();
    }


    /** Disposes the image first and then every word image. */
    public void dispose() {

        LeptUtils.disposePix(image);
        for (WordImage wordImage : getWordImages()) {
            wordImage.dispose();
        }
    }

}

View File

@ -0,0 +1,52 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import java.awt.geom.AffineTransform;
import java.util.Objects;
import org.apache.commons.math3.ml.clustering.Clusterable;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import io.github.karols.hocr4j.Word;
import lombok.Getter;
import net.sourceforge.lept4j.Pix;
/**
 * Pairs the text position of a word with the image clipped for that word. Clustering uses the
 * point of the word image.
 */
@Getter
public final class TextPositionAndWordImage implements Clusterable {

    private final TextPositionInImage textPositionInImage;
    private final WordImage wordImage;


    /**
     * @param textPositionInImage the text position of the word
     * @param wordImage           the image of the word
     */
    public TextPositionAndWordImage(TextPositionInImage textPositionInImage, WordImage wordImage) {

        this.wordImage = wordImage;
        this.textPositionInImage = textPositionInImage;
    }


    /**
     * Creates the text position (with regular font style) and the word image for a word.
     *
     * @param imageCTM           the CTM of the image the word was found in
     * @param word               the word
     * @param image              the image the word image is taken from
     * @param settings           settings handed to the word image
     * @param fontMetricsFactory factory handed to the text position
     * @return the pair of text position and word image
     */
    public static TextPositionAndWordImage create(AffineTransform imageCTM, Word word, Pix image, OcrServiceSettings settings, FontMetricsFactory fontMetricsFactory) {

        TextPositionInImage position = new TextPositionInImage(word, imageCTM, fontMetricsFactory, FontStyle.REGULAR);
        double textHeight = position.getTextHeight();
        return new TextPositionAndWordImage(position, new WordImage(textHeight, word, image, settings));
    }


    /** Delegates to the word image. */
    @Override
    public double[] getPoint() {

        return wordImage.getPoint();
    }


    /** @return the text height stored in the word image */
    public double getTextHeight() {

        return wordImage.getTextHeight();
    }

}

View File

@ -0,0 +1,71 @@
package com.knecon.fforesight.service.ocr.processor.model.scriptdetection;
import org.apache.commons.math3.ml.clustering.Clusterable;
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.StrokeWidthCalculator;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import io.github.karols.hocr4j.Word;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * The part of an image that shows a single word, together with the word's text and text height.
 * Instances hold a native {@link Pix} that is released with {@link #dispose()}.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class WordImage implements Clusterable {

    Pix image;
    String text;
    double textHeight;
    OcrServiceSettings settings;


    /**
     * Clips the bounds of the word out of the original image.
     *
     * @param textHeight    the text height stored for the word
     * @param word          the word whose bounds and text are used
     * @param originalImage the image to clip from
     * @param settings      the settings providing the bold threshold
     */
    public WordImage(double textHeight, Word word, Pix originalImage, OcrServiceSettings settings) {

        var wordBounds = word.getBounds();
        Box clipRegion = new Box(wordBounds.getLeft(), wordBounds.getTop(), wordBounds.getWidth(), wordBounds.getHeight(), 1);
        this.image = Leptonica1.pixClipRectangle(originalImage, clipRegion, null);
        clipRegion.clear();

        this.settings = settings;
        this.textHeight = textHeight;
        this.text = word.getText();
    }


    /**
     * Opens the word image with a brick one pixel larger than the rounded stroke width and compares the
     * pixel density that remains, corrected by the rounding error, with the density of the word image.
     *
     * @param strokeWidth the stroke width to compare with
     * @return true if the corrected density ratio exceeds the configured bold threshold
     */
    public boolean hasLargerStrokeWidth(double strokeWidth) {

        int roundedStrokeWidth = (int) Math.round(strokeWidth);
        double roundingError = (roundedStrokeWidth - strokeWidth) / strokeWidth;

        // add 1 to open a bit bigger than the estimated regular stroke width
        int brickSize = roundedStrokeWidth + 1;
        Pix openedPix = Leptonica1.pixOpenBrick(null, image, brickSize, brickSize);
        double openedPixelDensity = ImageProcessingUtils.calculatePixelDensity(openedPix);
        double pixelDensity = ImageProcessingUtils.calculatePixelDensity(image);
        LeptUtils.disposePix(openedPix);

        double correctedDensityRatio = (openedPixelDensity * (1 + roundingError)) / pixelDensity;
        return correctedDensityRatio > settings.getBoldThreshold();
    }


    /** @return a one-element array holding the text height */
    @Override
    public double[] getPoint() {

        return new double[]{textHeight};
    }


    /** Disposes the clipped word image. */
    public void dispose() {

        LeptUtils.disposePix(image);
    }

}

View File

@ -0,0 +1,66 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.tenantcommons.TenantContext;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Transfers the working files of an OCR run between the local file system and the tenant's storage.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class FileStorageService {

    private final StorageService storageService;


    /**
     * Builds the storage id {@code <dossierId>/<fileId>.<TYPE_NAME><extension>}.
     */
    public static String getStorageId(String dossierId, String fileId, FileType fileType) {

        return new StringBuilder().append(dossierId)
                .append("/")
                .append(fileId)
                .append(".")
                .append(fileType.name())
                .append(fileType.getExtension())
                .toString();
    }


    /** @return true if an UNTOUCHED object is stored for the given file */
    public boolean untouchedFileExists(String dossierId, String fileId) {

        String untouchedId = getStorageId(dossierId, fileId, FileType.UNTOUCHED);
        return storageService.objectExists(TenantContext.getTenantId(), untouchedId);
    }


    /**
     * Uploads the document as ORIGIN and the viewer document as VIEWER_DOCUMENT.
     */
    @SneakyThrows
    public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {

        upload(documentFile, dossierId, fileId, FileType.ORIGIN);
        upload(viewerDocumentFile, dossierId, fileId, FileType.VIEWER_DOCUMENT);
    }


    /**
     * Downloads the ORIGIN file and the viewer document. If no viewer document is stored, the
     * ORIGIN file is copied in its place. If no UNTOUCHED object exists yet, the downloaded
     * ORIGIN file is stored as UNTOUCHED.
     */
    @SneakyThrows
    public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {

        storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile);

        boolean viewerDocumentStored = storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT));
        if (!viewerDocumentStored) {
            Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } else {
            storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile);
        }

        if (untouchedFileExists(dossierId, fileId)) {
            return;
        }
        upload(documentFile, dossierId, fileId, FileType.UNTOUCHED);
    }


    /** Streams the local file into storage under the id derived from dossier, file and type. */
    @SneakyThrows
    private void upload(File source, String dossierId, String fileId, FileType fileType) {

        try (var in = new FileInputStream(source)) {
            storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, fileType), in);
        }
    }

}

View File

@ -0,0 +1,169 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.BlockingQueueFiller;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Renders PDF pages to grayscale TIFF images by spawning external Ghostscript ({@code gs}) processes.
 * Pages are split into batches, each batch is rendered by several processes in parallel, and the
 * rendered image files are handed over to the image processing queue.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams handed to the output handlers in executeProcess
public class GhostScriptService {

    static String FORMAT = ".tiff";
    static String DEVICE = "tiffgray";

    OcrServiceSettings settings;


    /**
     * Renders the given pages with Ghostscript and blocks until all Ghostscript processes have exited.
     *
     * @param stitchedPageNumbers  page numbers to render
     * @param documentAbsolutePath path of the PDF passed to Ghostscript
     * @param tmpImageDir          directory the rendered images are written to
     * @param document             currently unused, kept for interface compatibility
     * @param imageProcessingQueue queue the transfer thread forwards rendered images to
     * @param stats                receives the time spent per batch
     */
    @SneakyThrows
    public void renderPagesAsImagesBatchedAndAddToQueue(List<Integer> stitchedPageNumbers,
                                                        String documentAbsolutePath,
                                                        Path tmpImageDir,
                                                        PDDocument document,
                                                        BlockingQueue<UnprocessedImage> imageProcessingQueue,
                                                        Statistics stats) {

        if (stitchedPageNumbers.isEmpty()) {
            // nothing to render; also avoids a batch size of 0 and the resulting 0/0 batch count below
            return;
        }

        BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue = new LinkedBlockingDeque<>();
        BlockingQueueFiller asyncTransferThread = new BlockingQueueFiller(imageFileCollectorQueue, imageProcessingQueue);
        asyncTransferThread.start();

        int numOfProcesses = Math.min(settings.getGsProcessCount(), stitchedPageNumbers.size());

        // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
        List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(stitchedPageNumbers, numOfProcesses, 256 * numOfProcesses);

        for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
            long timestamp = System.currentTimeMillis();
            List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);

            log.info("Batch {}: Running {} gs processes with ({}) pages each",
                     batchIdx,
                     processInfos.size(),
                     processInfos.stream().map(info -> info.stitchedPageNumbers().size()).map(String::valueOf).collect(Collectors.joining(", ")));

            int finalBatchIdx = batchIdx;
            List<Process> processes = processInfos.stream()
                    .parallel()
                    .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.stitchedPageNumbers(), tmpImageDir, documentAbsolutePath))
                    .peek(s -> log.debug(String.join(" ", s.cmdArgs())))
                    .map(processInfo -> executeProcess(processInfo, imageFileCollectorQueue))
                    .toList();

            List<Integer> processExitCodes = new ArrayList<>(processes.size());
            for (Process process : processes) {
                processExitCodes.add(process.waitFor());
            }

            stats.increasePDF2ImgDuration(System.currentTimeMillis() - timestamp);
            log.info("Batch {}: Ghostscript processes finished with exit codes {}", batchIdx, processExitCodes);
            if (processExitCodes.stream().anyMatch(exitCode -> exitCode != 0)) {
                // previously a failed rendering was only visible by reading the info line above
                log.warn("Batch {}: at least one Ghostscript process exited with a non-zero exit code, rendered pages may be missing", batchIdx);
            }
        }

        asyncTransferThread.setAllImagesQueued(true);
    }


    /**
     * Splits the sorted page numbers into batches and, within each batch, into one sub list per process.
     */
    private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {

        // GhostScript command line can only handle so many page numbers at once, so we split it into batches
        int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);

        log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);

        List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
        List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream().sorted().toList(), processCount, batchCount);

        for (var batch : batchedBalancedSublist) {
            List<ProcessInfo> processInfos = new ArrayList<>(processCount);
            for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
                processInfos.add(new ProcessInfo(threadIdx, batch.get(threadIdx)));
            }
            processInfoBatches.add(processInfos);
        }
        return processInfoBatches;
    }


    /**
     * Builds the Ghostscript command for one process and the mapping from page number to the image
     * file that process is expected to write for it.
     */
    private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
                                                          Integer batchIdx,
                                                          List<Integer> stitchedImagePageIndices,
                                                          Path outputDir,
                                                          String documentAbsolutePath) {

        String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();

        Map<Integer, RenderedPageImageFile> fullPageImages = new HashMap<>();
        for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
            Integer pageNumber = stitchedImagePageIndices.get(i);
            // the %04d counter in the output file name is filled with i + 1 for the i-th page of the page list
            fullPageImages.put(pageNumber, new RenderedPageImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
        }

        String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);

        return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
    }


    /** Assembles the {@code gs} argument vector; the page list is passed comma separated. */
    private String[] buildCmdArgs(List<Integer> stitchedImagePageIndices, String documentAbsolutePath, String imagePathFormat) {

        String pageList = stitchedImagePageIndices.stream().map(String::valueOf).collect(Collectors.joining(","));

        return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + settings.getDpi(), "-sPageList=" + pageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
    }


    /**
     * Starts the Ghostscript process and attaches one handler thread each to its stdout and stderr.
     * The process is returned without waiting for it to finish.
     */
    @SneakyThrows
    private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, BlockingQueue<RenderedPageImageFile> imageFileCollectorQueue) {

        Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());

        InputStream stdOut = p.getInputStream();
        GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), imageFileCollectorQueue);

        InputStream stdError = p.getErrorStream();
        GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.errorHandler(stdError);

        stdOutLogger.start();
        stdErrorLogger.start();
        return p;
    }


    private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, RenderedPageImageFile> renderedPageImageFiles) {

    }

    private record ProcessInfo(Integer processIdx, List<Integer> stitchedPageNumbers) {

    }

}

View File

@ -0,0 +1,24 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.FileInputStream;
import java.util.List;
import io.github.karols.hocr4j.Page;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Reads hOCR files and converts them to hocr4j pages.
 */
@UtilityClass
public class HOcrPageParser {

    /**
     * Reads {@code <tesseractOutputFileName>.hocr} and parses it as a single hOCR page.
     *
     * @param tesseractOutputFileName path of the output file without the {@code .hocr} extension
     * @return the first page parsed from the file content
     */
    @SneakyThrows
    public Page extractHocrPage(String tesseractOutputFileName) {

        String hOcrString;
        try (var hocrIn = new FileInputStream(tesseractOutputFileName + ".hocr")) {
            // decode explicitly as UTF-8 instead of the platform default charset,
            // otherwise non-ASCII characters get corrupted on hosts with a different default
            hOcrString = new String(hocrIn.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
        }
        return Page.fromHocr(List.of(hOcrString)).get(0);
    }

}

View File

@ -0,0 +1,16 @@
package com.knecon.fforesight.service.ocr.processor.service;
import org.springframework.stereotype.Service;
/**
 * Outgoing notifications about the state of an OCR run for a file.
 */
@Service
public interface IOcrMessageSender {
/**
 * Reports progress for a file. OcrProgressLogger calls this with the number of images processed
 * so far and the number of images currently known to need processing.
 */
void sendUpdate(String fileId, int finishedImages, int totalImages);
/** Presumably announces that OCR has started for the file; no caller is visible here, confirm against the implementation. */
void sendOCRStarted(String fileId);
/** Reports that OCR has finished for the file. OcrProgressLogger passes the number of images to process as total. */
void sendOcrFinished(String fileId, int totalImages);
/** Presumably sends the final response for the given dossier and file; no caller is visible here, confirm against the implementation. */
void sendOcrResponse(String dossierId, String fileId);
}

View File

@ -0,0 +1,96 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.Getter;
import lombok.SneakyThrows;
/**
 * PDFBox stream engine that collects the image XObjects drawn on a page, together with the
 * transformation matrix that was active when each image was drawn. Form XObjects are descended
 * into. Images smaller than the configured minimum width or height are ignored.
 * <p>
 * Instances keep per-page state and are reused page by page via {@link #processPage(int, PDPage)}.
 */
@Getter
public class ImageStreamEngine extends PDFStreamEngine {

    // replaced by a fresh list for every page; starts out empty so it is never null
    private List<ExtractedImage> imagesOnCurrentPage = new LinkedList<>();
    private final OcrServiceSettings settings;
    private int pageNum;


    public ImageStreamEngine(OcrServiceSettings settings) {

        this.settings = settings;

        // preparing PDFStreamEngine
        addOperator(new Concatenate(this));
        addOperator(new DrawObject(this));
        addOperator(new SetGraphicsStateParameters(this));
        addOperator(new Save(this));
        addOperator(new Restore(this));
        addOperator(new SetMatrix(this));
    }


    /**
     * Intercepts the {@code Do} operator to record images and descend into forms; every other
     * operator is handled by the super class.
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {

        if (!"Do".equals(operator.getName())) {
            super.processOperator(operator, operands);
            return;
        }

        if (operands.isEmpty() || !(operands.get(0) instanceof COSName objectName)) {
            // malformed content stream: "Do" without a name operand. Hand it to the registered
            // operator via the super class instead of failing with an IndexOutOfBounds/ClassCastException.
            super.processOperator(operator, operands);
            return;
        }

        // get the PDF object
        PDXObject xobject = getResources().getXObject(objectName);

        if (xobject instanceof PDImageXObject imageXObject) {
            if (imageXObject.getWidth() < settings.getMinImageWidth() || imageXObject.getHeight() < settings.getMinImageHeight()) {
                return;
            }

            Matrix imageCTM = getGraphicsState().getCurrentTransformationMatrix();

            // the current list size is passed as the image's number on the page
            this.imagesOnCurrentPage.add(new ExtractedImage(pageNum,
                                                            QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, imageXObject.getWidth(), imageXObject.getHeight())),
                                                            imageXObject.getHeight(),
                                                            imageXObject.getWidth(),
                                                            imageXObject.getImage(),
                                                            imageCTM,
                                                            imagesOnCurrentPage.size(),
                                                            imageXObject.getColorSpace()));
        } else if (xobject instanceof PDFormXObject form) {
            showForm(form);
        }
    }


    /**
     * Resets the collected images and processes the given page.
     *
     * @param pageNum page number stored on every image found on this page
     * @param page    the page to process
     */
    @SneakyThrows
    public void processPage(int pageNum, PDPage page) {

        this.pageNum = pageNum;
        this.imagesOnCurrentPage = new LinkedList<>();
        super.processPage(page);
    }

}

View File

@ -0,0 +1,176 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.stream.IntStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import org.springframework.util.FileSystemUtils;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.pdftron.pdf.PDFDoc;
import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.annotation.Observed;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Orchestrates a complete OCR run for a single document: optional watermark removal, removal of
 * invisible elements, image extraction, tesseract OCR, font style detection and writing the
 * results back into the document and the viewer document.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OCRService {

    FileStorageService fileStorageService;
    OcrServiceSettings settings;
    IOcrMessageSender ocrMessageSender;
    WatermarkRemovalService watermarkRemovalService;
    InvisibleElementRemovalService invisibleElementRemovalService;
    OcrResultWriter ocrResultWriter;
    GhostScriptService ghostScriptService;
    FontStyleDetector boldDetector;
    ObservationRegistry registry;


    /**
     * Starts the OCR-Process: Collecting images (via threads),
     * looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
     * perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
     *
     * @param dossierId          Id of dossier
     * @param fileId             Id of file
     * @param removeWatermark    whether watermarks are removed from the document before running OCR
     * @param tmpDir             working directory for all files
     * @param documentFile       the file to perform ocr on, results are written invisibly
     * @param viewerDocumentFile debugging file, results are written visibly in an optional content group
     */
    @Observed(name = "OCRService", contextualName = "run-ocr-on-document")
    @SneakyThrows
    public void runOcrOnDocument(String dossierId, String fileId, boolean removeWatermark, Path tmpDir, File documentFile, File viewerDocumentFile) {

        if (removeWatermark) {
            removeWatermarkIfEnabled(documentFile);
        }

        removeInvisibleElements(documentFile);

        log.info("Starting OCR for file {}", fileId);
        long ocrStart = System.currentTimeMillis();

        Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId);

        long ocrEnd = System.currentTimeMillis();
        log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
        log.info("Runtime breakdown: {}", stats);
    }


    /**
     * Rewrites the given file in place with its invisible elements removed.
     */
    @SneakyThrows
    private void removeInvisibleElements(File originFile) {

        Path tmpFile = Files.createTempFile("invisibleElements", ".pdf");
        try {
            try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
                invisibleElementRemovalService.removeInvisibleElements(in, out, false, false);
            }
            Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } finally {
            // must not live inside an assert: assertions are disabled by default, which would leak the temp file
            Files.deleteIfExists(tmpFile);
        }
    }


    /**
     * Rewrites the given file in place with its watermarks removed.
     */
    @SneakyThrows
    private void removeWatermarkIfEnabled(File originFile) {

        Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf");
        try {
            try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
                watermarkRemovalService.removeWatermarks(in, out);
            }
            Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
        } finally {
            // must not live inside an assert: assertions are disabled by default, which would leak the temp file
            Files.deleteIfExists(tmpFile);
        }
    }


    /**
     * Runs image extraction and OCR on the document and writes the results to both files.
     *
     * @param tmpDir             working directory; the sub directories "images" and "tesseract_output" are created in it
     * @param documentFile       the file to perform ocr on
     * @param viewerDocumentFile the viewer document that additionally receives the debug layers
     * @param fileId             Id of file
     * @param dossierId          Id of dossier
     * @return the runtime statistics collected during the run
     */
    @SneakyThrows
    public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) {

        Path tmpImageDir = tmpDir.resolve("images");
        Path tesseractOutputDir = tmpDir.resolve("tesseract_output");

        // fails fast if a working directory cannot be created instead of ignoring the mkdirs() result
        Files.createDirectories(tesseractOutputDir);
        Files.createDirectories(tmpImageDir);

        try (PDDocument document = Loader.loadPDF(documentFile)) {
            OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);

            // never start more threads than there are pages
            int numberOfExtractThreads = Math.min(settings.getImageExtractThreadCount(), document.getNumberOfPages());
            int numberOfOcrThreads = Math.min(settings.getOcrThreadCount(), document.getNumberOfPages());
            Statistics stats = new Statistics(numberOfExtractThreads, numberOfOcrThreads);

            BlockingQueue<OcrImage> ocrImageQueue = new ArrayBlockingQueue<>((int) (1.5 * numberOfOcrThreads));

            OcrImageFactory ocrImageFactory = new OcrImageFactory(document,
                                                                  documentFile,
                                                                  tmpImageDir,
                                                                  numberOfExtractThreads,
                                                                  ghostScriptService,
                                                                  ocrImageQueue,
                                                                  logger,
                                                                  settings,
                                                                  stats);
            ocrImageFactory.start();

            List<OcrResult> ocrResults = new LinkedList<>();
            List<OCRThread> ocrThreads = IntStream.range(0, numberOfOcrThreads)
                    .boxed()
                    .map(id -> new OCRThread(id, ocrImageQueue, tesseractOutputDir, ocrResults, logger, stats, settings))
                    .peek(Thread::start)
                    .toList();
            log.info("Started {} OCR consumer threads, listening for images on the queue", ocrThreads.size());

            ocrImageFactory.join();
            log.info("Processed all images, interrupting ocr threads");

            ocrThreads.forEach(Thread::interrupt);
            for (OCRThread ocrThread : ocrThreads) {
                ocrThread.join();
            }
            log.info("Tesseract OCR has finished for file {} and dossier {}", fileId, dossierId);

            long timestamp = System.currentTimeMillis();
            Map<Integer, List<OcrResultToWrite>> imageWithTextPositionsPerPage = boldDetector.detectBold(ocrResults, document);
            stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);

            timestamp = System.currentTimeMillis();
            ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage);
            log.info("Saving document");
            stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);

            logger.sendFinished();
            return stats;
        }
    }

}

View File

@ -0,0 +1,105 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageExtractionThread;
import com.knecon.fforesight.service.ocr.processor.service.threads.ImageProcessingThread;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
import lombok.AccessLevel;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrImageFactory {
PDDocument document;
File documentFile;
Path tmpImageDir;
GhostScriptService ghostScriptService;
BlockingQueue<UnprocessedImage> imageProcessingQueue;
ImageProcessingThread imageProcessingThread;
BlockingQueue<OcrImage> imageOutputQueue;
List<ImageExtractionThread> imageExtractionThreads;
List<Integer> stitchedPageNumbers;
Statistics stats;
public OcrImageFactory(PDDocument document,
File documentFile,
Path tmpImageDir,
int numberOfThreads,
GhostScriptService ghostScriptService,
BlockingQueue<OcrImage> imageOcrQueue,
OcrProgressLogger logger,
OcrServiceSettings settings,
Statistics stats) {
this.document = document;
this.documentFile = documentFile;
this.tmpImageDir = tmpImageDir;
this.ghostScriptService = ghostScriptService;
this.imageOutputQueue = imageOcrQueue;
this.imageProcessingQueue = new ArrayBlockingQueue<>(imageOcrQueue.remainingCapacity());
this.stitchedPageNumbers = Collections.synchronizedList(new LinkedList<>());
this.stats = stats;
this.imageExtractionThreads = new ArrayList<>(numberOfThreads);
List<List<Integer>> balancedPageNumbers = ListSplittingUtils.buildBalancedContinuousSublist(document.getNumberOfPages(), numberOfThreads);
for (int i = 0; i < balancedPageNumbers.size(); i++) {
imageExtractionThreads.add(new ImageExtractionThread(i, balancedPageNumbers.get(i), documentFile, logger, stats, settings, imageProcessingQueue, stitchedPageNumbers));
}
this.imageProcessingThread = new ImageProcessingThread(imageProcessingQueue, imageOcrQueue, stats, settings, document);
log.info("Started {} image extraction threads, with ({}) pages each",
imageExtractionThreads.size(),
imageExtractionThreads.stream().map(ImageExtractionThread::getPageIndices).map(List::size).map(String::valueOf).collect(Collectors.joining(", ")));
}
public void start() {
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
imageExtractionThread.start();
}
imageProcessingThread.start();
}
@SneakyThrows
public void join() {
for (ImageExtractionThread imageExtractionThread : imageExtractionThreads) {
imageExtractionThread.join();
}
if (!stitchedPageNumbers.isEmpty()) {
ghostScriptService.renderPagesAsImagesBatchedAndAddToQueue(stitchedPageNumbers, documentFile.toString(), tmpImageDir, document, imageProcessingQueue, stats);
}
imageProcessingThread.setAllImagesExtracted(true);
imageProcessingThread.interrupt();
imageProcessingThread.join();
}
}

View File

@ -0,0 +1,91 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Tracks which images of a file still have to be processed and which are done, logs the progress
 * and forwards it to the message sender. Initially one placeholder entry (image number 0) is
 * registered per page.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrProgressLogger {

    Set<ImageNumberWithPageNumber> imagesToProcess;
    Set<ImageNumberWithPageNumber> processedImages;
    IOcrMessageSender ocrMessageSender;
    String fileId;


    public OcrProgressLogger(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId) {

        this.ocrMessageSender = ocrMessageSender;
        this.fileId = fileId;
        this.imagesToProcess = Collections.synchronizedSet(new HashSet<>(totalPageCount));
        // one placeholder per page, page numbers running from 1 to totalPageCount
        for (int pageNumber = 1; pageNumber <= totalPageCount; pageNumber++) {
            imagesToProcess.add(new ImageNumberWithPageNumber(0, pageNumber));
        }
        this.processedImages = Collections.synchronizedSet(new HashSet<>(totalPageCount));
    }


    /**
     * Marks the image as processed, logs it and sends a progress update.
     *
     * @param image the finished image
     * @param psm   the PSM value that is written to the log
     */
    public void logImageFinished(OcrImage image, int psm) {

        ImageNumberWithPageNumber finished = new ImageNumberWithPageNumber(image.getNumberOnPage(), image.getPageNumber());
        this.processedImages.add(finished);

        boolean renderedPage = !(image instanceof ExtractedOcrImage);
        if (renderedPage) {
            log.info("{}/{}: Finished page {} as fully rendered page with rotation {}, used PSM {}",
                     processedImages.size(),
                     imagesToProcess.size(),
                     image.getPageNumber(),
                     image.getRotationDegrees(),
                     psm);
        } else {
            log.info("{}/{}: Finished image {} on page {} with rotation {}, used PSM {}, quad-point: {}",
                     processedImages.size(),
                     imagesToProcess.size(),
                     image.getNumberOnPage(),
                     image.getPageNumber(),
                     image.getRotationDegrees(),
                     psm,
                     image.getImageCoordinatesInInitialUserSpace());
        }

        publishProgress();
    }


    /** Removes the page's placeholder entry from the images to process and sends a progress update. */
    public void logPageSkipped(Integer pageIndex) {

        this.imagesToProcess.remove(new ImageNumberWithPageNumber(0, pageIndex));
        log.debug("{}/{}: No images to ocr on page {}", processedImages.size(), imagesToProcess.size(), pageIndex);
        publishProgress();
    }


    /** Registers an additional image that has to be processed. */
    public void addImagesToProcess(int pageNumber, int imageNumber) {

        ImageNumberWithPageNumber pending = new ImageNumberWithPageNumber(imageNumber, pageNumber);
        this.imagesToProcess.add(pending);
    }


    /** Logs the final counts and notifies the message sender that OCR has finished. */
    public void sendFinished() {

        log.info("{}/{}: Finished OCR on all images", processedImages.size(), imagesToProcess.size());
        ocrMessageSender.sendOcrFinished(fileId, imagesToProcess.size());
    }


    /** Sends the current processed/to-process counts to the message sender. */
    private void publishProgress() {

        ocrMessageSender.sendUpdate(fileId, this.processedImages.size(), this.imagesToProcess.size());
    }


    private record ImageNumberWithPageNumber(int imageNumber, int pageNumber) {

    }

}

View File

@ -0,0 +1,251 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.springframework.stereotype.Service;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.TextExtractor;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrResultWriter {
public static final Color REGULAR_TEXT_COLOR = Color.BLUE;
public static final Color BOLD_TEXT_COLOR = Color.CYAN;
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED;
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED;
ViewerDocumentService viewerDocumentService;
/**
 * Writes the OCR results into both files: the document only receives the OCR text layer, the
 * viewer document additionally receives the text and bounding box debug layers. Results are
 * keyed by page number in the input and stored under {@code pageNumber - 1} in the layers.
 *
 * @param document                 file the OCR text layer is added to
 * @param viewerDocument           file the OCR text layer and both debug layers are added to
 * @param imagesWithResultsPerPage OCR results per page number
 */
@SneakyThrows
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {

    Map<Integer, VisualizationsOnPage> ocrLayer = new HashMap<>();
    Map<Integer, VisualizationsOnPage> textDebugLayer = new HashMap<>();
    Map<Integer, VisualizationsOnPage> bBoxDebugLayer = new HashMap<>();

    try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) {
        for (Map.Entry<Integer, List<OcrResultToWrite>> resultsOfPage : imagesWithResultsPerPage.entrySet()) {
            Integer pageNumber = resultsOfPage.getKey();
            List<OcrResultToWrite> results = resultsOfPage.getValue();

            // words already present on the page, used as ignore zones
            List<Rectangle2D> textBBoxes = getTextBBoxes(doc.getPage(pageNumber));

            int pageIndex = pageNumber - 1;
            ocrLayer.put(pageIndex, createVisualizations(results, textBBoxes));
            textDebugLayer.put(pageIndex, createDebugTextVisualizations(results, textBBoxes));
            bBoxDebugLayer.put(pageIndex, createDebugBBoxVisualizations(results));
        }
    }

    Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrLayer, false);
    Visualizations textDebugVisualizations = new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, textDebugLayer, false);
    Visualizations bBoxDebugVisualizations = new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, bBoxDebugLayer, false);
    List<Visualizations> debugVisualizations = List.of(visualizations, textDebugVisualizations, bBoxDebugVisualizations);

    viewerDocumentService.addVisualizationsOnPage(document, document, List.of(visualizations));
    viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
}
/**
 * Collects the bounding boxes of all words the text extractor finds on the page.
 * Extraction is best effort: if walking the lines or words fails, a warning is logged and the
 * boxes collected up to that point are returned.
 */
@SuppressWarnings("PMD")
private List<Rectangle2D> getTextBBoxes(Page page) {

    List<Rectangle2D> wordBoxes = new ArrayList<>();
    try (var extractor = new TextExtractor()) {
        extractor.begin(page);
        try {
            TextExtractor.Line currentLine = extractor.getFirstLine();
            while (currentLine.isValid()) {
                TextExtractor.Word currentWord = currentLine.getFirstWord();
                while (currentWord.isValid()) {
                    wordBoxes.add(Converter.toRectangle2D(currentWord.getBBox()));
                    // advancing also closes the word that was just handled
                    currentWord = getNextWord(currentWord);
                }
                // advancing also closes the line that was just handled
                currentLine = getNextLine(currentLine);
            }
        } catch (Exception e) {
            log.warn("Could not get word dimension, {}", e.getMessage());
        }
        return wordBoxes;
    }
}
/**
 * Fetches the successor of the given word and closes the given word afterwards.
 */
private static TextExtractor.Word getNextWord(TextExtractor.Word word) {

    final TextExtractor.Word successor = word.getNextWord();
    word.close();
    return successor;
}
/**
 * Fetches the successor of the given line and closes the given line afterwards.
 */
private static TextExtractor.Line getNextLine(TextExtractor.Line line) {

    final TextExtractor.Line successor = line.getNextLine();
    line.close();
    return successor;
}
/**
 * Builds the OCR text layer of a page: every OCR word that does not intersect any ignore zone is placed
 * in black with rendering mode {@code NEITHER}.
 *
 * @param ocrResultsToWrite the OCR results of all images on the page
 * @param ignoreZones       areas that must stay free of OCR text; a word is dropped when the bounds of its
 *                          transformed bounding box intersect one of them
 * @return the visualizations holding the placed texts
 */
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> ignoreZones) {

    List<PlacedText> placedTexts = ocrResultsToWrite.stream()
            .flatMap(ocrResult -> ocrResult.textPositionInImage().stream())
            .filter(word -> !ignoreZones.stream()
                    .anyMatch(ignoreZone -> word.getTransformedTextBBox().getBounds2D().intersects(ignoreZone)))
            .map(word -> new PlacedText(word.getText(),
                                        null,
                                        Color.BLACK,
                                        (float) word.getFontSize(),
                                        word.getFontMetricsFactory(),
                                        Optional.of(word.getTextMatrix()),
                                        Optional.of(RenderingMode.NEITHER)))
            .toList();

    return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
}
/**
 * Builds the debug text layer of a page. All OCR words are drawn filled; the color encodes whether the
 * word is regular or not and whether it intersects one of the given text bounding boxes.
 * <p>
 * The resulting list contains all words outside the text boxes first, followed by the words
 * intersecting a text box.
 *
 * @param ocrResultsToWrite the OCR results of all images on the page
 * @param textBBoxes        bounding boxes of text already on the page, treated as ignore zones
 * @return the visualizations holding the colored debug texts
 */
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {

    List<PlacedText> textsToDraw = new ArrayList<>();
    List<PlacedText> textsInIgnoreZones = new ArrayList<>();

    for (OcrResultToWrite ocrResult : ocrResultsToWrite) {
        for (TextPositionInImage word : ocrResult.textPositionInImage()) {
            boolean overlapsExistingText = textBBoxes.stream()
                    .anyMatch(ignoreZone -> word.getTransformedTextBBox().getBounds2D().intersects(ignoreZone));
            boolean isRegular = word.getFontStyle().equals(FontStyle.REGULAR);

            if (overlapsExistingText) {
                textsInIgnoreZones.add(new PlacedText(word.getText(),
                                                      null,
                                                      isRegular ? REGULAR_TEXT_IN_IGNORE_ZONE : BOLD_TEXT_IN_IGNORE_ZONE,
                                                      (float) word.getFontSize(),
                                                      word.getFontMetricsFactory(),
                                                      Optional.of(word.getTextMatrix()),
                                                      Optional.of(RenderingMode.FILL)));
            } else {
                textsToDraw.add(new PlacedText(word.getText(),
                                               null,
                                               isRegular ? REGULAR_TEXT_COLOR : BOLD_TEXT_COLOR,
                                               (float) word.getFontSize(),
                                               word.getFontMetricsFactory(),
                                               Optional.of(word.getTextMatrix()),
                                               Optional.of(RenderingMode.FILL)));
            }
        }
    }

    return VisualizationsOnPage.builder()
            .placedTexts(Stream.concat(textsToDraw.stream(), textsInIgnoreZones.stream()).toList())
            .build();
}
/**
 * Builds the debug bounding box layer of a page: the outline of every OCR word, followed by a grid over
 * the bounding box of every OCR'd image.
 *
 * @param ocrResultsToWrite the OCR results of all images on the page
 * @return the visualizations holding the colored lines
 */
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {

    Stream<ColoredLine> wordOutlines = ocrResultsToWrite.stream()
            .flatMap(ocrResult -> ocrResult.textPositionInImage().stream())
            .map(word -> word.getTransformedTextBBox())
            .flatMap(wordBBox -> quadPointAsLines(wordBBox).stream());

    Stream<ColoredLine> imageGrids = ocrResultsToWrite.stream()
            .map(ocrResult -> ocrResult.imageBoundingBox())
            .flatMap(imageBBox -> createGrid(imageBBox).stream());

    List<ColoredLine> coloredLines = Stream.concat(wordOutlines, imageGrids).toList();
    return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
}
/**
 * Converts a quadrilateral into its four edges, each drawn with width 1 in its own color
 * (a-b orange, b-c blue, c-d green, d-a magenta), so the orientation of the quad is visible.
 *
 * @param rect the quadrilateral with corners a, b, c and d
 * @return the four edges in the order a-b, b-c, c-d, d-a
 */
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {

    var cornerA = rect.a();
    var cornerB = rect.b();
    var cornerC = rect.c();
    var cornerD = rect.d();

    ColoredLine edgeAb = new ColoredLine(new Line2D.Double(cornerA, cornerB), Color.ORANGE, 1);
    ColoredLine edgeBc = new ColoredLine(new Line2D.Double(cornerB, cornerC), Color.BLUE, 1);
    ColoredLine edgeCd = new ColoredLine(new Line2D.Double(cornerC, cornerD), Color.GREEN, 1);
    ColoredLine edgeDa = new ColoredLine(new Line2D.Double(cornerD, cornerA), Color.MAGENTA, 1);
    return List.of(edgeAb, edgeBc, edgeCd, edgeDa);
}
/**
 * Creates the outline of the given quadrilateral plus an 8x8 grid of thin black lines inside it.
 * <p>
 * The first set of grid lines runs parallel to the edge a-d and is evenly spaced along a-b, the second
 * set runs parallel to a-b and is evenly spaced along a-d.
 *
 * @param rect the quadrilateral with corners a, b, c and d
 * @return the four outline edges followed by the grid lines
 */
@SneakyThrows
private List<ColoredLine> createGrid(QuadPoint rect) {

    int nRows = 8;
    int nCols = 8;

    // the list is only appended to, so an ArrayList with the final size is all that is needed
    List<ColoredLine> lines = new ArrayList<>(4 + nRows + nCols);
    lines.addAll(quadPointAsLines(rect));

    addEvenlySpacedLines(lines, rect.a(), rect.b(), rect.d(), nRows);
    addEvenlySpacedLines(lines, rect.a(), rect.d(), rect.b(), nCols);
    return lines;
}


/**
 * Appends {@code count} thin black lines parallel to the edge origin-lineTarget, evenly distributed
 * between that edge and the opposite one by stepping along the edge origin-stepTarget.
 */
private void addEvenlySpacedLines(List<ColoredLine> lines, Point2D origin, Point2D stepTarget, Point2D lineTarget, int count) {

    // count lines split the edge into count + 1 equally sized segments
    Point2D step = new Point2D.Double((stepTarget.getX() - origin.getX()) / (count + 1), (stepTarget.getY() - origin.getY()) / (count + 1));
    Point2D start = add(origin, step);
    Point2D end = add(lineTarget, step);
    for (int i = 0; i < count; ++i) {
        lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f));
        start = add(start, step);
        end = add(end, step);
    }
}
/**
 * Adds two points component-wise.
 *
 * @param a the first point
 * @param b the second point
 * @return a new point holding the sums of the x and of the y coordinates
 */
private Point2D add(Point2D a, Point2D b) {

    double x = a.getX() + b.getX();
    double y = a.getY() + b.getY();
    return new Point2D.Double(x, y);
}
}

View File

@ -1,4 +1,10 @@
package com.iqser.red.service.ocr.v1.server.utils;
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import org.apache.commons.lang3.StringUtils;
@ -21,6 +27,14 @@ public final class OsUtils {
return addBackSlashAtEnd(getTemporaryDirectory()) + addBackSlashAtEnd(SERVICE_NAME) + addBackSlashAtEnd(suffix) + addBackSlashAtEnd(fileId);
}
/**
 * Writes the content of the given stream to the file {@code document.pdf} inside the given directory.
 * <p>
 * The data is streamed to disk instead of being read into memory first, so large documents do not
 * need to fit on the heap. The input stream is not closed by this method.
 *
 * @param in     the stream to read the document from
 * @param tmpDir the existing directory to create the file in
 * @return the written file
 * @throws IOException if reading the stream or writing the file fails
 */
public static File writeFileToTmpFolder(InputStream in, Path tmpDir) throws IOException {

    File pdfFile = tmpDir.resolve("document.pdf").toFile();
    try (var fileOut = new FileOutputStream(pdfFile)) {
        in.transferTo(fileOut);
    }
    return pdfFile;
}
private static boolean isWindows() {
@ -55,4 +69,10 @@ public final class OsUtils {
return "/tmp";
}
/**
 * Creates the path of a file inside the temporary directory, named like the given file but with
 * {@code _<suffix>} inserted in front of its {@code .pdf} extension.
 * <p>
 * Only a trailing {@code .pdf} of the file name is rewritten; further occurrences of ".pdf" inside the
 * name or inside the temporary directory path are left alone. A file name without that extension is
 * used unchanged.
 *
 * @param filename the name or path of the original file; only its last path element is used
 * @param suffix   the suffix to insert in front of the extension
 * @return the path of the temporary file as a string
 */
public static String createTmpFileName(String filename, String suffix) {

    final String extension = ".pdf";
    String baseName = Path.of(filename).getFileName().toString();
    if (baseName.endsWith(extension)) {
        baseName = baseName.substring(0, baseName.length() - extension.length()) + "_" + suffix + extension;
    }
    return Path.of(OsUtils.getTemporaryDirectory()).resolve(baseName).toString();
}
}

View File

@ -0,0 +1,85 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
/**
 * Accumulates the durations of the individual OCR processing stages.
 * <p>
 * Image extraction and tesseract durations are tracked per thread (one slot per thread id), all other
 * stages are tracked as a single total. {@link #toString()} reports every value divided by 1000.
 */
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class Statistics {

    List<Long> imageExtraction;
    List<Long> tesseractDuration;
    AtomicLong pdf2ImgDuration;
    AtomicLong writingTextDuration;
    AtomicLong imageProcessingDuration;
    AtomicLong fontStyleDetectionDuration;


    /**
     * @param numberOfExtractThreads number of slots to track image extraction durations in
     * @param numberOfOcrThreads     number of slots to track tesseract durations in
     */
    public Statistics(int numberOfExtractThreads, int numberOfOcrThreads) {

        this.imageExtraction = zeroedSlots(numberOfExtractThreads);
        this.tesseractDuration = zeroedSlots(numberOfOcrThreads);
        this.fontStyleDetectionDuration = new AtomicLong(0);
        this.pdf2ImgDuration = new AtomicLong(0);
        this.writingTextDuration = new AtomicLong(0);
        this.imageProcessingDuration = new AtomicLong(0);
    }


    /** Adds the duration to the image extraction slot of the given thread. */
    public void increaseImageExtraction(int threadId, long duration) {

        long previous = imageExtraction.get(threadId);
        imageExtraction.set(threadId, previous + duration);
    }


    /** Adds the duration to the total image processing duration. */
    public void increaseImageProcessing(long duration) {

        imageProcessingDuration.addAndGet(duration);
    }


    /** Adds the duration to the tesseract slot of the given thread. */
    public void increaseTesseractDuration(int threadId, long duration) {

        long previous = tesseractDuration.get(threadId);
        tesseractDuration.set(threadId, previous + duration);
    }


    /** Adds the duration to the total PDF to image conversion duration. */
    public void increasePDF2ImgDuration(long duration) {

        pdf2ImgDuration.addAndGet(duration);
    }


    /** Adds the duration to the total text writing duration. */
    public void increaseWritingTextDuration(long duration) {

        writingTextDuration.addAndGet(duration);
    }


    /** Adds the duration to the total font style detection duration. */
    public void increaseFontStyleDetectionDuration(long duration) {

        fontStyleDetectionDuration.addAndGet(duration);
    }


    /**
     * Summarizes all durations: mean, max and min over the per-thread slots, and the totals of the
     * remaining stages.
     */
    @Override
    public String toString() {

        return String.format(
                "imageExtraction: mean %.2f s, max %.2f s, min %.2f, tesseract: mean %.2f s, max %.2f s, min %.2f, ImageProcessing=%.2f s, PDF2Img=%.2f s, writingText=%.2f s, FontstyleDetection=%.2f s",
                meanSeconds(imageExtraction),
                maxSeconds(imageExtraction),
                minSeconds(imageExtraction),
                meanSeconds(tesseractDuration),
                maxSeconds(tesseractDuration),
                minSeconds(tesseractDuration),
                toSeconds(imageProcessingDuration.get()),
                toSeconds(pdf2ImgDuration.get()),
                toSeconds(writingTextDuration.get()),
                toSeconds(fontStyleDetectionDuration.get()));
    }


    // a synchronized list with the given number of slots, all starting at zero
    private static List<Long> zeroedSlots(int numberOfSlots) {

        return Collections.synchronizedList(new ArrayList<>(Collections.nCopies(numberOfSlots, 0L)));
    }


    // mean over all slots divided by 1000, 0 if there are no slots
    private static float meanSeconds(List<Long> durations) {

        return (float) durations.stream().mapToLong(Long::longValue).average().orElse(0) / 1000;
    }


    // largest slot divided by 1000, 0 if there are no slots
    private static float maxSeconds(List<Long> durations) {

        return toSeconds(durations.stream().mapToLong(Long::longValue).max().orElse(0));
    }


    // smallest slot divided by 1000, 0 if there are no slots
    private static float minSeconds(List<Long> durations) {

        return toSeconds(durations.stream().mapToLong(Long::longValue).min().orElse(0));
    }


    private static float toSeconds(long duration) {

        return (float) duration / 1000;
    }

}

View File

@ -0,0 +1,43 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
import org.apache.pdfbox.pdmodel.font.PDFont;
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * An embeddable font that can estimate the font size and vertical metrics needed to fit a text into a
 * box of a given width and height.
 */
public interface FontMetricsFactory extends EmbeddableFont {

    /**
     * @return the font used for all width calculations
     */
    PDFont getFont();


    /**
     * Determines the height and descent of the given text when set in this font.
     *
     * @param text the text to measure
     * @return the height and descent of the text
     */
    HeightAndDescent calculateHeightAndDescent(String text);


    /**
     * Calculates the metrics needed to fit the text into a box of the given dimensions: the font size is
     * derived from the width, the height scaling compensates the difference to the given height.
     *
     * @param text       the text to fit
     * @param textWidth  the width the text should occupy
     * @param textHeight the height the text should occupy
     * @return the descent scaled to the font size, the font size and the height scaling factor
     */
    default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {

        HeightAndDescent heightAndDescent = calculateHeightAndDescent(text);
        float fontSize = calculateFontSize(text, textWidth);

        var glyphExtent = heightAndDescent.height() - heightAndDescent.descent();
        float heightScaling = (float) ((textHeight / glyphExtent) * 1000) / fontSize;
        var scaledDescent = (heightAndDescent.descent() / 1000) * fontSize;

        return new FontMetrics(scaledDescent, fontSize, heightScaling);
    }


    /**
     * Calculates the font size at which the text occupies the given width.
     *
     * @param text      the text to fit
     * @param textWidth the width the text should occupy
     * @return the font size
     */
    @SneakyThrows
    default float calculateFontSize(String text, double textWidth) {

        float unscaledWidth;
        try {
            unscaledWidth = getFont().getStringWidth(text);
        } catch (IllegalArgumentException missingGlyph) {
            // this means, the font has no glyph for this character, so we estimate using the average glyph width
            unscaledWidth = getFont().getAverageFontWidth() * text.length();
        }
        return (float) (textWidth / unscaledWidth) * 1000;
    }

}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
/**
 * The font styles a word recognized by the OCR can be assigned.
 */
public enum FontStyle {
    REGULAR,
    BOLD,
    ITALIC
}

View File

@ -0,0 +1,140 @@
package com.knecon.fforesight.service.ocr.processor.service.fonts;
import java.io.ByteArrayInputStream;
import java.util.Set;
import org.apache.fontbox.ttf.GlyphData;
import org.apache.fontbox.ttf.TTFParser;
import org.apache.fontbox.ttf.TrueTypeFont;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import com.knecon.fforesight.service.ocr.processor.model.HeightAndDescent;
import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * {@link FontMetricsFactory} backed by one of the bundled TrueType fonts, loaded as a Type0 font into a
 * specific document.
 * <p>
 * The original {@link TrueTypeFont} is kept next to the {@link PDType0Font}, since the glyph outlines
 * are read from it when calculating the height and descent of a text.
 */
@Slf4j
@RequiredArgsConstructor
@AllArgsConstructor
public class Type0FontMetricsFactory implements FontMetricsFactory {

    private final String resourcePath;
    private PDType0Font type0Font;
    private TrueTypeFont trueTypeFont;
    private PDDocument documentThisIsEmbeddedIn;

    // for this specific font back-/forward-slashes have a lot of descent screwing up the font size and therefore bold detection. So if we find such a character we ignore its descent.
    private static final Set<Integer> slashGlyphIds = Set.of(18, 63);


    /**
     * Creates a factory for the bundled regular font, loaded into the given document.
     */
    @SneakyThrows
    public static Type0FontMetricsFactory regular(PDDocument document) {

        String resourcePath = "fonts/cmu-regular.ttf";
        return createFromResourcePath(resourcePath, document);
    }


    /**
     * Creates a factory for the bundled bold font, loaded into the given document.
     */
    @SneakyThrows
    public static Type0FontMetricsFactory bold(PDDocument document) {

        String resourcePath = "fonts/cmu-bold.ttf";
        return createFromResourcePath(resourcePath, document);
    }


    /**
     * Parses the TrueType font found under the given classpath resource.
     *
     * @throws IllegalStateException if no resource exists under the given path
     */
    @SneakyThrows
    @SuppressWarnings("PMD.CloseResource")
    private static TrueTypeFont readFromResourcePath(String resourcePath) {

        // The ttf is closed with the document, see PDType0Font line 134
        try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(resourcePath)) {
            if (in == null) {
                // fail with a clear message instead of a NullPointerException from inside the buffer
                throw new IllegalStateException("Font resource not found on classpath: " + resourcePath);
            }
            try (var buffer = new RandomAccessReadBuffer(in)) {
                return new TTFParser().parse(buffer);
            }
        }
    }


    @SneakyThrows
    @SuppressWarnings("PMD.CloseResource")
    private static Type0FontMetricsFactory createFromResourcePath(String resourcePath, PDDocument document) {

        TrueTypeFont trueTypeFont = readFromResourcePath(resourcePath);
        // since Type0Font can be descendant from any font, we need to remember the original TrueTypeFont for the glyph information
        return new Type0FontMetricsFactory(resourcePath, PDType0Font.load(document, trueTypeFont, true), trueTypeFont, document); // use Type0Font for unicode support)
    }


    /**
     * Determines the maximum glyph height and the minimum glyph descent over all glyphs of the text.
     * <p>
     * If the text can not be encoded with this font, a fixed height of 800 and descent of -50 is
     * returned. Glyphs without outline data are skipped and the descent of slash glyphs is ignored.
     * The returned height is at least 500.
     *
     * @param text the text to measure
     * @return the height and descent of the text
     */
    @Override
    @SneakyThrows
    public HeightAndDescent calculateHeightAndDescent(String text) {

        byte[] bytes;
        try {
            bytes = type0Font.encode(text);
        } catch (IllegalArgumentException e) {
            log.warn("The string {} could not be parsed, using average height and descent", text);
            return new HeightAndDescent(800, -50);
        }

        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        float descent = 0;
        float height = 0;
        while (in.available() > 0) {
            try {
                int code = type0Font.readCode(in);
                int glyphId = type0Font.codeToGID(code);
                GlyphData glyph = trueTypeFont.getGlyph().getGlyph(glyphId);
                if (glyph == null || glyph.getBoundingBox() == null) {
                    continue;
                }
                if (!slashGlyphIds.contains(glyphId)) {
                    descent = Math.min(descent, glyph.getYMinimum());
                }
                height = Math.max(height, glyph.getYMaximum());
            } catch (Exception e) {
                // best effort: the glyph is skipped and the remaining glyphs are still evaluated
                log.warn("descent and height of string {} could not be parsed, using average fallback value!", text);
            }
        }

        // some characters like comma or minus return very small height values, while tesseract still returns a normal-sized bounding box and therefore exploding the height scaling factors,
        // so we need a minimum value. Here, 500 seems optimal for the characters "-", ",", "_"
        return new HeightAndDescent(Math.max(height, 500), descent);
    }


    @Override
    public PDFont getFont() {

        return type0Font;
    }


    /**
     * Makes sure the font is loaded into the given document and returns it. If the font is currently
     * loaded into another document, or into none at all, it is read again from its resource and loaded
     * into the given document.
     *
     * @param document the document to embed the font in
     * @return the font loaded into the given document
     */
    @Override
    @SneakyThrows
    public PDFont embed(PDDocument document) {

        // documentThisIsEmbeddedIn is null for instances created with the resource path only
        if (documentThisIsEmbeddedIn != null && documentThisIsEmbeddedIn.equals(document)) {
            return getFont();
        }
        // no need to close, the font will be closed with the document it is embedded in
        this.trueTypeFont = readFromResourcePath(resourcePath);
        this.type0Font = PDType0Font.load(document, trueTypeFont, true);
        this.documentThisIsEmbeddedIn = document;
        return getFont();
    }


    /**
     * Closes the underlying TrueType font, if one has been loaded.
     */
    @SneakyThrows
    public void close() {

        if (trueTypeFont != null) {
            trueTypeFont.close();
        }
    }

}

View File

@ -0,0 +1,158 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Stream;
import org.apache.commons.math3.ml.clustering.Cluster;
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.FontStyleDetectionModel;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.TextPositionAndWordImage;
import com.knecon.fforesight.service.ocr.processor.model.scriptdetection.WordImage;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Detects bold words in OCR results and assigns the matching font style and font to each word.
 */
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FontStyleDetector {

    OcrServiceSettings settings;
    StrokeWidthCalculator strokeWidthCalculator;


    /**
     * Implementation of the MOBDoB algorithm, refer to the paper here:
     * <a href="http://mile.ee.iisc.ac.in/publications/softCopy/DocumentAnalysis/Sai_NCVPRIPG2013.pdf">Script Independent Detection of Bold Words in Multi Font-size Documents</a>
     * <p>
     * As a high level overview: We cluster all text based on its font size. We determine the cluster with the most words. This is assumed to be regular text.
     * We then estimate the average stroke width of that cluster by thinning all text to a single pixel and calculating the ratio of remaining pixels.
     * (<a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>)
     * For each word we scale this average strokewidth based on its fontsize compared to the most common fontsize.
     * Using the scaled strokewidth we do an opening operation.
     * (<a href="https://en.wikipedia.org/wiki/Opening_(morphology)">Opening (Morphology)</a>).
     * We then threshold the ratio of remaining pixels to determine whether a word is bold or not.
     * <p>
     * I did take some liberties though. Firstly, the paper uses text height without ascender/descender height for the clustering. I'm using the previously implemented font size estimation.
     * But that is calculated based on text width. Thus, I'm also using the height scaling factor to scale the font size by the text height.
     * The paper does not describe its clustering algorithm, so I've decided on DBSCAN due to its good runtime and readily available implementation by apache commons math.
     * Moreover, the paper states that stroke width scales linearly with text height. I've come to the conclusion this is not the case.
     * It seems it scales with the square root of the text height. Or at least this seemed to give the best results for me.
     * <p>
     * If bold detection is disabled in the settings, all results are written with the regular font and no detection takes place.
     *
     * @param ocrResults the OCR results to detect bold words in
     * @param document   the document the fonts are loaded into
     * @return the results to write, grouped by page number
     */
    public Map<Integer, List<OcrResultToWrite>> detectBold(List<OcrResult> ocrResults, PDDocument document) {

        FontMetricsFactory fontMetricsFactory = Type0FontMetricsFactory.regular(document);

        if (!settings.isBoldDetection()) {
            return OcrResultToWrite.buildOcrResultsToWrite(ocrResults, fontMetricsFactory);
        }

        Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage = new HashMap<>();
        DBSCANClusterer<TextPositionAndWordImage> clusterer = new DBSCANClusterer<>(0.5, 1);
        FontMetricsFactory boldFontMetricsFactory = Type0FontMetricsFactory.bold(document);

        for (OcrResult result : ocrResults) {
            FontStyleDetectionModel fontStyleDetectionModel = FontStyleDetectionModel.fromOcrResult(result, fontMetricsFactory, settings);
            try {
                List<Cluster<TextPositionAndWordImage>> clusters = clusterer.cluster(fontStyleDetectionModel.getTextPositionsAndWordImages());
                Optional<Cluster<TextPositionAndWordImage>> largestCluster = clusters.stream().max(Comparator.comparingInt(cluster -> cluster.getPoints().size()));

                // without any cluster there is no reference for regular text, so the font styles stay untouched
                if (largestCluster.isPresent()) {
                    List<TextPositionAndWordImage> wordsWithMostCommonTextHeight = largestCluster.get().getPoints();

                    double standardTextHeight = calculateStandardTextheight(wordsWithMostCommonTextHeight);
                    double regularStrokeWidth = calculateRegularStrokeWidth(wordsWithMostCommonTextHeight);

                    for (TextPositionAndWordImage textPositionsAndWordImage : fontStyleDetectionModel.getTextPositionsAndWordImages()) {
                        decideOnFontStyle(textPositionsAndWordImage, regularStrokeWidth, standardTextHeight, boldFontMetricsFactory);
                    }
                }

                insertResultIntoMap(result.image().getPageNumber(), ocrResultToWritePerPage, fontStyleDetectionModel);
            } finally {
                // always release the model, also when no cluster was found or the detection failed
                fontStyleDetectionModel.dispose();
            }
        }

        log.info("Finished bold detection");
        return ocrResultToWritePerPage;
    }


    /**
     * Averages the finite text heights of the given words.
     *
     * @throws java.util.NoSuchElementException if no word has a finite text height
     */
    private static double calculateStandardTextheight(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {

        return wordsWithMostCommonTextHeight.stream()
                .map(TextPositionAndWordImage::getWordImage)
                .mapToDouble(WordImage::getTextHeight)
                .filter(Double::isFinite)
                .average()
                .orElseThrow();
    }


    /**
     * Averages the finite stroke widths calculated for the images of the given words.
     *
     * @throws java.util.NoSuchElementException if no word has a finite stroke width
     */
    private double calculateRegularStrokeWidth(List<TextPositionAndWordImage> wordsWithMostCommonTextHeight) {

        return wordsWithMostCommonTextHeight.stream()
                .mapToDouble(textPositionAndWordImage -> strokeWidthCalculator.calculate(textPositionAndWordImage.getWordImage().getImage()))
                .filter(Double::isFinite)
                .average()
                .orElseThrow();
    }


    /**
     * Converts the model into a result to write and appends it to the results of its page. The lists stored in the map are unmodifiable.
     */
    private static void insertResultIntoMap(int pageNumber, Map<Integer, List<OcrResultToWrite>> ocrResultToWritePerPage, FontStyleDetectionModel fontStyleDetectionModel) {

        OcrResultToWrite ocrResult = OcrResultToWrite.fromFontStyleDetectionModel(fontStyleDetectionModel);

        ocrResultToWritePerPage.compute(pageNumber, (key, existingList) -> {
            if (existingList == null) {
                return List.of(ocrResult);
            } else {
                return Stream.concat(existingList.stream(), Stream.of(ocrResult)).toList();
            }
        });
    }


    /**
     * Marks the word as bold and switches it to the bold font if its image has a larger stroke width than the standard stroke width scaled to its text height.
     * Otherwise, the word is marked as regular.
     */
    private void decideOnFontStyle(TextPositionAndWordImage textPositionsAndWordImage,
                                   double standardStrokeWidth,
                                   double standardTextHeight,
                                   FontMetricsFactory boldFontMetricsFactory) {

        double scaledStrokeWidth = scaleStrokeWidthByFontSize(textPositionsAndWordImage, standardStrokeWidth, standardTextHeight);

        if (textPositionsAndWordImage.getWordImage().hasLargerStrokeWidth(scaledStrokeWidth)) {
            textPositionsAndWordImage.getTextPositionInImage().setFontMetricsFactory(boldFontMetricsFactory);
            textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.BOLD);
        } else {
            textPositionsAndWordImage.getTextPositionInImage().setFontStyle(FontStyle.REGULAR);
        }
    }


    /**
     * Scales the standard stroke width with the square root of the ratio between the text height of the word and the standard text height.
     */
    private static double scaleStrokeWidthByFontSize(TextPositionAndWordImage textPositionsAndWordImage, double standardStrokeWidth, double standardFontSize) {

        double influenceOfFontSize = 1.0; // the paper states that stroke width scales exactly linearly with font size. This did not seem to be true for me. Maybe some of the preprocessing steps are affecting this.
        double fontsizeScalingFactor = Math.sqrt(textPositionsAndWordImage.getWordImage().getTextHeight() / standardFontSize);
        return standardStrokeWidth + (influenceOfFontSize * (fontsizeScalingFactor - 1) * standardStrokeWidth);
    }

}

View File

@ -0,0 +1,57 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.Sel;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * This code is a good start for detecting italic text, although it has a few issues, especially with glyphs which are naturally slanted, e.g. z, 2, 7, /
 * If we want this, we should probably exclude these glyphs, which might reduce the false positives. But in its current state I don't recommend using it.
 * <p>
 * The detection compares how much of an image survives an erosion with a slanted kernel to how much of its
 * mirrored version survives the same erosion.
 */
@NoArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ItalicDetector {

    static String italicKernel = "ooxxooxxooxxoxxooXxooxxoxxooxxooxxoo";
    Sel italicSel = Leptonica1.selCreateFromString(italicKernel, 9, 4, "italicKernel");
    Sel brickSel = Leptonica1.selCreateBrick(3, 4, 1, 2, 1);


    /**
     * Decides whether the text on the given image is italic.
     *
     * @param pix the image to check; it is neither modified nor disposed
     * @return true, if the pixel density of the mirrored and processed image is less than 85% of the pixel density of the processed image
     */
    public boolean isItalic(Pix pix) {

        Pix processedOriginal = preprocess(pix);

        Pix mirrored = Leptonica1.pixFlipLR(null, pix);
        Pix processedMirrored = preprocess(mirrored);
        // flip the processed mirror image back in place
        Leptonica1.pixFlipLR(processedMirrored, processedMirrored);

        double originalDensity = ImageProcessingUtils.calculatePixelDensity(processedOriginal);
        double mirroredDensity = ImageProcessingUtils.calculatePixelDensity(processedMirrored);

        LeptUtils.disposePix(processedOriginal);
        LeptUtils.disposePix(mirrored);
        LeptUtils.disposePix(processedMirrored);

        double densityRatio = mirroredDensity / originalDensity;
        return densityRatio < 0.85;
    }


    // erodes the image with the italic kernel and dilates the remainder with the brick kernel, the caller has to dispose the result
    private Pix preprocess(Pix pix) {

        Pix erodedByItalicKernel = Leptonica1.pixErode(null, pix, italicSel.getPointer());
        Pix dilatedByBrick = Leptonica1.pixDilate(null, erodedByItalicKernel, brickSel.getPointer());
        LeptUtils.disposePix(erodedByItalicKernel);
        return dilatedByBrick;
    }


    /**
     * Releases both kernels. The detector must not be used afterwards.
     */
    public void dispose() {

        LeptUtils.dispose(italicSel);
        LeptUtils.dispose(brickSel);
    }

}

View File

@ -0,0 +1,58 @@
package com.knecon.fforesight.service.ocr.processor.service.scriptdetection;
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
import java.nio.IntBuffer;
import org.springframework.stereotype.Service;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.Sela;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Estimates the stroke width of the text on an image.
 */
@Service
@NoArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class StrokeWidthCalculator {

    // created on first use, see calculate
    Sela thinningSel;


    /**
     * Uses a series of sels to thin all connected lines to a single pixel. Then the ratio of the pixel count before and after thinning is a good estimation of the stroke width in pixels.
     * <a href="http://www.leptonica.org/papers/conn.pdf">Leptonica Documentation on thinning</a>
     * Since the baseline is a strokewidth of exactly one, we need to add 1 to the result.
     *
     * @param input binarized pix with text on it
     * @return estimated stroke width in pixels
     */
    public double calculate(Pix input) {

        if (thinningSel == null) {
            thinningSel = Leptonica1.selaMakeThinSets(1, 0);
        }

        Pix skeleton = Leptonica1.pixThinConnectedBySet(input, L_THIN_FG, thinningSel, 0);

        IntBuffer skeletonPixels = IntBuffer.allocate(1);
        Leptonica1.pixCountPixels(skeleton, skeletonPixels, null);

        IntBuffer originalPixels = IntBuffer.allocate(1);
        Leptonica1.pixCountPixels(input, originalPixels, null);

        LeptUtils.disposePix(skeleton);

        double pixelRatio = (double) originalPixels.get() / skeletonPixels.get();
        return pixelRatio + 1;
    }

}

View File

@ -0,0 +1,65 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.tess4j.TessAPI1;
/*
This just moves the elements from the GhostScriptOutputListener into the ImageProcessing queue asynchronously.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class BlockingQueueFiller extends Thread {

    final BlockingQueue<RenderedPageImageFile> imageInputQueue;
    final BlockingQueue<UnprocessedImage> imageOutputQueue;

    // set from another thread, so it has to be volatile to be reliably seen by the loop in run()
    @Setter
    volatile boolean allImagesQueued;


    /**
     * Moves images from the input queue to the output queue until all images are flagged as queued or the
     * thread is interrupted, then moves whatever is left in the input queue and stops.
     */
    @SneakyThrows
    @Override
    public void run() {

        // Interrupting signals that the image extraction has finished
        try {
            while (!allImagesQueued) {
                final UnprocessedImage image = imageInputQueue.take();
                try {
                    imageOutputQueue.put(image);
                } catch (InterruptedException e) {
                    // the image has already been taken from the input queue, so retry in order not to lose it
                    imageOutputQueue.put(image);
                }
            }
        } catch (InterruptedException e) {
            log.info("All images extracted, emptying processing queue and stopping");
        }

        // empty the queue, poll returns null as soon as nothing is left
        UnprocessedImage remaining;
        while ((remaining = imageInputQueue.poll()) != null) {
            imageOutputQueue.put(remaining);
        }
        log.debug("No images left in queue, stopping.");
    }

}

View File

@ -0,0 +1,121 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GhostScriptOutputHandler extends Thread {
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
final InputStream is;
final String processName;
final Type type;
final Map<Integer, RenderedPageImageFile> pagesToProcess;
final BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput;
int currentPageNumber;
public static GhostScriptOutputHandler errorHandler(InputStream is) {
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null);
}
public static GhostScriptOutputHandler stdOut(InputStream is,
Map<Integer, RenderedPageImageFile> pagesToProcess,
BlockingQueue<RenderedPageImageFile> renderedPageImageFileOutput) {
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, renderedPageImageFileOutput);
}
@SneakyThrows
public void run() {
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
while (true) {
line = br.readLine();
if (line == null) {
break;
}
if (type.equals(Type.ERROR)) {
log.error(processName + "_" + type.name() + ">" + line);
} else {
log.debug(processName + "_" + type.name() + ">" + line);
addProcessedImageToQueue(line);
}
}
}
is.close();
if (type.equals(Type.STD_OUT)) {
queueFinishedPage(currentPageNumber);
}
}
private void addProcessedImageToQueue(String line) {
/*
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
*/
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
if (pageNumberMatcher.find()) {
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
if (currentPageNumber == 0) {
currentPageNumber = pageNumber;
return;
}
queueFinishedPage(currentPageNumber);
currentPageNumber = pageNumber;
}
}
private void queueFinishedPage(int pageNumber) {
var imageFile = this.pagesToProcess.get(pageNumber);
if (imageFile == null) {
throw new IllegalArgumentException(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
}
renderedPageImageFileOutput.add(imageFile);
}
public enum Type {
ERROR,
STD_OUT
}
}

View File

@ -0,0 +1,110 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import java.io.File;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.ImageStreamEngine;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
/**
 * Extracts the images of a set of pages and hands them to the image processing queue. Pages holding a
 * (nearly) full page image or images aligning with each other are not queued image by image, but are
 * collected in the list of stitched page numbers instead.
 */
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageExtractionThread extends Thread {

    static double FULL_PAGE_IMAGE_THRESHOLD = 0.99;
    static double IMAGE_ALIGNMENT_THRESHOLD = 1;

    int id;
    @Getter
    List<Integer> pageIndices;
    File documentFile;
    OcrProgressLogger logger;
    Statistics stats;
    OcrServiceSettings settings;

    // output is written to these lists
    BlockingQueue<UnprocessedImage> imageProcessingQueue;
    List<Integer> stitchedPageNumbers;


    /**
     * Processes all assigned pages one after another, the given page indices are one-based.
     */
    @SneakyThrows
    @Override
    public void run() {

        for (Integer pageIndex : pageIndices) {
            try (PDDocument document = Loader.loadPDF(documentFile)) { // load new PDDocument for thread safety, also keeps RAM usage low.
                long extractionStart = System.currentTimeMillis();
                List<ExtractedImage> extractedImages = getExtractedImages(pageIndex, document);
                stats.increaseImageExtraction(id, System.currentTimeMillis() - extractionStart);

                if (extractedImages.isEmpty()) {
                    logger.logPageSkipped(pageIndex);
                }

                boolean isFullPageOrStitched = checkForFullPageOrStitchedImages(extractedImages, document.getPage(pageIndex - 1));
                if (isFullPageOrStitched) {
                    stitchedPageNumbers.add(pageIndex);
                    logger.addImagesToProcess(pageIndex, 0);
                } else {
                    for (ExtractedImage extractedImage : extractedImages) {
                        imageProcessingQueue.put(extractedImage);
                        logger.addImagesToProcess(extractedImage.pageNumber(), extractedImage.numberOnPage());
                    }
                }
            }
        }
    }


    // runs a fresh image stream engine on the page with the given one-based index and returns the images it found
    private List<ExtractedImage> getExtractedImages(Integer pageIndex, PDDocument document) {

        ImageStreamEngine imageStreamEngine = new ImageStreamEngine(settings);
        imageStreamEngine.processPage(pageIndex, document.getPage(pageIndex - 1));
        return imageStreamEngine.getImagesOnCurrentPage();
    }


    /**
     * Checks whether any image exceeds the full page threshold of the crop box in both width and height, or
     * whether any two images on the page align with each other.
     *
     * @param imagesOnCurrentPage the images extracted from the page
     * @param page                the page the images were extracted from
     * @return true, if a full page image or a pair of aligning images exists
     */
    @SneakyThrows
    private boolean checkForFullPageOrStitchedImages(List<ExtractedImage> imagesOnCurrentPage, PDPage page) {

        if (imagesOnCurrentPage.isEmpty()) {
            return false;
        }

        boolean containsFullPageImage = imagesOnCurrentPage.stream()
                .anyMatch(imageOnPage -> imageOnPage.width() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getWidth()
                                         && imageOnPage.height() > FULL_PAGE_IMAGE_THRESHOLD * page.getCropBox().getHeight());
        if (containsFullPageImage) {
            return true;
        }

        //checking for intersections or direct alignment of images
        int imageCount = imagesOnCurrentPage.size();
        for (int first = 0; first < imageCount; first++) {
            for (int second = first + 1; second < imageCount; second++) {
                var firstCoordinates = imagesOnCurrentPage.get(first).getImageCoordinatesInInitialUserSpace();
                var secondCoordinates = imagesOnCurrentPage.get(second).getImageCoordinatesInInitialUserSpace();
                if (firstCoordinates.aligns(secondCoordinates, IMAGE_ALIGNMENT_THRESHOLD)) {
                    // TODO: see if we can stitch aligning images using BufferedImage and skip the gs conversion entirely
                    return true;
                }
            }
        }
        return false;
    }

}

View File

@ -0,0 +1,251 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import org.apache.pdfbox.pdmodel.PDDocument;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageImageFile;
import com.knecon.fforesight.service.ocr.processor.model.RenderedPageOcrImage;
import com.knecon.fforesight.service.ocr.processor.model.UnprocessedImage;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.utils.ImageProcessingUtils;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.TessAPI1;
/**
 * This thread does all the image processing. There should only be one, since Leptonica is not thread safe.
 *
 * <p>Images are taken from {@code imageInputQueue}, converted into binarized, de-rotated {@link OcrImage}s and put
 * into {@code imageOutputQueue}. The thread keeps waiting for input until {@code allImagesExtracted} is set or it is
 * interrupted; afterwards it processes whatever is still queued and releases its native Tesseract/Leptonica
 * resources.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageProcessingThread extends Thread {

    final BlockingQueue<UnprocessedImage> imageInputQueue;
    final BlockingQueue<OcrImage> imageOutputQueue;

    // Tesseract handle initialised with the "osd" data, only used for orientation detection.
    final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
    final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.0f, 1);

    final Statistics stats;
    final OcrServiceSettings settings;
    final PDDocument document;

    // Written by another thread through the generated setter and read in the loop of run(),
    // therefore volatile so that the update is guaranteed to become visible to this thread.
    @Setter
    volatile boolean allImagesExtracted;


    /**
     * Processes images until extraction has finished, drains the remaining queue and finally frees the native
     * resources. The clean-up runs in a {@code finally} block so that it also happens when processing fails.
     */
    @SneakyThrows
    @Override
    public void run() {

        try {
            processUntilExtractionFinished();
            drainInputQueue();
        } finally {
            releaseNativeResources();
        }
    }


    /**
     * Blocks on the input queue and processes images until {@code allImagesExtracted} is set or the thread is
     * interrupted while waiting.
     */
    private void processUntilExtractionFinished() {

        try {
            while (!allImagesExtracted) {
                final UnprocessedImage image = imageInputQueue.take();
                var ocrImage = this.process(image);
                try {
                    imageOutputQueue.put(ocrImage);
                } catch (InterruptedException e) {
                    // The image is already processed: retry once so it is not dropped. The exception cleared the
                    // interrupt status, so this second put is not interrupted immediately.
                    imageOutputQueue.put(ocrImage);
                }
            }
        } catch (InterruptedException e) {
            // The interrupt is used as the "extraction finished" signal, so it is deliberately not re-asserted:
            // the subsequent draining still needs to put images into the output queue.
            log.info("All images extracted, emptying processing queue and stopping");
        }
    }


    /**
     * Processes every image that is still queued, without waiting for new ones.
     *
     * @throws InterruptedException if interrupted while putting a result into the output queue
     */
    private void drainInputQueue() throws InterruptedException {

        UnprocessedImage image;
        // poll() returns null on an empty queue; this avoids using NoSuchElementException for control flow and
        // therefore cannot swallow such an exception coming out of process().
        while ((image = imageInputQueue.poll()) != null) {
            imageOutputQueue.put(this.process(image));
        }
        log.debug("No images left in processing queue, stopping.");
    }


    /**
     * Ends and deletes the orientation detection handle and disposes the Gaussian kernel.
     */
    private void releaseNativeResources() {

        TessAPI1.TessBaseAPIEnd(this.detectionScriptHandle);
        TessAPI1.TessBaseAPIDelete(this.detectionScriptHandle);
        LeptUtils.dispose(gaussianKernel);
    }


    /**
     * Dispatches to the matching processing routine and records the elapsed time in the statistics.
     *
     * @param unprocessedImage either an {@link ExtractedImage} or a {@link RenderedPageImageFile}
     * @return the processed image
     * @throws UnsupportedOperationException for any other implementation of {@link UnprocessedImage}
     */
    private OcrImage process(UnprocessedImage unprocessedImage) {

        long timestamp = System.currentTimeMillis();
        OcrImage ocrImage;
        if (unprocessedImage instanceof ExtractedImage extractedImage) {
            ocrImage = processExtractedImage(extractedImage);
        } else if (unprocessedImage instanceof RenderedPageImageFile renderedPageImageFile) {
            ocrImage = processRenderedPageImageFile(renderedPageImageFile);
        } else {
            throw new UnsupportedOperationException(String.format("Class %s is not supported!", unprocessedImage.getClass()));
        }
        stats.increaseImageProcessing(System.currentTimeMillis() - timestamp);
        return ocrImage;
    }


    /**
     * Binarizes and de-rotates a rendered page image. The image is treated as already having the target DPI.
     */
    @SuppressWarnings("PMD.CompareObjectsWithEquals")
    private OcrImage processRenderedPageImageFile(RenderedPageImageFile renderedPageImageFile) {

        Pix pix = processPix(renderedPageImageFile.asPix(), settings.getDpi(), settings.getDpi());
        int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
        Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);

        OcrImage ocrImage = new RenderedPageOcrImage(pix.h,
                                                     pix.w,
                                                     PageInformation.fromPDPage(renderedPageImageFile.pageNumber(), document.getPage(renderedPageImageFile.pageNumber() - 1)),
                                                     rotatedPix,
                                                     orientDegree);

        // deRotatePix returns the very same instance when no rotation was applied; only dispose a distinct original.
        if (pix != rotatedPix) {
            LeptUtils.disposePix(pix);
        }
        return ocrImage;
    }


    /**
     * Binarizes and de-rotates an image extracted from a page. The image DPI is derived from the pixel width and
     * the horizontal scaling factor of the image's transformation matrix (72 units per inch).
     */
    @SuppressWarnings("PMD.CompareObjectsWithEquals")
    private OcrImage processExtractedImage(ExtractedImage extractedImage) {

        float imageDPI = Math.abs(extractedImage.image().getWidth() / (extractedImage.ctm().getScalingFactorX() / 72));

        Pix pix = processPix(extractedImage.asPix(), imageDPI, settings.getDpi());
        int orientDegree = detectOrientation(pix, settings.getDpi(), detectionScriptHandle);
        Pix rotatedPix = ImageProcessingUtils.deRotatePix(orientDegree, pix);

        OcrImage ocrImage = new ExtractedOcrImage(extractedImage.pageNumber(),
                                                  extractedImage.numberOnPage(),
                                                  extractedImage.height(),
                                                  extractedImage.width(),
                                                  extractedImage.ctm(),
                                                  rotatedPix,
                                                  pix.h,
                                                  pix.w,
                                                  orientDegree);

        // deRotatePix returns the very same instance when no rotation was applied; only dispose a distinct original.
        if (pix != rotatedPix) {
            LeptUtils.disposePix(pix);
        }
        return ocrImage;
    }


    /**
     * Asks Tesseract for the orientation of the given image.
     *
     * @param pix                   the image to inspect
     * @param dpi                   resolution reported to Tesseract for the image
     * @param detectionScriptHandle the Tesseract handle to use for the detection
     * @return the detected orientation in degrees, or 0 if the detection failed or its confidence does not exceed
     * {@code settings.getMinRotationConfidence()}
     */
    public int detectOrientation(Pix pix, int dpi, ITessAPI.TessBaseAPI detectionScriptHandle) {

        TessAPI1.TessBaseAPISetImage2(detectionScriptHandle, pix);
        TessAPI1.TessBaseAPISetSourceResolution(detectionScriptHandle, dpi);

        IntBuffer orientationDegreeResultBuffer = IntBuffer.allocate(1);
        FloatBuffer orientationDegreeConfidenceBuffer = FloatBuffer.allocate(1);
        // NOTE(review): the original author asked whether the script name memory is being freed - still unconfirmed.
        PointerByReference scriptureNameBuffer = new PointerByReference();
        FloatBuffer scriptureConfidenceBuffer = FloatBuffer.allocate(1);

        int orientationDegree = 0;
        int result = TessAPI1.TessBaseAPIDetectOrientationScript(detectionScriptHandle,
                                                                 orientationDegreeResultBuffer,
                                                                 orientationDegreeConfidenceBuffer,
                                                                 scriptureNameBuffer,
                                                                 scriptureConfidenceBuffer);
        if (result == TRUE && orientationDegreeConfidenceBuffer.get() > settings.getMinRotationConfidence()) {
            orientationDegree = orientationDegreeResultBuffer.get();
        }

        TessAPI1.TessBaseAPIClear(detectionScriptHandle);

        return orientationDegree;
    }


    /**
     * Converts the image to 8 bit grayscale, scales it up towards the target DPI, smooths it and binarizes it.
     * This method takes ownership of {@code pix}: it and all intermediate images are disposed here.
     *
     * @param pix       the source image with a depth of 1, 8 or 32 bits per pixel
     * @param imageDpi  the resolution of the source image
     * @param targetDpi the resolution the result should approximately have
     * @return the binarized image
     * @throws UnsupportedOperationException if the image has any other depth
     */
    @SneakyThrows
    private Pix processPix(Pix pix, float imageDpi, int targetDpi) {

        // Remember the properties of the source image up front: pix may be disposed during the grayscale
        // conversion and must not be read afterwards.
        final int sourceDepth = pix.d;
        final int sourceWidth = pix.w;
        final int sourceHeight = pix.h;

        Pix grayScale;
        Pix scaledUp;
        Pix gaussian;
        Pix binarized;

        //convert to grayscale
        if (sourceDepth == 8) {
            grayScale = pix;
        } else if (sourceDepth == 32) {
            grayScale = Leptonica1.pixConvertRGBToGrayFast(pix);
            LeptUtils.disposePix(pix);
        } else if (sourceDepth == 1) {
            grayScale = Leptonica1.pixConvert1To8(null, pix, (byte) 0, (byte) 255);
            LeptUtils.disposePix(pix);
        } else {
            // this method owns pix, so release it before giving up
            LeptUtils.disposePix(pix);
            throw new UnsupportedOperationException(String.format("Unknown pix format with bpp of %d", sourceDepth));
        }

        // scale up
        float targetFactor = targetDpi / imageDpi;
        if (targetFactor > 2.1) {
            scaledUp = Leptonica1.pixScaleGray4xLI(grayScale);
            LeptUtils.disposePix(grayScale);
        } else if (targetFactor > 1.1) {
            scaledUp = Leptonica1.pixScaleGray2xLI(grayScale);
            LeptUtils.disposePix(grayScale);
        } else {
            scaledUp = grayScale;
        }

        // remove noise and prep for Otsu
        gaussian = Leptonica1.pixConvolve(scaledUp, gaussianKernel, 8, 1);
        LeptUtils.disposePix(scaledUp);

        // Threshold to binary; small source images are thresholded directly with a fixed value
        if (sourceWidth < 100 || sourceHeight < 100) {
            binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
        } else {
            binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null);
            if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
                binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
            }
        }

        LeptUtils.disposePix(gaussian);

        return binarized;
    }


    /**
     * Creates a Tesseract handle initialised with the "osd" data found under the {@code TESSDATA_PREFIX}
     * environment variable and redirects its debug output to {@code /dev/null}.
     */
    private static ITessAPI.TessBaseAPI initDetectionScriptHandle() {

        ITessAPI.TessBaseAPI handle = TessAPI1.TessBaseAPICreate();
        String datapath = System.getenv("TESSDATA_PREFIX");
        TessAPI1.TessBaseAPIInit3(handle, datapath, "osd");
        TessAPI1.TessBaseAPISetVariable(handle, "debug_file", "/dev/null");
        return handle;
    }

}

View File

@ -0,0 +1,136 @@
package com.knecon.fforesight.service.ocr.processor.service.threads;
import static net.sourceforge.tess4j.ITessAPI.TRUE;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPICreate;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPIInit1;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetPageSegMode;
import static net.sourceforge.tess4j.TessAPI1.TessBaseAPISetVariable;
import java.io.File;
import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.file.Path;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.OcrImage;
import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
import com.knecon.fforesight.service.ocr.processor.service.OcrProgressLogger;
import com.knecon.fforesight.service.ocr.processor.service.Statistics;
import com.knecon.fforesight.service.ocr.processor.utils.Tesseract2;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.PointerByReference;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
import net.sourceforge.tess4j.ITessAPI;
import net.sourceforge.tess4j.ITesseract;
import net.sourceforge.tess4j.TessAPI1;
/**
 * Worker thread that runs Tesseract on the images of {@code imageInputQueue}.
 *
 * <p>For every image the used pix is written as a TIFF file, an hOCR file is rendered into
 * {@code tesseractOutputDir}, and an {@link OcrResult} is appended to {@code results}. The thread waits for new
 * images until it is interrupted and then processes whatever is still queued.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OCRThread extends Thread {

    int id;
    BlockingQueue<OcrImage> imageInputQueue;
    Path tesseractOutputDir;
    List<OcrResult> results;
    OcrProgressLogger logger;
    Statistics stats;
    OcrServiceSettings settings;
    Tesseract2 instance;


    /**
     * Creates the thread together with its own Tesseract instance configured from {@code settings}.
     */
    public OCRThread(int id,
                     BlockingQueue<OcrImage> imageInputQueue,
                     Path tesseractOutputDir,
                     List<OcrResult> results,
                     OcrProgressLogger logger,
                     Statistics stats,
                     OcrServiceSettings settings) {

        this.id = id;
        this.imageInputQueue = imageInputQueue;
        this.tesseractOutputDir = tesseractOutputDir;
        this.results = results;
        this.logger = logger;
        this.stats = stats;
        this.settings = settings;
        this.instance = createInstance(settings);
    }


    /**
     * Processes images until interrupted, then empties the queue without blocking.
     */
    @SneakyThrows
    @Override
    public void run() {

        // Interrupting signals that the image extraction has finished
        while (!isInterrupted()) {
            try {
                final OcrImage image = imageInputQueue.take();
                this.process(image);
            } catch (InterruptedException e) {
                // set isInterrupted to true (This exception may only happen during active waiting for queue, and then isInterrupted will not be set!)
                interrupt();
            }
        }

        // empty the queue; poll() returns null once nothing is left, so no exception is needed for control flow
        // and an exception raised while processing an image can no longer be mistaken for "queue is empty"
        OcrImage remaining;
        while ((remaining = imageInputQueue.poll()) != null) {
            this.process(remaining);
        }
        log.debug("Executed tesseract on all Images, finishing.");
    }


    /**
     * Runs Tesseract on a single image, releases its pix and records the result and timing.
     * The page segmentation mode is the image's own optimum unless {@code settings.getPsmOverride()} is
     * non-negative.
     */
    private void process(OcrImage image) {

        long timestamp = System.currentTimeMillis();

        String tmpOutputFileName = String.format("output_%04d_%04d", image.getPageNumber(), image.getNumberOnPage());

        String tesseractOutputFileName = tesseractOutputDir.resolve(tmpOutputFileName).toFile().toString();

        int psm = settings.getPsmOverride() < 0 ? image.getOptimalPageSegmentationMode() : settings.getPsmOverride();

        try {
            executeTesseract(psm, image.getDpi(), image.getPix(), tesseractOutputFileName);
        } finally {
            // release the pix even when the OCR run fails
            image.destroyPix();
        }

        results.add(OcrResult.create(image, tesseractOutputFileName));
        logger.logImageFinished(image, psm);
        stats.increaseTesseractDuration(id, System.currentTimeMillis() - timestamp);
    }


    /**
     * Writes the given pix next to the OCR output and renders the hOCR document for it.
     *
     * @param psm                     page segmentation mode to use
     * @param dpi                     resolution passed to Tesseract as {@code user_defined_dpi}
     * @param pix                     the image to recognise
     * @param tesseractOutputFileName output base name; ".tiff" is appended for the image file
     */
    @SneakyThrows
    public void executeTesseract(int psm, int dpi, Pix pix, String tesseractOutputFileName) {

        // write the used image for later bold detection
        // NOTE(review): format 5 is presumably Leptonica's PackBits-compressed TIFF - confirm against the IFF_* constants
        Leptonica1.pixWrite(tesseractOutputFileName + ".tiff", pix, 5);

        instance.setVariable("user_defined_dpi", String.valueOf(dpi));
        instance.setPageSegMode(psm);
        instance.createDocumentsWithResults(pix, null, tesseractOutputFileName, List.of(ITesseract.RenderedFormat.HOCR), ITessAPI.TessPageIteratorLevel.RIL_BLOCK);
    }


    /**
     * Builds a Tesseract instance using engine mode 1 and the languages from the settings.
     */
    private static Tesseract2 createInstance(OcrServiceSettings settings) {

        Tesseract2 instance = new Tesseract2();
        instance.setVariable("debug_file", "/dev/null"); // remove warnings from std out
        instance.setOcrEngineMode(1); // set to LSTM based Engine
        instance.setLanguage(settings.getLanguages());
        return instance;
    }

}

View File

@ -0,0 +1,28 @@
package com.knecon.fforesight.service.ocr.processor.settings;
import org.apache.pdfbox.cos.COSName;
import org.springframework.boot.context.properties.ConfigurationProperties;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Settings of the OCR service, bound from configuration properties with the prefix {@code ocr-service}.
 * The values assigned below are the defaults that apply when a property is not configured.
 */
@Data
@ConfigurationProperties("ocr-service")
@FieldDefaults(level = AccessLevel.PRIVATE)
public class OcrServiceSettings {
int ocrThreadCount = 4; // Number of OCR threads
int imageExtractThreadCount = 2; // Number of image extraction threads
int gsProcessCount = 1; // Number of Ghostscript processes
int dpi = 300; // Target DPI for binarized images
int psmOverride = -1; // Overrides the page segmentation mode if >= 0; a negative value keeps the optimal mode of each image
int minImageHeight = 20; // Minimum height for images to be processed
int minImageWidth = 20; // Minimum width for images to be processed
float minRotationConfidence = 2; // Lower bound the orientation detection confidence has to exceed before a detected rotation is applied
boolean debug; // If true, overlays OCR images with a grid and draws word bounding boxes
String languages = "deu+eng"; // Defines languages loaded into Tesseract as 3-char codes, additional languages must also be installed in the docker environment
COSName ocrMarkedContentTag = COSName.getPDFName("KNECON_OCR"); // PDF name used as marked content tag for OCR output
boolean boldDetection = true; // if true, bold detection will be attempted
double boldThreshold = 0.5; // Words are opened with a brick of average stroke width, if the ratio of remaining pixels is higher the word is determined bold.
}

View File

@ -0,0 +1,85 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.AlphaComposite;
import java.awt.Color;
import java.awt.Graphics;
import java.awt.Graphics2D;
import java.awt.Transparency;
import java.awt.image.BufferedImage;
import java.nio.IntBuffer;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import com.knecon.fforesight.service.ocr.processor.model.ExtractedImage;
import com.sun.jna.ptr.PointerByReference;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Static helpers for preparing images for OCR.
 */
@UtilityClass
public class ImageProcessingUtils {

    /**
     * Returns the image of the given extracted image in a device colour space.
     *
     * @param extractedImage the image together with its PDF colour space
     * @return the original image if its colour space is DeviceRGB or DeviceGray, otherwise a new 8 bit grayscale
     * image of the same size onto which the original has been drawn
     */
    public BufferedImage convertToDeviceColorSpace(ExtractedImage extractedImage) {

        var colorSpace = extractedImage.colorSpace();
        boolean isDeviceColorSpace = colorSpace instanceof PDDeviceRGB || colorSpace instanceof PDDeviceGray;

        BufferedImage original = extractedImage.image();
        if (isDeviceColorSpace) {
            return original;
        }

        BufferedImage grayCopy = new BufferedImage(original.getWidth(), original.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
        Graphics canvas = grayCopy.getGraphics();
        canvas.drawImage(original, 0, 0, null);
        canvas.dispose();
        return grayCopy;
    }


    /**
     * Rotates the pix back by the detected orientation.
     *
     * @param orientDegree detected orientation in degrees
     * @param pix          the image to rotate
     * @return a pix rotated by {@code 360 - orientDegree} degrees for orientations of 90, 180 and 270; for any
     * other value the given {@code pix} instance itself
     */
    public Pix deRotatePix(int orientDegree, Pix pix) {

        // number of quarter turns equals (360 - orientDegree) / 90
        switch (orientDegree) {
            case 270:
                return Leptonica1.pixRotateOrth(pix, 1);
            case 180:
                return Leptonica1.pixRotateOrth(pix, 2);
            case 90:
                return Leptonica1.pixRotateOrth(pix, 3);
            default:
                return pix;
        }
    }


    /**
     * Paints white behind the existing content of a translucent image; other images are left untouched.
     *
     * @param image the image to modify in place
     */
    public static void setAlphaChannelToWhite(BufferedImage image) {

        // NOTE: For BITMASK images, the color model is likely IndexColorModel,
        // and this model will contain the "real" color of the transparent parts
        // which is likely a better fit than unconditionally setting it to white.
        if (image.getTransparency() != Transparency.TRANSLUCENT) {
            return;
        }

        // Fill background with white
        Graphics2D g2d = image.createGraphics();
        try {
            g2d.setComposite(AlphaComposite.DstOver); // Set composite rules to paint "behind"
            g2d.setPaint(Color.WHITE);
            g2d.fillRect(0, 0, image.getWidth(), image.getHeight());
        } finally {
            g2d.dispose();
        }
    }


    /**
     * Computes the ratio of counted pixels to the total number of pixels of the pix.
     *
     * @param pix the image to measure
     * @return the pixel count reported by Leptonica divided by {@code height * width}, or -1 if counting failed
     */
    public static double calculatePixelDensity(Pix pix) {

        IntBuffer counted = IntBuffer.allocate(1);
        int status = Leptonica1.pixCountPixels(pix, counted, null);
        if (status != 0) {
            return -1;
        }
        int totalPixels = pix.h * pix.w;
        return (double) counted.get(0) / totalPixels;
    }

}

View File

@ -0,0 +1,73 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import lombok.experimental.UtilityClass;
import net.sourceforge.lept4j.L_Kernel;
import net.sourceforge.lept4j.Leptonica1;
/**
 * Factory methods for Laplacian convolution kernels.
 */
@UtilityClass
public class KernelUtils {

    /*
       -1, -1, -1
       -1,  8, -1
       -1, -1, -1
     */
    public L_Kernel createFullLaplacianKernel() {

        // in a 3x3 kernel every cell is at most two steps away from the centre
        return createLaplacian(3, 2, 8);
    }


    /*
        0,  0, -1,  0,  0
        0, -1, -1, -1,  0
       -1, -1, 12, -1, -1
        0, -1, -1, -1,  0
        0,  0, -1,  0,  0
     */
    public L_Kernel createLaplacianKernel5x5() {

        return createLaplacian(5, 2, 12);
    }


    /*
        0, -1,  0
       -1,  4, -1
        0, -1,  0
     */
    public L_Kernel createLaplacianKernel() {

        return createLaplacian(3, 1, 4);
    }


    /**
     * Creates a square kernel and sets every cell within {@code reach} horizontal plus vertical steps of the
     * centre to -1 (row by row), then the centre itself to {@code centreWeight}. All other cells are not written
     * and keep the value they have after {@code kernelCreate}.
     *
     * @param size         number of rows and columns
     * @param reach        maximum step distance from the centre that receives -1
     * @param centreWeight value of the centre cell
     * @return the new kernel
     */
    private L_Kernel createLaplacian(int size, int reach, float centreWeight) {

        L_Kernel kernel = Leptonica1.kernelCreate(size, size);
        int centre = size / 2;
        for (int row = 0; row < size; row++) {
            for (int col = 0; col < size; col++) {
                int steps = Math.abs(row - centre) + Math.abs(col - centre);
                if (steps == 0 || steps > reach) {
                    continue;
                }
                Leptonica1.kernelSetElement(kernel, row, col, -1);
            }
        }
        Leptonica1.kernelSetElement(kernel, centre, centre, centreWeight);
        return kernel;
    }

}

View File

@ -0,0 +1,64 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.experimental.UtilityClass;
/**
 * Helpers for distributing a list of entries as evenly as possible over a number of threads.
 */
@UtilityClass
public class ListSplittingUtils {

    /**
     * Splits the numbers {@code 1..totalNumberOfEntries} into {@code threadCount} consecutive ranges.
     *
     * @param totalNumberOfEntries the highest number; values below 1 yield only empty ranges
     * @param threadCount          the number of ranges to build
     * @return {@code threadCount} lists whose sizes differ by at most one, the larger ones first
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static List<List<Integer>> buildBalancedContinuousSublist(Integer totalNumberOfEntries, int threadCount) {

        List<Integer> oneBasedNumbers = IntStream.rangeClosed(1, totalNumberOfEntries).boxed().toList();
        return buildBalancedSublist(oneBasedNumbers, threadCount);
    }


    /**
     * Splits the entries into {@code threadCount} consecutive sublists of balanced size.
     *
     * @param entries     the entries to distribute; the order is preserved
     * @param threadCount the number of sublists to build
     * @return {@code threadCount} views ({@link List#subList}) onto {@code entries}, sizes differ by at most one
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static <T> List<List<T>> buildBalancedSublist(List<T> entries, int threadCount) {

        List<Integer> balancedEntryCounts = buildBalancedEntryCounts(entries.size(), threadCount);
        List<List<T>> balancedSublist = new ArrayList<>(balancedEntryCounts.size());
        int startIdx = 0;
        for (int numberOfEntriesPerThread : balancedEntryCounts) {
            balancedSublist.add(entries.subList(startIdx, startIdx + numberOfEntriesPerThread));
            startIdx += numberOfEntriesPerThread;
        }
        return balancedSublist;
    }


    /**
     * Splits the entries per thread first and each thread's share into {@code batchSize} batches afterwards, then
     * returns the result grouped by batch.
     *
     * @param entries     the entries to distribute
     * @param threadCount the number of threads
     * @param batchSize   the number of batches each thread's share is divided into
     * @return a list indexed as batches -> threads -> entries
     * @throws IllegalArgumentException if a count is negative, or zero while there are entries to distribute
     */
    public static <T> List<List<List<T>>> buildBatchedBalancedSublist(List<T> entries, int threadCount, int batchSize) {

        // batches -> threads -> entries
        List<List<List<T>>> batchedBalancedSubList = new LinkedList<>();
        List<List<List<T>>> threadsWithBatches = buildBalancedSublist(entries, threadCount).stream()
                .map(list -> buildBalancedSublist(list, batchSize))
                .toList();
        // swap first two dimensions
        for (int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
            List<List<T>> threadEntriesPerBatch = new ArrayList<>(threadCount);
            for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
                threadEntriesPerBatch.add(threadsWithBatches.get(threadIdx).get(batchIdx));
            }
            batchedBalancedSubList.add(threadEntriesPerBatch);
        }
        return batchedBalancedSubList;
    }


    /**
     * Calculates how many entries each thread receives when entries are dealt out round-robin.
     *
     * @param totalNumberOfEntries the number of entries; negative values are treated as zero
     * @param threadCount          the number of threads
     * @return a mutable list with {@code threadCount} counts; the first {@code total % threadCount} threads get one
     * entry more than the rest
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static List<Integer> buildBalancedEntryCounts(int totalNumberOfEntries, int threadCount) {

        if (threadCount < 0 || (threadCount == 0 && totalNumberOfEntries > 0)) {
            throw new IllegalArgumentException(String.format("Cannot distribute %d entries over %d partitions", totalNumberOfEntries, threadCount));
        }

        List<Integer> numberOfPagesPerThread = new ArrayList<>(threadCount);
        if (threadCount == 0) {
            return numberOfPagesPerThread;
        }

        // Same outcome as dealing the entries out one by one, but without iterating over every entry.
        int total = Math.max(totalNumberOfEntries, 0);
        int base = total / threadCount;
        int remainder = total % threadCount;
        for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
            numberOfPagesPerThread.add(threadIdx < remainder ? base + 1 : base);
        }
        return numberOfPagesPerThread;
    }

}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.geom.AffineTransform;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
import lombok.experimental.UtilityClass;
/**
 * Calculates the resolution an image has when it is placed on a PDF page.
 */
@UtilityClass
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PdfDpiCalculator {

    /**
     * Derives the DPI of an image from its pixel width and the width it occupies after transformation.
     *
     * @param imageBounds the untransformed bounds of the image
     * @param imageCTM    the transformation applied to the bounds
     * @param width       the width of the image in pixels
     * @return {@code width} divided by the transformed width in inches (72 units per inch), rounded up
     */
    public int calculateDpi(QuadPoint imageBounds, AffineTransform imageCTM, double width) {

        QuadPoint placedBounds = imageBounds.getTransformed(imageCTM);
        // distance between the corners a and d of the transformed bounds
        double placedWidth = placedBounds.a().distance(placedBounds.d());
        double inches = placedWidth / 72;
        double exactDpi = width / inches;
        return (int) Math.ceil(exactDpi);
    }

}

View File

@ -0,0 +1,73 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import lombok.SneakyThrows;
/**
 * Debug drawing helpers that write simple shapes onto a PDF page through an {@link ElementWriter}.
 */
public class PdfDraw {

    /**
     * Covers the page with a grid of 15 x 15 unit cells. Cells are stroked with low opacity; the cell at the
     * origin (row 0, column 0) is additionally filled blue so the origin of the coordinate system can be spotted.
     *
     * <p>The colours are created once and closed when drawing is done, instead of allocating new, never closed
     * {@link ColorPt} instances for every cell.
     *
     * @param writer the writer the cells are written to
     * @param page   the page whose width and height determine the number of cells
     */
    @SneakyThrows
    public static void drawGrid(ElementWriter writer, Page page) {

        try (var eb = new ElementBuilder(); var originColor = new ColorPt(0, 0, 1); var gridColor = new ColorPt(0.1, 0.1, 0.1)) {
            double dX = 15;
            double dY = 15;
            int nRows = (int) (page.getPageHeight() / dY) + 1;
            int nCols = (int) (page.getPageWidth() / dX) + 1;
            for (int row = 0; row < nRows; ++row) {
                for (int col = 0; col < nCols; ++col) {
                    Element cell = eb.createRect(col * dX, row * dY, dX, dY);
                    cell.setPathStroke(true);
                    cell.getGState().setLineWidth(1);
                    cell.getGState().setStrokeOpacity(0.1);
                    // NOTE(review): createDeviceRGB() is called per cell; whether the returned object needs closing is not visible here
                    cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    if (row == 0 && col == 0) {
                        cell.getGState().setStrokeColor(originColor);
                        cell.setPathFill(true);
                        cell.getGState().setFillOpacity(0.8);
                        cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                        cell.getGState().setFillColor(originColor);
                    } else {
                        cell.setPathFill(false);
                        cell.getGState().setStrokeColor(gridColor);
                    }
                    writer.writePlacedElement(cell);
                }
            }
        }
    }


    /**
     * Draws every rectangle of the collection in red, stroked with a line width of 5 and filled half transparent.
     *
     * @param writer         the writer the rectangles are written to
     * @param rectCollection the rectangles to draw
     */
    @SneakyThrows
    public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {

        try (var colorPt = new ColorPt(1, 0, 0); var eb = new ElementBuilder()) {
            for (int i = 0; i < rectCollection.getNumRects(); ++i) {
                try (var r = rectCollection.getRectAt(i)) {
                    Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());

                    rect.setPathStroke(true);
                    rect.getGState().setLineWidth(5);
                    rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    rect.getGState().setStrokeColor(colorPt);

                    rect.setPathFill(true);
                    rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                    rect.getGState().setFillColor(colorPt);
                    rect.getGState().setFillOpacity(0.5);

                    writer.writePlacedElement(rect);
                }
            }
        }
    }

}

View File

@ -0,0 +1,141 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.Rectangle;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.List;
import com.sun.jna.Pointer;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.tess4j.OCRResult;
import net.sourceforge.tess4j.TessAPI1;
import net.sourceforge.tess4j.Tesseract1;
import net.sourceforge.tess4j.TesseractException;
import net.sourceforge.tess4j.Word;
/**
 * Overridden version only so Tesseract1 can be used with Pixs instead of BufferedImages. All functions are copied
 * and then the BufferedImage -> Pix conversion deleted.
 */
@Slf4j
public class Tesseract2 extends Tesseract1 {
/**
 * Processes a single pix with the given renderer, framed by begin/end document calls.
 *
 * @return the mean text confidence reported by Tesseract after processing the page
 */
private int createDocuments(Pix pix, String filename, TessResultRenderer renderer) {
String title = TessBaseAPIGetStringVariable(getHandle(), DOCUMENT_TITLE);
TessResultRendererBeginDocument(renderer, title);
// NOTE: the result of processing the page is currently not evaluated, see the disabled check below
int result = TessBaseAPIProcessPage(getHandle(), pix, 0, filename, null, 0, renderer);
TessResultRendererEndDocument(renderer);
// if (result == ITessAPI.FALSE) {
// throw new TesseractException("Error during processing page.");
// }
return TessBaseAPIMeanTextConf(getHandle());
}
/**
 * Convenience overload for a single image.
 *
 * @return the result for the image, or {@code null} if no result was produced (e.g. because processing failed)
 */
public OCRResult createDocumentsWithResults(Pix bi, String filename, String outputbase, List<RenderedFormat> formats, int pageIteratorLevel) throws TesseractException {
List<OCRResult> results = createDocumentsWithResults(new Pix[]{bi}, new String[]{filename}, new String[]{outputbase}, formats, pageIteratorLevel);
if (!results.isEmpty()) {
return results.get(0);
} else {
return null;
}
}
/**
 * Renders the requested output formats for every image and collects the recognition results.
 * Images whose processing throws are logged and skipped, so the returned list may be shorter than the input.
 *
 * @param pixs              the images to process
 * @param filenames         the file name passed to Tesseract for each image
 * @param outputbases       the output base name for each image
 * @param formats           the formats to render
 * @param pageIteratorLevel the level at which recognised words are collected
 * @return one result per successfully processed image
 * @throws RuntimeException if the three arrays differ in length
 */
public List<OCRResult> createDocumentsWithResults(Pix[] pixs, String[] filenames, String[] outputbases, List<RenderedFormat> formats, int pageIteratorLevel) {
if (pixs.length != filenames.length || pixs.length != outputbases.length) {
throw new RuntimeException("The three arrays must match in length.");
}
init();
setVariables();
List<OCRResult> results = new ArrayList<OCRResult>();
try {
for (int i = 0; i < pixs.length; i++) {
try {
TessResultRenderer renderer = createRenderers(outputbases[i], formats);
int meanTextConfidence = createDocuments(pixs[i], filenames[i], renderer);
TessDeleteResultRenderer(renderer);
// words are only collected when Tesseract reports a positive mean confidence
List<Word> words = meanTextConfidence > 0 ? getRecognizedWords(pageIteratorLevel) : new ArrayList<Word>();
results.add(new OCRResult(meanTextConfidence, words));
} catch (Exception e) {
// skip the problematic image file
log.warn(e.getMessage(), e);
}
}
} finally {
dispose();
}
return results;
}
/**
 * Iterates over the recognition result at the given level and collects text, confidence and bounding box of
 * each element. Exceptions are logged; the words collected up to that point are returned.
 */
private List<Word> getRecognizedWords(int pageIteratorLevel) {
List<Word> words = new ArrayList<>();
try {
TessResultIterator ri = TessBaseAPIGetIterator(getHandle());
TessPageIterator pi = TessResultIteratorGetPageIterator(ri);
TessPageIteratorBegin(pi);
do {
Pointer ptr = TessResultIteratorGetUTF8Text(ri, pageIteratorLevel);
if (ptr == null) {
// no text at this position, move on to the next element
continue;
}
String text = ptr.getString(0);
TessAPI1.TessDeleteText(ptr);
float confidence = TessResultIteratorConfidence(ri, pageIteratorLevel);
IntBuffer leftB = IntBuffer.allocate(1);
IntBuffer topB = IntBuffer.allocate(1);
IntBuffer rightB = IntBuffer.allocate(1);
IntBuffer bottomB = IntBuffer.allocate(1);
TessPageIteratorBoundingBox(pi, pageIteratorLevel, leftB, topB, rightB, bottomB);
int left = leftB.get();
int top = topB.get();
int right = rightB.get();
int bottom = bottomB.get();
Word word = new Word(text, confidence, new Rectangle(left, top, right - left, bottom - top));
words.add(word);
} while (TessPageIteratorNext(pi, pageIteratorLevel) == TRUE);
// NOTE: only the result iterator is deleted; deleting the page iterator is intentionally left disabled
// TessPageIteratorDelete(pi);
TessResultIteratorDelete(ri);
} catch (Exception e) {
log.warn(e.getMessage(), e);
}
return words;
}
/**
 * Creates the renderer chain for the requested formats. Only HOCR is handled; other formats are ignored.
 *
 * @return the first renderer of the chain, or {@code null} if no HOCR format was requested
 */
private TessResultRenderer createRenderers(String outputbase, List<RenderedFormat> formats) {
TessResultRenderer renderer = null;
for (RenderedFormat format : formats) {
switch (format) {
case HOCR:
if (renderer == null) {
renderer = TessHOcrRendererCreate(outputbase);
} else {
TessResultRendererInsert(renderer, TessHOcrRendererCreate(outputbase));
}
break;
}
}
return renderer;
}
}

View File

@ -0,0 +1,29 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.ocr.processor.model.FontMetrics;
import com.knecon.fforesight.service.ocr.processor.service.fonts.Type0FontMetricsFactory;
import lombok.SneakyThrows;
import lombok.experimental.SuperBuilder;
/**
 * Smoke test for {@link Type0FontMetricsFactory}: it only verifies that metrics can be calculated without an
 * exception, no values are asserted.
 */
@SuppressWarnings("PMD")
class Type0FontMetricsFactoryTest {

    @Test
    @SneakyThrows
    public void testStringWidth() {

        ClassLoader classLoader = Type0FontMetricsFactoryTest.class.getClassLoader();
        String pdfPath = classLoader.getResource("InvisibleText.pdf").getPath();
        File pdfFile = new File(pdfPath);

        try (PDDocument document = Loader.loadPDF(pdfFile)) {
            Type0FontMetricsFactory metricsFactory = Type0FontMetricsFactory.regular(document);
            FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
        }
    }

}

View File

@ -0,0 +1,36 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import static net.sourceforge.lept4j.ILeptonica.IFF_PNG;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * Manual test for {@link ImageProcessingUtils#deRotatePix}; disabled because it depends on a local image file
 * and a locally installed Leptonica library.
 */
@Disabled
class ImageProcessingUtilsTest {

    /**
     * Points JNA to the native libraries named by the {@code VCPKG_DYNAMIC_LIB} environment variable.
     */
    @BeforeEach
    public void loadLeptonica() {

        String nativeLibraryPath = System.getenv("VCPKG_DYNAMIC_LIB");
        System.setProperty("jna.library.path", nativeLibraryPath);
    }


    /**
     * De-rotates the same source image for each orientation and writes the results as PNG files for inspection.
     */
    @Test
    public void testRotation() {

        Pix source = Leptonica1.pixRead("/home/kschuettler/Downloads/painHarold.webp");

        int[] orientations = {0, 90, 180, 270};
        String[] outputFiles = {"/tmp/0.png", "/tmp/90.png", "/tmp/180.png", "/tmp/270.png"};

        for (int idx = 0; idx < orientations.length; idx++) {
            Pix deRotated = ImageProcessingUtils.deRotatePix(orientations[idx], source);
            Leptonica1.pixWrite(outputFiles[idx], deRotated, IFF_PNG);
        }
    }

}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.Collection;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@link ListSplittingUtils}.
 */
class ListSplittingUtilsTest {

    /**
     * Splitting 48 pages over 18 threads must yield one sublist per thread and must not lose or add pages.
     */
    @Test
    public void testBalancedListSplitting() {

        final int threadCount = 18;
        final int numberOfPages = 48;

        var balancedList = ListSplittingUtils.buildBalancedContinuousSublist(numberOfPages, threadCount);

        long distributedPages = 0;
        for (var pagesOfThread : balancedList) {
            distributedPages += pagesOfThread.size();
        }

        assertEquals(threadCount, balancedList.size());
        assertEquals(numberOfPages, distributedPages);
    }

}

View File

@ -0,0 +1,83 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import org.springframework.util.FileSystemUtils;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.knecon.fforesight.service.ocr.processor.service.threads.GhostScriptOutputHandler;
import lombok.SneakyThrows;
// YOU NEED GHOSTSCRIPT INSTALLED TO RUN THIS TEST!!!!

/**
 * Manual tests that render the pages of a sample PDF to TIFF images, once with PDFBox and once with several
 * Ghostscript processes started in parallel. The class is disabled by default.
 */
@Disabled
public class Pdf2ImgTest {

    private static final int DPI = 150;
    private static final String SAMPLE_PDF = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";


    /**
     * Renders every page of the sample PDF with PDFBox and writes it as TIFF into a temporary directory.
     */
    @Test
    @SneakyThrows
    @Disabled
    public void testPDFBox() {

        File outputDir = new File(OsUtils.getTemporaryDirectory("imageOutput", ""));
        outputDir.mkdirs();
        ClassPathResource resource = new ClassPathResource(SAMPLE_PDF);
        try (PDDocument document = Loader.loadPDF(resource.getFile())) {
            PDFRenderer renderer = new PDFRenderer(document);
            for (int pageNumber = 0; pageNumber < document.getNumberOfPages(); pageNumber++) {
                BufferedImage image = renderer.renderImageWithDPI(pageNumber, DPI);
                // Resolve the file against the directory instead of concatenating strings, so the image lands
                // inside the directory even if the directory path has no trailing separator.
                File target = new File(outputDir, String.format("page%04d", pageNumber));
                boolean written = ImageIOUtil.writeImage(image, "tif", target.getAbsolutePath(), DPI);
                System.out.printf("%d: %s%n", pageNumber, written);
            }
        } finally {
            // clean up even when rendering fails
            FileSystemUtils.deleteRecursively(outputDir);
        }
    }


    /**
     * Starts several Ghostscript processes in parallel, each rendering the whole sample PDF into its own
     * sub directory, waits for all of them and prints their exit codes.
     */
    @Test
    @SneakyThrows
    public void testGhostScriptParallel() {

        int numOfProcesses = 5;
        ClassPathResource resource = new ClassPathResource(SAMPLE_PDF);
        String outputDir = "/tmp/ghostscript_out/";
        try {
            List<Process> processes = IntStream.range(0, numOfProcesses)
                    .boxed()
                    .parallel()
                    .map(i -> buildCmdArgs(i, outputDir, resource))
                    .map(Pdf2ImgTest::executeProcess)
                    .toList();
            List<Integer> processExitCodes = new LinkedList<>();
            for (Process process : processes) {
                processExitCodes.add(process.waitFor());
            }
            System.out.println("Ghostscripts finished with exit codes " + processExitCodes);
        } finally {
            // clean up even when a process could not be started or waiting was interrupted
            FileSystemUtils.deleteRecursively(new File(outputDir));
        }
    }


    /**
     * Starts the given command. The child inherits stdout/stderr of this JVM: if nobody drained the child's
     * output pipes, the child could block as soon as the pipe buffer is full and {@code waitFor()} would hang.
     */
    @SneakyThrows
    private static Process executeProcess(String[] cmdArgs) {

        return new ProcessBuilder(cmdArgs).inheritIO().start();
    }


    /**
     * Creates the output directory {@code <outputDir>/<i>} and builds the Ghostscript command line that
     * renders the given PDF as grayscale TIFF pages into that directory.
     */
    @SneakyThrows
    private static String[] buildCmdArgs(Integer i, String outputDir, ClassPathResource resource) {

        File processOutputDir = new File(outputDir, String.valueOf(i));
        processOutputDir.mkdirs();
        // %04d is expanded by Ghostscript itself (page counter), not by Java
        String outputFilePattern = new File(processOutputDir, "page%04d").getPath();
        return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=tiffgray", "-r" + DPI, "-sOutputFile=" + outputFilePattern, resource.getFile().toString(), "-c", "quit"};
    }

}

View File

@ -1,145 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor of the module with artifactId ocr-service-server-v1 (child of ocr-service-v1). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.iqser.red.service</groupId>
<artifactId>ocr-service-v1</artifactId>
<version>1.0-SNAPSHOT</version>
</parent>
<artifactId>ocr-service-server-v1</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- API module of this service, always taken in the same version as this module -->
<dependency>
<groupId>com.iqser.red.service</groupId>
<artifactId>ocr-service-api-v1</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Dependencies without a version element: presumably managed by the parent POM (verify there) -->
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>spring-commons</artifactId>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>metric-commons</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
</dependency>
<!-- PDFTron SDK; presumably resolved from the "pdftron" repository declared at the end of this file -->
<dependency>
<groupId>com.pdftron</groupId>
<artifactId>PDFNet</artifactId>
<version>9.4.0</version>
</dependency>
<!-- The following dependencies pin an explicit version in this file -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
<version>2.3.1.RELEASE</version>
</dependency>
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-kms</artifactId>
<version>1.12.158</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<!-- Test -->
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>test-commons</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.amqp</groupId>
<artifactId>spring-rabbit-test</artifactId>
<version>2.3.1</version>
<scope>test</scope>
</dependency>
<!-- NOTE(review): listed in the "Test" section but declares no test scope; confirm whether that is intended -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-tomcat</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- NOTE(review): also without a scope although placed in the "Test" section; confirm whether it is needed at runtime -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<!-- explicit annotation processor list: Lombok and the DSL-JSON CompiledJson processor -->
<annotationProcessors>
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
<annotationProcessor>com.dslplatform.json.processor.CompiledJsonAnnotationProcessor</annotationProcessor>
</annotationProcessors>
</configuration>
</plugin>
<plugin>
<!-- generate git.properties for exposure in /info -->
<groupId>pl.project13.maven</groupId>
<artifactId>git-commit-id-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>revision</goal>
</goals>
<configuration>
<generateGitPropertiesFile>true</generateGitPropertiesFile>
<gitDescribe>
<tags>true</tags>
</gitDescribe>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- repackages the generated jar into a runnable fat-jar and makes it
executable -->
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>repackage</goal>
</goals>
<configuration>
<executable>true</executable>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<!-- additional artifact repository, identified as "pdftron" -->
<repositories>
<repository>
<id>pdftron</id>
<name>PDFNet Maven</name>
<url>https://pdftron.com/maven/release</url>
</repository>
</repositories>
</project>

View File

@ -1,45 +0,0 @@
package com.iqser.red.service.ocr.v1.server;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.cloud.openfeign.EnableFeignClients;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Import;
import org.springframework.scheduling.annotation.EnableAsync;
import com.iqser.red.commons.spring.DefaultWebMvcConfiguration;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import io.micrometer.core.aop.TimedAspect;
import io.micrometer.core.instrument.MeterRegistry;
/**
 * Spring Boot bootstrap class of the OCR service.
 * <p>
 * Security auto-configuration (application and management endpoints) is excluded, the shared web MVC
 * configuration and the messaging configuration are imported explicitly, and Feign clients are scanned
 * starting from the package of {@link FileStatusProcessingUpdateClient}.
 */
@EnableAsync
@EnableConfigurationProperties(OcrServiceSettings.class)
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
@Import({DefaultWebMvcConfiguration.class, MessagingConfiguration.class})
@EnableFeignClients(basePackageClasses = FileStatusProcessingUpdateClient.class)
public class Application {

    /**
     * Starts the service.
     *
     * @param args command line arguments, handed over to Spring Boot unchanged
     */
    public static void main(String[] args) {

        SpringApplication.run(Application.class, args);
    }


    /**
     * Registers the Micrometer aspect that is built on the given meter registry.
     *
     * @param registry registry the aspect is created with
     * @return the aspect bean
     */
    @Bean
    public TimedAspect timedAspect(MeterRegistry registry) {

        TimedAspect aspect = new TimedAspect(registry);
        return aspect;
    }

}

View File

@ -1,10 +0,0 @@
package com.iqser.red.service.ocr.v1.server.client;
import org.springframework.cloud.openfeign.FeignClient;
import com.iqser.red.service.persistence.service.v1.api.resources.FileStatusProcessingUpdateResource;
/**
 * Feign client exposing the operations declared by {@link FileStatusProcessingUpdateResource}.
 * The target base URL is read from the property {@code persistence-service.url}.
 */
@FeignClient(url = "${persistence-service.url}", name = "FileStatusProcessingUpdateResource")
public interface FileStatusProcessingUpdateClient extends FileStatusProcessingUpdateResource {

}

View File

@ -1,42 +0,0 @@
package com.iqser.red.service.ocr.v1.server.configuration;
import org.springframework.amqp.core.Queue;
import org.springframework.amqp.core.QueueBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import lombok.RequiredArgsConstructor;
/**
 * Declares the RabbitMQ queues used by the OCR service and the names shared with message producers
 * and consumers.
 */
@Configuration
@RequiredArgsConstructor
public class MessagingConfiguration {

    public static final String OCR_QUEUE = "ocrQueue";
    public static final String OCR_DLQ = "ocrDLQ";

    public static final String X_DEAD_LETTER_EXCHANGE = "x-dead-letter-exchange";
    public static final String X_DEAD_LETTER_ROUTING_KEY = "x-dead-letter-routing-key";
    public static final String X_MAX_PRIORITY = "x-max-priority";

    public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";

    /** Value of the {@code x-max-priority} argument of the OCR queue. */
    private static final int OCR_QUEUE_MAX_PRIORITY = 2;


    /**
     * Durable OCR request queue. Its dead-letter arguments name the empty exchange and {@link #OCR_DLQ}
     * as routing key, and it is declared as priority queue.
     * <p>
     * The maximum priority is set exactly once; previously it was set twice through two equivalent
     * builder calls with a duplicated literal.
     *
     * @return the queue declaration
     */
    @Bean
    public Queue ocrQueue() {

        return QueueBuilder.durable(OCR_QUEUE)
                .withArgument(X_DEAD_LETTER_EXCHANGE, "")
                .withArgument(X_DEAD_LETTER_ROUTING_KEY, OCR_DLQ)
                .withArgument(X_MAX_PRIORITY, OCR_QUEUE_MAX_PRIORITY)
                .build();
    }


    /**
     * Durable dead-letter queue named {@link #OCR_DLQ}.
     *
     * @return the queue declaration
     */
    @Bean
    public Queue ocrDeadLetterQueue() {

        return QueueBuilder.durable(OCR_DLQ).build();
    }

}

View File

@ -1,35 +0,0 @@
package com.iqser.red.service.ocr.v1.server.initializer;
import javax.annotation.PostConstruct;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.pdftron.pdf.PDFNet;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
/**
 * Initializes the PDFTron (PDFNet) library once the bean has been constructed.
 */
@Component
@RequiredArgsConstructor
public class PDFNetInitializer {

    /** License key, read from {@code pdftron.license}; empty when the property is not set. */
    @Value("${pdftron.license:}")
    private String pdftronLicense;

    /** Resource search path registered with PDFNet, read from {@code pdftron.ocrmodule.path}; defaults to {@code /tmp}. */
    @Value("${pdftron.ocrmodule.path:/tmp}")
    private String ocrModulePath;


    /**
     * Configures the PDFNet temp path and resource search path and initializes the library with the license.
     * <p>
     * Do not change back to application runner, if it is application runner it takes messages from the
     * queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
     */
    @PostConstruct
    @SneakyThrows
    public void init() {

        PDFNet.setTempPath("/tmp/pdftron");
        PDFNet.addResourceSearchPath(ocrModulePath);
        PDFNet.initialize(pdftronLicense);
    }

}

View File

@ -1,15 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * Pairs the rectangle of an image with a flag telling whether the image has transparency.
 * The generated all-args constructor takes the fields in declaration order.
 */
@AllArgsConstructor
@Data
public class ImagePosition {

    /** Area covered by the image. */
    private Rectangle rectangle;

    /** Whether the image has transparency. */
    private boolean hasTransparency;

}

View File

@ -1,14 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import java.util.HashMap;
import java.util.Map;
import lombok.Data;
/**
 * Classification part of an image metadata entry: a label plus a map of probabilities.
 */
@Data
public class Classification {

    /** Probabilities by name (presumably keyed by label; verify against the image-service contract). Empty by default. */
    private Map<String, Float> probabilities = new HashMap<>();

    /** Label assigned to the image. */
    private String label;

}

View File

@ -1,11 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Geometry related filter results of an image: one part for its size, one for its format.
 */
@Data
public class FilterGeometry {

    /** Size related filter result. */
    private ImageSize imageSize;

    /** Format related filter result. */
    private ImageFormat imageFormat;

}

View File

@ -1,12 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Filter results of an image: geometry filters, probability filter and an overall flag.
 */
@Data
public class Filters {

    /** Geometry related filter results. */
    private FilterGeometry geometry;

    /** Probability related filter result. */
    private Probability probability;

    /** Overall flag named {@code allPassed}. */
    private boolean allPassed;

}

View File

@ -1,11 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Width and height of an image.
 */
@Data
public class Geometry {

    /** Width of the image. */
    private float width;

    /** Height of the image. */
    private float height;

}

View File

@ -1,12 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Format related filter result of an image: a quotient plus flags for "too tall" and "too wide".
 */
@Data
public class ImageFormat {

    /** Quotient the format flags refer to. */
    private float quotient;

    /** Set when the image was rated as too tall. */
    private boolean tooTall;

    /** Set when the image was rated as too wide. */
    private boolean tooWide;

}

View File

@ -1,14 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Metadata of a single image: classification, position, geometry, filter results and an alpha flag.
 */
@Data
public class ImageMetadata {

    /** Classification of the image. */
    private Classification classification;

    /** Position of the image, including its page number. */
    private Position position;

    /** Width and height of the image. */
    private Geometry geometry;

    /** Filter results of the image. */
    private Filters filters;

    /** Alpha flag of the image. */
    private boolean alpha;

}

View File

@ -1,31 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import java.util.ArrayList;
import java.util.List;
import com.dslplatform.json.CompiledJson;
import com.dslplatform.json.JsonAttribute;
import com.fasterxml.jackson.annotation.JsonAlias;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
/**
 * Image metadata of one file, identified by dossier id and file id.
 * <p>
 * The metadata list carries the same JSON mapping annotations on the field and on its hand-written
 * setter: Jackson property {@code imageMetadata} with alias {@code data}, and DSL-JSON alternative
 * name {@code imageMetadata}.
 */
@Data
@CompiledJson
public class ImageServiceResponse {

    private String dossierId;

    private String fileId;

    /** Metadata entries, one per image; empty by default. */
    @JsonProperty("imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    private List<ImageMetadata> data = new ArrayList<>();


    /**
     * Replaces the metadata list.
     *
     * @param data the new metadata entries
     */
    @JsonProperty("imageMetadata")
    @JsonAlias("data")
    @JsonAttribute(alternativeNames = {"imageMetadata"})
    public void setData(List<ImageMetadata> data) {

        this.data = data;
    }

}

View File

@ -1,12 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Size related filter result of an image: a quotient plus flags for "too large" and "too small".
 */
@Data
public class ImageSize {

    /** Quotient the size flags refer to. */
    private float quotient;

    /** Set when the image was rated as too large. */
    private boolean tooLarge;

    /** Set when the image was rated as too small. */
    private boolean tooSmall;

}

View File

@ -1,14 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Position of an image: two x coordinates, two y coordinates and the number of the page it is on.
 */
@Data
public class Position {

    /** First x coordinate. */
    private float x1;

    /** Second x coordinate. */
    private float x2;

    /** First y coordinate. */
    private float y1;

    /** Second y coordinate. */
    private float y2;

    /** Number of the page the image is located on. */
    private int pageNumber;

}

View File

@ -1,10 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model.image;
import lombok.Data;
/**
 * Probability related filter result of an image, consisting of a single flag.
 */
@Data
public class Probability {

    /** Flag named {@code unconfident}. */
    private boolean unconfident;

}

View File

@ -1,69 +0,0 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Thin facade over {@link StorageService} that knows how the files of a dossier are named in storage.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class FileStorageService {

    private final StorageService storageService;


    /**
     * Builds the storage id {@code <dossierId>/<fileId>.<TYPE><extension>} of a file.
     *
     * @param dossierId id of the dossier the file belongs to
     * @param fileId    id of the file
     * @param fileType  kind of stored object; contributes its name and its extension
     * @return the storage id
     */
    public static String getStorageId(String dossierId, String fileId, FileType fileType) {

        return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
    }


    /**
     * Loads the ORIGIN file completely into memory.
     * The storage stream is closed after reading; it used to be left open.
     *
     * @return the bytes of the ORIGIN file
     */
    @SneakyThrows
    public byte[] getOriginalFile(String dossierId, String fileId) {

        try (InputStream originalFile = getOriginalFileAsStream(dossierId, fileId)) {
            return IOUtils.toByteArray(originalFile);
        }
    }


    /**
     * Opens the ORIGIN file as stream. The caller is responsible for closing the returned stream.
     *
     * @return stream over the ORIGIN file
     */
    @SneakyThrows
    public InputStream getOriginalFileAsStream(String dossierId, String fileId) {

        return storageService.getObject(getStorageId(dossierId, fileId, FileType.ORIGIN)).getInputStream();
    }


    /**
     * Stores the given stream as ORIGIN file.
     */
    public void storeOriginalFile(String dossierId, String fileId, InputStream stream) {

        storageService.storeObject(getStorageId(dossierId, fileId, FileType.ORIGIN), stream);
    }


    /**
     * @return whether an UNTOUCHED file exists in storage for the given file
     */
    public boolean untouchedFileExists(String dossierId, String fileId) {

        return storageService.objectExists(getStorageId(dossierId, fileId, FileType.UNTOUCHED));
    }


    /**
     * Stores the given bytes as UNTOUCHED file.
     */
    public void storeUntouchedFile(String dossierId, String fileId, byte[] data) {

        storageService.storeObject(getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data));
    }


    /**
     * Reads the IMAGE_INFO object of the file and maps its JSON content to {@link ImageServiceResponse}.
     *
     * @return the parsed image service response
     */
    @SneakyThrows
    public ImageServiceResponse getImageServiceResponse(String dossierId, String fileId) {

        return storageService.readJSONObject(getStorageId(dossierId, fileId, FileType.IMAGE_INFO), ImageServiceResponse.class);
    }

}

View File

@ -1,325 +0,0 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Runs PDFTron OCR on the image areas of a stored PDF.
 * <p>
 * Processing steps: invisible text is removed from the document, every page that contains images
 * (according to the image service response) is copied into a single-page document and OCRed with the
 * image rectangles as text zones, the OCRed pages replace the original pages, and the result is optimized
 * and saved. Progress is reported through {@link MessagingConfiguration#OCR_STATUS_UPDATE_RESPONSE_QUEUE}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class OCRService {

    public static final String ENGLISH = "eng";

    /** Text render mode 0: text elements with this mode are kept even when a recorded path rectangle contains them. */
    private static final int TEXT_RENDER_MODE_FILL = 0;
    /** See PDF Reference 5.3 Text rendering modes, 3 = Invisible. */
    private static final int TEXT_RENDER_MODE_INVISIBLE = 3;

    private final FileStorageService fileStorageService;
    private final OcrServiceSettings settings;
    private final RabbitTemplate rabbitTemplate;
    private final ObjectMapper objectMapper;


    /**
     * Loads the original file and its image service response from storage and OCRs the file.
     *
     * @param dossierId id of the dossier the file belongs to
     * @param fileId    id of the file
     * @return in-memory stream over the OCRed PDF
     */
    @Timed("redactmanager_PDFTron-ocrDocument")
    @SneakyThrows
    public InputStream ocrDocument(String dossierId, String fileId) {

        byte[] fileBytes;
        ImageServiceResponse imageServiceResponse;
        // the storage stream is closed as soon as its content has been read; it used to be left open
        try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
            imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
            fileBytes = IOUtils.toByteArray(fileStream);
        }
        return new ByteArrayInputStream(ocr(fileBytes, fileId, imageServiceResponse));
    }


    /**
     * OCRs the given PDF bytes.
     *
     * @param file                 bytes of the PDF
     * @param fileId               id used in the status messages
     * @param imageServiceResponse image metadata that determines which pages and areas are OCRed
     * @return bytes of the OCRed, optimized and linearized PDF
     * @throws RuntimeException wrapping any exception raised while processing
     */
    @SuppressFBWarnings("REC_CATCH_EXCEPTION")
    private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {

        PDFDoc pdfDoc = null;
        // declared outside of the try block so that the per-page documents can be released in finally
        Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
        try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            pdfDoc = new PDFDoc(file);
            removeInvisibleText(pdfDoc);

            Map<Integer, List<ImagePosition>> pages = groupImagePositionsByPage(imageServiceResponse);
            int numberOfPagesToOcr = pages.size();

            publishStatus(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(numberOfPagesToOcr).build());

            ocrPages(pdfDoc, fileId, pages, pdfDocMap);
            replaceWithOcrPages(pdfDoc, pdfDocMap);

            Optimizer.optimize(pdfDoc);
            pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);

            publishStatus(OCRStatusUpdateResponse.builder()
                    .fileId(fileId)
                    .numberOfPagesToOCR(numberOfPagesToOcr)
                    .numberOfOCRedPages(numberOfPagesToOcr)
                    .ocrFinished(true)
                    .build());

            return out.toByteArray();
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            // only documents that were not merged (because of a failure) are still in the map
            pdfDocMap.values().forEach(this::closeQuietly);
            closeQuietly(pdfDoc);
        }
    }


    /**
     * Groups the image rectangles of the image service response by page number.
     */
    private Map<Integer, List<ImagePosition>> groupImagePositionsByPage(ImageServiceResponse imageServiceResponse) {

        Map<Integer, List<ImagePosition>> pages = new HashMap<>();
        // TODO take logic to ignore small and combine images from image-service.
        // TODO Then replace logic so ocr-service is independent from image-service.
        for (var imageMetadata : imageServiceResponse.getData()) {
            var position = imageMetadata.getPosition();
            var geometry = imageMetadata.getGeometry();
            Rectangle rectangle = new Rectangle(new Point(position.getX1(), position.getY1()), geometry.getWidth(), geometry.getHeight(), position.getPageNumber());
            pages.computeIfAbsent(position.getPageNumber(), x -> new ArrayList<>()).add(new ImagePosition(rectangle, imageMetadata.isAlpha()));
        }
        return pages;
    }


    /**
     * Replaces every page of {@code pdfDoc} that has an entry in {@code pdfDocMap} with the first page of the
     * mapped document. Each mapped document is closed and removed from the map once its page has been merged.
     */
    @SneakyThrows
    private void replaceWithOcrPages(PDFDoc pdfDoc, Map<Integer, PDFDoc> pdfDocMap) {

        for (var iterator = pdfDocMap.entrySet().iterator(); iterator.hasNext(); ) {
            var entry = iterator.next();
            int pageNumber = entry.getKey();
            PDFDoc ocrDoc = entry.getValue();

            Page ocrPage = ocrDoc.getPageIterator(1).next();
            // insert the OCRed page in front of the original page, then drop the original (now one position later)
            pdfDoc.pageInsert(pdfDoc.getPageIterator(pageNumber), ocrPage);
            pdfDoc.pageRemove(pdfDoc.getPageIterator(pageNumber + 1));

            ocrDoc.close();
            iterator.remove();
        }
    }


    /**
     * OCRs every page listed in {@code pages} in its own single-page document and stores that document in
     * {@code pdfDocMap}. A failing page is logged and skipped; a status message is sent after every page,
     * whether it succeeded or not.
     *
     * @param pdfDoc    source document
     * @param fileId    id used in the status messages
     * @param pages     image rectangles per page number
     * @param pdfDocMap receives the single-page documents, keyed by page number
     */
    @SneakyThrows
    private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {

        int numberOfPagesToOcr = pages.size();
        int numberOfOCRedPages = 0;
        for (var pageEntry : pages.entrySet()) {
            int pageNumber = pageEntry.getKey();
            try {
                RectCollection rectCollection = new RectCollection();

                Page pdfPage = pdfDoc.getPageIterator(pageNumber).next();
                pdfPage.setMediaBox(pdfPage.getCropBox());
                // loop invariant, therefore read once per page instead of once per image
                double cropBoxTop = pdfPage.getCropBox().getY2();

                for (ImagePosition imagePosition : pageEntry.getValue()) {
                    Rectangle rectangle = imagePosition.getRectangle();
                    // Warning coordinate system is different in this call macOs/Linux
                    double y = -rectangle.getTopLeft().getY() + cropBoxTop - rectangle.getHeight();
                    rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
                }

                PDFDoc ocrDoc = new PDFDoc();
                ocrDoc.pagePushBack(pdfPage);
                pdfDocMap.put(pageNumber, ocrDoc);

                OCROptions options = new OCROptions();
                options.addTextZonesForPage(rectCollection, 1);
                options.addLang(ENGLISH);
                options.addDPI(settings.getOcrDPI());

                OCRModule.processPDF(ocrDoc, options);
                rectCollection.clear();
            } catch (Exception e) {
                // the exception is passed as trailing argument without placeholder so that its stack trace is logged
                log.warn("Failed to process PDF page {}", pageNumber, e);
            }

            publishStatus(OCRStatusUpdateResponse.builder()
                    .fileId(fileId)
                    .numberOfPagesToOCR(numberOfPagesToOcr)
                    .numberOfOCRedPages(++numberOfOCRedPages)
                    .build());

            // regular progress information: info level and only the page number, not the whole map entry
            log.info("Done page {}", pageNumber);
        }
    }


    /**
     * Serializes the status to JSON and sends it to the OCR status update queue.
     */
    @SneakyThrows
    private void publishStatus(OCRStatusUpdateResponse status) {

        rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(status));
    }


    /**
     * Closes the document if there is one; a failure while closing is only logged.
     */
    private void closeQuietly(PDFDoc document) {

        if (document == null) {
            return;
        }
        try {
            document.close();
        } catch (Exception e) {
            log.debug("Failed to close document", e);
        }
    }


    /**
     * There are 2 possibilities to have invisible Text in pdfs.
     * 1. gState is set to invisible, this is ocr text.
     * 2. Filled Path elements in front of the text.
     * <p>
     * Rewrites the content of every page of the document accordingly.
     */
    @SneakyThrows
    private void removeInvisibleText(PDFDoc pdfDoc) {

        ElementWriter writer = new ElementWriter();
        ElementReader reader = new ElementReader();
        // object numbers of pages and form XObjects that have been rewritten already
        Set<Integer> visited = new TreeSet<>();

        for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
            Page page = iterator.next();
            removeOverlapText(page, reader, writer, visited);
        }
    }


    /**
     * Rewrites the content of one page in place, filtering its elements through {@link #processElements}.
     */
    @SneakyThrows
    private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {

        visited.add((int) page.getSDFObj().getObjNum());
        reader.begin(page);
        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
        processElements(reader, writer, visited, false);
        writer.end();
        reader.end();
    }


    /**
     * Reads all remaining elements from the reader and dispatches them by element type; element types
     * without special handling are written unchanged.
     *
     * @param isInForm whether the elements belong to a form XObject
     */
    @SneakyThrows
    private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {

        // bounding boxes of the paths seen so far in this content stream
        Set<Rect> filledRectangles = new HashSet<>();
        for (Element element = reader.next(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_image:
                case Element.e_inline_image:
                    processImage(element, writer, isInForm);
                    break;
                case Element.e_text:
                    processText(element, writer, filledRectangles);
                    break;
                case Element.e_path:
                    processPath(element, writer, filledRectangles);
                    break;
                case Element.e_form:
                    processForm(reader, writer, element, visited);
                    break;
                default:
                    writer.writeElement(element);
                    break;
            }
        }
    }


    /**
     * Writes the image unless it is located inside a form XObject and watermark removal is enabled.
     */
    @SneakyThrows
    private void processImage(Element element, ElementWriter writer, boolean isInForm) {

        boolean dropAsWatermark = isInForm && settings.isRemoveWatermark();
        if (!dropAsWatermark) {
            writer.writeElement(element);
        }
    }


    /**
     * Writes the text element unless it counts as invisible: text without bounding box is always kept,
     * text whose lower left bounding box corner lies inside a recorded path rectangle is only kept with
     * render mode 0, all other text is kept unless its render mode is 3.
     */
    @SneakyThrows
    private void processText(Element element, ElementWriter writer, Set<Rect> filledRectangles) {

        Rect bbox = element.getBBox();
        if (bbox == null) {
            writer.writeElement(element);
            return;
        }

        double x = bbox.getX1();
        double y = bbox.getY1();
        boolean filledRectangleIntersection = filledRectangles.stream().anyMatch(r -> {
            try {
                return r.contains(x, y);
            } catch (PDFNetException e) {
                throw new RuntimeException("Internal pdftron error during removal of overlap text", e);
            }
        });

        //See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it.
        int textRenderMode = element.getGState().getTextRenderMode();
        boolean keep = filledRectangleIntersection ? textRenderMode == TEXT_RENDER_MODE_FILL : textRenderMode != TEXT_RENDER_MODE_INVISIBLE;
        if (keep) {
            writer.writeElement(element);
        }
    }


    /**
     * Writes the path unchanged and records the bounding box of every path with more than 4 point values.
     * NOTE(review): the fill state of the path is not inspected although the rectangles are treated as
     * "filled"; confirm whether that is intended.
     */
    @SneakyThrows
    private void processPath(Element element, ElementWriter writer, Set<Rect> filledRectangles) {

        writer.writeElement(element);

        var pathData = element.getPathData();
        if (pathData == null || pathData.getPoints().length <= 4) {
            return;
        }
        Rect bbox = element.getBBox();
        // processText handles a null bounding box for text; a null entry in the set would make its contains check fail
        if (bbox != null) {
            filledRectangles.add(bbox);
        }
    }


    /**
     * Writes the form element and, unless the referenced form XObject has been rewritten already, rewrites the
     * content of the form XObject with the same filtering that is applied to pages.
     */
    @SneakyThrows
    private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {

        writer.writeElement(element);

        Obj formObj = element.getXObject();
        int formObjNum = (int) formObj.getObjNum();
        // a form XObject can be referenced several times, rewrite it only once
        if (visited.add(formObjNum)) {
            ElementWriter formWriter = new ElementWriter();
            reader.formBegin();
            formWriter.begin(formObj);

            reader.clearChangeList();
            formWriter.setDefaultGState(reader);

            processElements(reader, formWriter, visited, true);
            formWriter.end();
            reader.end();
        }
    }

}

View File

@ -1,79 +0,0 @@
package com.iqser.red.service.ocr.v1.server.service;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Consumes OCR requests and dead-lettered OCR requests from RabbitMQ and reports the processing state of
 * the file through {@link FileStatusProcessingUpdateClient}.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class OcrMessageReceiver {

    private final ObjectMapper objectMapper;
    private final FileStorageService fileStorageService;
    private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
    private final OCRService ocrService;


    /**
     * Handles an OCR request: marks the file as being processed, stores a copy of the original file as
     * untouched file if there is none yet, OCRs the file, overwrites the original file with the result and
     * reports success.
     *
     * @param in JSON representation of a {@link DocumentRequest}
     * @throws JsonProcessingException if the message cannot be parsed
     */
    @RabbitHandler
    @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
    public void receiveOcr(String in) throws JsonProcessingException {

        DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
        String dossierId = ocrRequestMessage.getDossierId();
        String fileId = ocrRequestMessage.getFileId();

        long start = System.currentTimeMillis();
        log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId);

        setStatusOcrProcessing(dossierId, fileId);

        // keep a copy of the file as it was before the first OCR run
        if (!fileStorageService.untouchedFileExists(dossierId, fileId)) {
            byte[] originalFile = fileStorageService.getOriginalFile(dossierId, fileId);
            fileStorageService.storeUntouchedFile(dossierId, fileId, originalFile);
        }

        var ocrResult = ocrService.ocrDocument(dossierId, fileId);
        fileStorageService.storeOriginalFile(dossierId, fileId, ocrResult);

        long end = System.currentTimeMillis();
        log.info("Successfully processed ocr for file with dossierId {} and fileId {}, took {}", dossierId, fileId, end - start);

        fileStatusProcessingUpdateClient.ocrSuccessful(dossierId, fileId);
    }


    /**
     * Handles a dead-lettered OCR request by reporting the OCR of the file as failed.
     *
     * @param in JSON representation of a {@link DocumentRequest}
     * @throws JsonProcessingException if the message cannot be parsed
     */
    @RabbitHandler
    @RabbitListener(queues = MessagingConfiguration.OCR_DLQ, concurrency = "1")
    public void receiveOcrDQL(String in) throws JsonProcessingException {

        DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
        log.info("OCR DQL received: {}", ocrRequestMessage);
        fileStatusProcessingUpdateClient.ocrFailed(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
    }


    /**
     * Reports that OCR processing of the file has started. An HTTP 409 (conflict) answer rejects the message
     * without requeueing it; any other Feign error is logged and OCR continues.
     */
    private void setStatusOcrProcessing(String dossierId, String fileId) {

        try {
            fileStatusProcessingUpdateClient.ocrProcessing(dossierId, fileId);
        } catch (FeignException e) {
            if (e.status() == HttpStatus.CONFLICT.value()) {
                // keep the original exception as cause so that the stack trace is not lost
                throw new AmqpRejectAndDontRequeueException(e.getMessage(), e);
            }
            // still best effort, but no longer silent
            log.warn("Could not report start of ocr for file with dossierId {} and fileId {} (status {}), continuing", dossierId, fileId, e.status(), e);
        }
    }

}

View File

@ -1,14 +0,0 @@
package com.iqser.red.service.ocr.v1.server.settings;
import org.springframework.boot.context.properties.ConfigurationProperties;
import lombok.Data;
/**
 * Settings of the OCR service, bound from configuration properties with the prefix {@code ocr-service}.
 */
@ConfigurationProperties("ocr-service")
@Data
public class OcrServiceSettings {

    /** DPI value handed to the OCR options; defaults to 300. */
    private int ocrDPI = 300;

    /** When set, images inside form XObjects are dropped while rewriting page content; off by default. */
    private boolean removeWatermark;

}

Some files were not shown because too many files have changed in this diff Show More