Compare commits

..

227 Commits

Author SHA1 Message Date
Matthias Bisping
03e7b00cfd refactoring 2022-04-14 12:20:05 +02:00
Matthias Bisping
7aee00cb49 alpha channel querying improved 2022-04-13 17:31:33 +02:00
Matthias Bisping
2cc52c4630 renaming 2022-04-13 13:36:45 +02:00
Matthias Bisping
daa1da3a50 fix name 2022-04-13 13:17:23 +02:00
Matthias Bisping
6a7debde14 added exploration tests 2022-04-13 13:15:05 +02:00
Matthias Bisping
b4f279c549 test for until 2022-04-13 13:12:19 +02:00
Matthias Bisping
f5881f2229 formatting 2022-04-13 13:06:20 +02:00
Matthias Bisping
62bfedfea8 alpha channel test fix 2022-04-13 12:06:55 +02:00
Matthias Bisping
1d88876ab1 alpha channel info WIP 2022-04-12 18:44:04 +02:00
Matthias Bisping
bbafad5561 refactoring in preparation for alpha channel info 2022-04-12 18:22:38 +02:00
Matthias Bisping
f17a232009 tests for box validation 2022-04-12 16:54:40 +02:00
Matthias Bisping
88a46ae7cd adjusted expected output for actual pipeline test for change from pixel to pdf units (x2 - x1 etc.) for width and height fields of metadata records 2022-04-12 16:42:31 +02:00
Matthias Bisping
e82a81f5c8 refactoring 2022-04-12 16:34:00 +02:00
Matthias Bisping
35c5b15e32 tolerance forwarding through pipeline constructor; box validation; tiny box filtering 2022-04-12 16:29:20 +02:00
Matthias Bisping
698e647c6f applied black 2022-04-12 15:06:18 +02:00
Matthias Bisping
d8f86d14a5 fuzzy stitching completed 2022-04-12 15:04:32 +02:00
Matthias Bisping
bb7c1be630 fuzzy stitching WIP: mostly works, but sometimes fails. run test_image_stitcher_with_gaps to debug 2022-04-11 19:20:47 +02:00
Matthias Bisping
79cd31850d fuzzy stitching WIP: added tolerance to stitching; added fuzzification function; added tests for grouping and (fuzzy and exact) 2022-04-11 16:47:47 +02:00
Matthias Bisping
3d335783dc topological sorting of definitions by caller hierarchy 2022-04-11 16:08:54 +02:00
Matthias Bisping
bb79f9dd55 applied black 2022-04-11 13:57:32 +02:00
Matthias Bisping
585cdf5c70 integrated stitching into parsable pdf extractor 2022-04-11 13:57:10 +02:00
Matthias Bisping
04cf0245ed formatting 2022-04-11 13:38:09 +02:00
Matthias Bisping
3530ef72c5 docstring update 2022-04-11 13:37:46 +02:00
Matthias Bisping
d80af336eb refactoring 2022-04-11 13:28:39 +02:00
Matthias Bisping
bcf6dc5c47 generalized split mapper 2022-04-11 13:03:02 +02:00
Matthias Bisping
f4c0547405 refactoring: replaced split mapper with dataclass 2022-04-11 12:16:42 +02:00
Matthias Bisping
1bea5fb9a8 refactoring 2022-04-11 10:29:13 +02:00
Matthias Bisping
57440f5106 refactoring 2022-04-11 09:53:32 +02:00
Matthias Bisping
710783a2f8 merging algorithm explanation adjusted 2022-04-11 09:28:00 +02:00
Matthias Bisping
887b8339a2 renaming 2022-04-08 14:17:05 +02:00
Matthias Bisping
43cb0fffed refactoring 2022-04-08 14:13:03 +02:00
Matthias Bisping
6e7645e319 topological sorting of definitions by caller hierarchy 2022-04-08 14:04:48 +02:00
Matthias Bisping
3b18fc6158 refactoring 2022-04-08 13:56:57 +02:00
Matthias Bisping
1b10445f91 refactoring 2022-04-08 12:01:20 +02:00
Matthias Bisping
5967149c49 refactoring 2022-04-07 21:49:55 +02:00
Matthias Bisping
303970db51 refactoring 2022-04-07 21:44:04 +02:00
Matthias Bisping
51793d19e9 refactoring 2022-04-07 21:39:01 +02:00
Matthias Bisping
e276a5ec27 refactoring 2022-04-07 21:20:55 +02:00
Matthias Bisping
7e6fe7cf11 refactoring 2022-04-07 21:12:57 +02:00
Matthias Bisping
bb5db1b4ef refactoring 2022-04-07 20:47:58 +02:00
Matthias Bisping
8ac9fcb19f stitcher test passes 2022-04-07 19:40:26 +02:00
Matthias Bisping
160973e2be refactoring 2022-04-07 19:05:13 +02:00
Matthias Bisping
803cc57155 refactoring 2022-04-07 18:48:12 +02:00
Matthias Bisping
50b4d239cb group merging done 2022-04-07 18:05:15 +02:00
Matthias Bisping
9bb07f95fb refactoring 2022-04-07 17:51:53 +02:00
Matthias Bisping
29028cc1a5 refactoring 2022-04-07 17:44:54 +02:00
Matthias Bisping
2fcb0bd149 refactoring 2022-04-07 17:28:25 +02:00
Matthias Bisping
3e882dc247 group merging wip 2022-04-07 17:18:09 +02:00
Matthias Bisping
2b1e7cbb08 added img-mdat-pair merging logic 2022-04-07 16:11:12 +02:00
Matthias Bisping
5e8b55ef10 added image concatenation; refactoring 2022-04-07 11:42:38 +02:00
Matthias Bisping
3266e0af58 refactoring; added metadata merging logic 2022-04-06 15:55:35 +02:00
Matthias Bisping
7e2696d5c5 stitching impl wip 2022-04-05 23:39:17 +02:00
Matthias Bisping
302613bf2b refactoring
eager eval because double iter later
2022-04-05 23:08:41 +02:00
Matthias Bisping
66fd103d1b refactoring 2022-04-05 22:56:08 +02:00
Matthias Bisping
6e5d6912ed refactoring 2022-04-05 22:53:26 +02:00
Matthias Bisping
b1efb5ed09 refactoring 2022-04-05 19:40:13 +02:00
Matthias Bisping
ef70e11352 refactoring 2022-04-05 19:38:29 +02:00
Matthias Bisping
315679468b applied black 2022-04-05 19:35:36 +02:00
Matthias Bisping
64e3350dee refactoring 2022-04-05 19:35:13 +02:00
Matthias Bisping
6a7e0e1000 refactoring 2022-04-05 19:33:22 +02:00
Matthias Bisping
11fc63035d refactoring 2022-04-05 19:03:31 +02:00
Matthias Bisping
4bc295b212 refactoring 2022-04-05 18:57:08 +02:00
Matthias Bisping
4c46be4abc test param adjustment 2022-04-05 18:09:43 +02:00
Matthias Bisping
37ee086b5d applied black 2022-04-05 17:55:38 +02:00
Matthias Bisping
1fd30e68b6 test data generation for image stitching 2022-04-05 17:54:43 +02:00
Matthias Bisping
2c908162f1 refactoring 2022-04-05 16:31:57 +02:00
Matthias Bisping
4756b8c9bd refactoring 2022-04-05 13:03:22 +02:00
Matthias Bisping
e0885c545a added page range parameter to extractor 2022-04-05 13:03:17 +02:00
Matthias Bisping
fdb7ebe618 logging change 2022-04-04 23:37:49 +02:00
Matthias Bisping
ce69f7d160 removed obsolete imports 2022-04-04 21:50:10 +02:00
Matthias Bisping
8f61c4cba2 doc.extract_image(xref) can yield None; hence added filtering for None images 2022-04-04 21:49:45 +02:00
Matthias Bisping
f3e2b2335f updated dependency versions 2022-04-04 19:35:49 +02:00
Matthias Bisping
9cda65ad41 removed obsolete code 2022-04-04 18:30:43 +02:00
Matthias Bisping
692e72b3b2 refactoring 2022-04-04 18:29:17 +02:00
Matthias Bisping
38869d52c6 refactoring 2022-04-04 18:17:49 +02:00
Matthias Bisping
e01b5c9acd refactoring 2022-04-04 15:50:09 +02:00
Matthias Bisping
6a6fc19958 refactoring 2022-04-04 15:48:15 +02:00
Matthias Bisping
1b1f1aafef refactoring 2022-04-04 14:19:06 +02:00
Matthias Bisping
caef37376b renaming 2022-04-04 14:04:36 +02:00
Matthias Bisping
16aa951c96 refactoring 2022-04-04 14:01:19 +02:00
Matthias Bisping
89afb8f920 added coordinate transformation testing by images 2022-04-04 13:55:48 +02:00
Matthias Bisping
1ffc9dcc68 refactoring 2022-04-04 13:12:08 +02:00
Matthias Bisping
0976971117 refactoring 2022-04-04 10:23:22 +02:00
Matthias Bisping
b4b0058475 added additional corners coordinates for coordinate transformation tests 2022-04-04 10:18:23 +02:00
Matthias Bisping
2ee36dcb54 applied black 2022-04-03 04:48:11 +02:00
Matthias Bisping
ab382646b7 applied black 2022-04-03 04:47:49 +02:00
Matthias Bisping
8c916a79c3 updated gitignore 2022-04-03 04:47:36 +02:00
Matthias Bisping
3ff6dac2e0 added explanations for how the coordinate transformations were inferred 2022-04-03 04:46:52 +02:00
Matthias Bisping
d134884553 misc 2022-04-03 04:35:44 +02:00
Matthias Bisping
2d0545c928 refactoring 2022-04-03 04:31:50 +02:00
Matthias Bisping
65a4a8e34e refactoring 2022-04-03 04:25:10 +02:00
Matthias Bisping
39c111fd42 integrated PDFNet coordinate transformer into pipeline 2022-04-03 04:08:00 +02:00
Matthias Bisping
0376223c9d coordinate transformers refac 2022-04-03 04:00:15 +02:00
Matthias Bisping
bf85ef357c coordinate transformers version 1 completed 2022-04-03 03:51:31 +02:00
Matthias Bisping
f6a7a14a20 pdfnet coordinate transformer wip 2022-04-03 03:19:46 +02:00
Matthias Bisping
41f783dc5d coordinate transformer refac 2022-04-03 02:21:30 +02:00
Matthias Bisping
32397256c8 coordinate transformer wip 2022-04-03 02:20:03 +02:00
Matthias Bisping
f44e6f4fd7 coordinate transformer, added Fitz transformer 2022-04-03 02:15:41 +02:00
Matthias Bisping
3d2c97bc10 coordinate transformer wip 2022-04-03 01:58:51 +02:00
Matthias Bisping
9663cec12d coordinate transformer wip 2022-04-03 01:54:51 +02:00
Matthias Bisping
c1c3f541d4 coordinate transformer wip 2022-04-03 01:45:01 +02:00
Matthias Bisping
4d86e78307 muting logger in tests 2022-04-02 19:31:08 +02:00
Matthias Bisping
1cf6ab256c muting logger in tests 2022-04-02 18:34:13 +02:00
Matthias Bisping
a89e374c67 removed obsolete code 2022-04-02 03:41:55 +02:00
Matthias Bisping
0861e22542 fixed pipeline not working with flask... model was loaded in external process, probably; known issue 2022-04-02 03:38:44 +02:00
Matthias Bisping
7827869af4 fixed logger's logging level 2022-04-02 02:58:30 +02:00
Matthias Bisping
613bba8cfc ... 2022-04-02 02:45:21 +02:00
Matthias Bisping
5c23898280 added log messages to all pipeline components; converting pipeline output to list for REST transport; refactoring; added e2e test (flask + pipeline)... but hangs 2022-04-02 02:44:30 +02:00
Matthias Bisping
e8d0299e46 refactoring 2022-04-02 01:27:30 +02:00
Matthias Bisping
cb00aed62c refactoring 2022-04-02 01:23:57 +02:00
Matthias Bisping
1501653673 coverage increased for flask tests 2022-04-02 00:16:01 +02:00
Matthias Bisping
b4b929b65f added mocked server tests with flask testing utilities 2022-04-01 21:55:59 +02:00
Matthias Bisping
3d1c251e10 removed redundant TF env var export 2022-04-01 21:35:10 +02:00
Matthias Bisping
c80549d5d3 refactoring: model wrapper to base class and derived class for efficient net 2022-04-01 21:32:18 +02:00
Matthias Bisping
070749880e removed obsolete code 2022-04-01 21:13:15 +02:00
Matthias Bisping
94783c54f2 eliminated redai dependency; updated requirement versions 2022-04-01 21:10:41 +02:00
Matthias Bisping
2b48c6108b added coverage.process_startup for multiprocessing coverage... but does not quite work yet 2022-04-01 19:51:33 +02:00
Matthias Bisping
da9b3d0cb9 applied black 2022-04-01 19:50:44 +02:00
Matthias Bisping
c372529ee5 dynamic waiting for server to be ready in tests 2022-04-01 19:04:41 +02:00
Matthias Bisping
1a1ece1f95 adjusted call of server running function 2022-04-01 12:22:24 +02:00
Matthias Bisping
426061e5ea applied black 2022-04-01 12:20:32 +02:00
Matthias Bisping
7c2cf44ad0 refactoring 2022-04-01 00:21:57 +02:00
Matthias Bisping
c125e1ff6c web server refactoring + tests 2022-03-31 23:43:14 +02:00
Matthias Bisping
dd007891c7 changed banner 2022-03-31 19:50:12 +02:00
Matthias Bisping
d3257fdeda refactoring 2022-03-31 19:39:08 +02:00
Matthias Bisping
1581880ec6 added updated version of serve.py 2022-03-31 19:38:35 +02:00
Matthias Bisping
268b83a1ff refactoring 2022-03-31 19:17:48 +02:00
Matthias Bisping
5caa9807e2 added response formatter and pipeline test 2022-03-31 19:01:32 +02:00
Matthias Bisping
82added50a empty implementation of abstract base class method 2022-03-31 17:29:05 +02:00
Matthias Bisping
b6ccfbcf8f removed obsolete import 2022-03-31 17:25:42 +02:00
Matthias Bisping
e17912caa9 derived enum formatter from key formatter 2022-03-31 17:22:54 +02:00
Matthias Bisping
3eaf9dc0e1 refactoring: introduced key mapper base class and proba mapper key enum 2022-03-31 16:55:58 +02:00
Matthias Bisping
0cefef4e15 more test cases for key transformer 2022-03-31 16:35:12 +02:00
Matthias Bisping
4f94cbd68d refactoring 2022-03-31 16:26:40 +02:00
Matthias Bisping
2517b45d44 fixed bug in camel case transformer 2022-03-31 15:55:15 +02:00
Matthias Bisping
2a62ad7aba typo 2022-03-31 15:48:52 +02:00
Matthias Bisping
20c980dbe6 fixed bug in camel case transformer 2022-03-31 15:47:45 +02:00
Matthias Bisping
726298b155 made formatter a transformer derivation 2022-03-31 15:26:30 +02:00
Matthias Bisping
479afbcd34 formatting 2022-03-31 15:20:41 +02:00
Matthias Bisping
4ab9f0d89b corrected camel case converter 2022-03-31 15:18:59 +02:00
Matthias Bisping
d4604a2cb5 renaming 2022-03-31 14:52:37 +02:00
Matthias Bisping
4ebb36247e refactoring 2022-03-31 14:49:46 +02:00
Matthias Bisping
7ec7390e90 refactoring 2022-03-31 12:52:35 +02:00
Matthias Bisping
dc1cdde458 refactoring; added compositor for formatters 2022-03-31 12:52:15 +02:00
Matthias Bisping
0921ef9a4f removed obsolete import 2022-03-31 11:12:59 +02:00
Matthias Bisping
91dd467142 applied black 2022-03-30 19:38:15 +02:00
Matthias Bisping
b3e1604ecc added floating point conversion to label mapper for json serializability 2022-03-30 19:36:45 +02:00
Matthias Bisping
20718996bd refactoring; testing of prediction model handel redai adapter 2022-03-30 19:01:54 +02:00
Matthias Bisping
cc8d87338c removed obsolete code 2022-03-30 18:17:35 +02:00
Matthias Bisping
258c1ab02d testing label mappers for raising of exceptions when encountering unexpected input formats 2022-03-30 18:15:45 +02:00
Matthias Bisping
ce3d33955e removing unused code / refactoring for coverage maximization 2022-03-30 18:03:21 +02:00
Matthias Bisping
a95cc4e06b added config tests 2022-03-30 17:55:49 +02:00
Matthias Bisping
6d1ace473b removed obsolete code 2022-03-30 16:35:47 +02:00
Matthias Bisping
0a22a35912 refactoring; renaming 2022-03-30 16:35:26 +02:00
Matthias Bisping
a5d3232dd0 testing index and probability label format in classifier prediction test 2022-03-30 16:34:17 +02:00
Matthias Bisping
49f9847d9a removed obsolete code 2022-03-30 16:07:45 +02:00
Matthias Bisping
1c6f5749dd updated classifier test for label mappers 2022-03-30 16:04:13 +02:00
Matthias Bisping
8bccec277f added array label mapper 2022-03-30 15:54:18 +02:00
Matthias Bisping
7f37f841dd renaming 2022-03-30 15:32:21 +02:00
Matthias Bisping
8c7e3e29f5 added label mapper 2022-03-30 14:17:58 +02:00
Matthias Bisping
99d8e921db renaming 2022-03-30 13:57:29 +02:00
Matthias Bisping
6835394d30 added formatter test; refactored batch_size fixture 2022-03-30 13:43:13 +02:00
Matthias Bisping
ad6bb80900 fixed sorting predictions by probabilities in wrong order 2022-03-30 01:14:03 +02:00
Matthias Bisping
95209a5c9d typo 2022-03-30 01:06:06 +02:00
Matthias Bisping
45a07c620a fixed chaining bug that led to greedy evaluation 2022-03-30 00:53:34 +02:00
Matthias Bisping
81ab9a5f53 tuning prediction format handling 2022-03-30 00:13:12 +02:00
Matthias Bisping
8b15ac6df4 docstring update 2022-03-29 23:57:09 +02:00
Matthias Bisping
e9489287bd support for array prediction format 2022-03-29 23:56:22 +02:00
Matthias Bisping
15c0b73034 support for different prediction formats 2022-03-29 23:41:43 +02:00
Matthias Bisping
7a64af156b refactoring 2022-03-29 22:59:01 +02:00
Matthias Bisping
60617fd622 added formatter to pipeline 2022-03-29 22:47:54 +02:00
Matthias Bisping
ade318c7b7 made classifier accept tuples of images in addition to np.arrays; added pipeline (wip) 2022-03-29 22:00:34 +02:00
Matthias Bisping
3339ed2eab removed unneeded adapter derivatives and made estimator adapter abstract base class to normal class 2022-03-29 20:44:26 +02:00
Matthias Bisping
7340fb6dda replaced string keys for metadata fields with enum members 2022-03-29 20:29:44 +02:00
Matthias Bisping
358d7ecd91 restructuring of modules 2022-03-29 20:02:40 +02:00
Matthias Bisping
d33a882d65 removed obsolete code 2022-03-29 19:54:14 +02:00
Matthias Bisping
06adedac57 reimplemented model loader logic and moved base weights into mlflow run dir 2022-03-29 19:50:43 +02:00
Matthias Bisping
edbc5c3f84 redoing model loading design 2022-03-29 18:21:14 +02:00
Matthias Bisping
f60bafd007 redoing model loading design 2022-03-29 17:25:06 +02:00
Matthias Bisping
a1c7dd4a8d added identity preprocessor; changed default preprocessor to identity 2022-03-29 11:40:58 +02:00
Matthias Bisping
6b58756103 refactoring of mlflow model loader 2022-03-29 11:02:43 +02:00
Matthias Bisping
3b4c2a40b2 added patched test for mlflow model loader 2022-03-28 21:51:21 +02:00
Matthias Bisping
c06905625d added model loader interface, model loader mock and mlflow model loader (the latter so far not tested) 2022-03-28 21:22:35 +02:00
Matthias Bisping
d44622dddc test parametrization changed 2022-03-28 19:52:24 +02:00
Matthias Bisping
3c6dfed508 made input size adjustable via test fixture 2022-03-28 19:22:31 +02:00
Matthias Bisping
f18e183ab0 added type hint 2022-03-28 18:54:28 +02:00
Matthias Bisping
86f2abc553 renaming 2022-03-28 18:52:39 +02:00
Matthias Bisping
f0a8f2224c refactoring 2022-03-28 18:50:18 +02:00
Matthias Bisping
9bf1dcbe1d removed obsolete import 2022-03-28 18:31:09 +02:00
Matthias Bisping
9ce7b6e6da refactoring 2022-03-28 18:30:51 +02:00
Matthias Bisping
e818b05472 applied black 2022-03-28 16:39:34 +02:00
Matthias Bisping
b818ee4724 fixed misaligned metadata and images 2022-03-28 16:38:46 +02:00
Julius Unverfehrt
9461be29d5 add ParsablePDFImageExtractor test 2022-03-28 15:42:54 +02:00
Julius Unverfehrt
2631eb5c0f add metadata fixture 2022-03-28 12:05:07 +02:00
Matthias Bisping
643ab99bd3 added parsable pdf image extractor 2022-03-28 11:27:05 +02:00
Matthias Bisping
e0ab365bb9 list -> generator 2022-03-28 00:05:37 +02:00
Matthias Bisping
48737d9439 added extractor classifier 2022-03-28 00:01:19 +02:00
Matthias Bisping
a5147c9a58 added image extractor interface and mock 2022-03-27 23:05:27 +02:00
Matthias Bisping
4c939464b0 renaming 2022-03-27 22:59:28 +02:00
Matthias Bisping
334dc79f7e refactoring 2022-03-27 18:13:58 +02:00
Matthias Bisping
9d58ae714f renaming 2022-03-27 17:55:01 +02:00
Matthias Bisping
0f811bdc56 removed unnecessary kwarg 2022-03-27 01:24:29 +01:00
Matthias Bisping
d11333981f applied black 2022-03-27 01:21:12 +01:00
Matthias Bisping
4fcd1e79d3 removed obsolete code; added missing __init__ for predictor 2022-03-27 01:20:03 +01:00
Matthias Bisping
5c5d132d7f fixed batching issue in prediction monkey patch by introducing an output generator that yields the expected predictions 2022-03-27 01:13:28 +01:00
Matthias Bisping
0f9510906d refactoring; added predictor; mocking of predict function is broken: fixing next commit 2022-03-26 21:19:02 +01:00
Matthias Bisping
6343229c1e added chunk_iterable tests 2022-03-26 20:24:59 +01:00
Matthias Bisping
7d21b0a585 refactoring 2022-03-26 19:54:18 +01:00
Matthias Bisping
364111db89 preprocessor refactoring 2022-03-26 19:38:34 +01:00
Matthias Bisping
ea298dacfa renaming 2022-03-26 19:27:37 +01:00
Matthias Bisping
373c619b0c formatting 2022-03-26 19:24:34 +01:00
Matthias Bisping
8aa0717007 added image-tensor conversion logic 2022-03-26 19:24:15 +01:00
Matthias Bisping
a3215e0bc3 renaming of service estimator to estimator 2022-03-25 18:24:05 +01:00
Matthias Bisping
c64bff0843 renaming of service estimator to estimator 2022-03-25 18:20:44 +01:00
Matthias Bisping
dd18087261 restructuring of modules 2022-03-25 18:18:17 +01:00
Matthias Bisping
d97b477208 added estimator preprocessor and removed adapter and adapter patch 2022-03-25 18:09:06 +01:00
Matthias Bisping
981d7816a0 refactoring: replaced estimator adapter with monkeypatch 2022-03-25 17:58:34 +01:00
Matthias Bisping
2e36a9d46d added type hint 2022-03-25 16:28:17 +01:00
Matthias Bisping
03f269c2d7 fixed incorrect PyCharm refactoring 2022-03-25 16:28:00 +01:00
Matthias Bisping
6853d862ed added comment motivating the implementation of the predict function of the adapter patch 2022-03-25 15:02:02 +01:00
Matthias Bisping
31591bef0f suppress tf-internal deprecation warning 2022-03-25 14:56:47 +01:00
Matthias Bisping
7834a65ff5 added keras estimator wrapper 2022-03-25 14:46:04 +01:00
Matthias Bisping
8b7293be09 introduced estimator-adapter and estimator-adapter-patch 2022-03-25 13:35:03 +01:00
Matthias Bisping
9c9070e8bf refactoring 2022-03-25 12:24:23 +01:00
Matthias Bisping
e8fb01b4b7 formatting 2022-03-25 11:49:02 +01:00
Matthias Bisping
41f0cc8a41 estimator + model label mapping 2022-03-25 11:42:31 +01:00
Matthias Bisping
ee959346b7 refactoring: estimator + model 2022-03-25 11:23:07 +01:00
167 changed files with 1682 additions and 43084 deletions

View File

@ -1,8 +1,6 @@
[core]
remote = azure_remote
remote = vector
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/image-prediction/
port = 22
['remote "azure_remote"']
url = azure://image-classification-dvc/

9
.gitignore vendored
View File

@ -1,8 +1,7 @@
.vscode/
*.h5
*venv
/venv/
.idea/
src/data
!.gitignore
*.project
@ -34,7 +33,6 @@ src/data
**/dependencies-and-licenses-overview.txt
.coverage
.coverage\.*\.*
*__pycache__
@ -48,6 +46,7 @@ src/data
*misc
/coverage_html_report/
.coverage\.*
# Created by https://www.toptal.com/developers/gitignore/api/linux,pycharm
# Edit at https://www.toptal.com/developers/gitignore?templates=linux,pycharm
@ -173,4 +172,6 @@ fabric.properties
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
/image_prediction/data/mlruns/
#/data/mlruns/

View File

@ -1,51 +0,0 @@
include:
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/dvc.gitlab-ci.yml"
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
variables:
NEXUS_PROJECT_DIR: red
IMAGENAME: "${CI_PROJECT_NAME}"
INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
# TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.
stages:
- data
- setup
- tests
- sonarqube
- versioning
- build
- integration-tests
- release
docker-build:
extends: .docker-build
needs:
- job: dvc-pull
artifacts: true
- !reference [.needs-versioning, needs] # leave this line as is
###################
# INTEGRATION TESTS
trigger-integration-tests:
extends: .integration-tests
# ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
# needs:
# - job: docker-build::model_name
# artifacts: true
rules:
- when: never
#########
# RELEASE
release:
extends: .release
needs:
- !reference [.needs-versioning, needs] # leave this line as is

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "incl/redai_image"]
path = incl/redai_image
url = ssh://git@git.iqser.com:2222/rr/redai_image.git

View File

@ -1 +0,0 @@
3.10

View File

@ -1,73 +1,25 @@
FROM python:3.10-slim AS builder
ARG BASE_ROOT="nexus.iqser.com:5001/red/"
ARG VERSION_TAG="latest"
ARG GITLAB_USER
ARG GITLAB_ACCESS_TOKEN
FROM ${BASE_ROOT}image-prediction-base:${VERSION_TAG}
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
WORKDIR /app/service
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
COPY src src
COPY data data
COPY image_prediction image_prediction
COPY incl/redai_image/redai incl/redai_image/redai
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY config.yaml config.yaml
ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
# Install dependencies differing from base image.
RUN python3 -m pip install -r requirements.txt
ARG VERSION=dev
LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
WORKDIR /app
###########
# ENV SETUP
ENV PYTHONDONTWRITEBYTECODE=true
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN apt-get update && \
apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN curl -sSL https://install.python-poetry.org | python3 -
RUN poetry --version
COPY pyproject.toml poetry.lock ./
RUN poetry config virtualenvs.create true && \
poetry config virtualenvs.in-project true && \
poetry config installer.max-workers 10 && \
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry install --without=dev -vv --no-interaction --no-root
###############
# WORKING IMAGE
FROM python:3.10-slim
WORKDIR /app
# COPY SOURCE CODE FROM BUILDER IMAGE
COPY --from=builder /app /app
# COPY BILL OF MATERIALS (BOM)
COPY bom.json /bom.json
ENV PATH="/app/.venv/bin:$PATH"
###################
# COPY SOURCE CODE
COPY ./src ./src
COPY ./config ./config
COPY ./data ./data
COPY banner.txt ./
RUN python3 -m pip install -e .
RUN python3 -m pip install -e incl/redai_image/redai
EXPOSE 5000
EXPOSE 8080
CMD [ "python", "src/serve.py"]
CMD ["python3", "src/serve.py"]

25
Dockerfile_base Normal file
View File

@ -0,0 +1,25 @@
FROM python:3.8 as builder1
# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
# Upgrade pip.
RUN python -m pip install --upgrade pip
# Make a directory for the service files and copy the service repo into the container.
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt
# Install dependencies.
RUN python3 -m pip install -r requirements.txt
# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.8
WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"
WORKDIR /app/service

View File

@ -1,43 +0,0 @@
FROM python:3.10
ARG USERNAME
ARG TOKEN
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
ARG VERSION=dev
LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
WORKDIR /app
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN curl -sSL https://install.python-poetry.org | python3 -
COPY ./data ./data
COPY ./test ./test
COPY ./config ./config
COPY ./src ./src
COPY pyproject.toml poetry.lock banner.txt config.yaml./
RUN poetry config virtualenvs.create false && \
poetry config installer.max-workers 10 && \
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
poetry install --without=dev -vv --no-interaction --no-root
EXPOSE 5000
EXPOSE 8080
RUN apt update --yes
RUN apt install vim --yes
RUN apt install poppler-utils --yes
CMD coverage run -m pytest test/ --tb=native -q -s -vvv -x && coverage combine && coverage report -m && coverage xml

136
README.md
View File

@ -1,143 +1,25 @@
### Setup
### Building
Build base image
```bash
docker build -t image-classification-image --progress=plain --no-cache \
-f Dockerfile \
--build-arg USERNAME=$GITLAB_USER \
--build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
.
setup/docker.sh
```
Build head image
```bash
docker build -f Dockerfile -t image-prediction . --build-arg BASE_ROOT=""
```
### Usage
#### Without Docker
```bash
py scripts/run_pipeline.py /path/to/a/pdf
```
#### With Docker
Shell 1
```bash
docker run --rm --net=host image-prediction
docker run --rm --net=host --rm image-prediction
```
Shell 2
```bash
python scripts/pyinfra_mock.py /path/to/a/pdf
python scripts/pyinfra_mock.py --pdf_path /path/to/a/pdf
```
### Tests
Run for example this command to execute all tests and get a coverage report:
```bash
coverage run -m pytest test --tb=native -q -s -vvv -x && coverage combine && coverage report -m
```
After having built the service container as specified above, you can also run tests in a container as follows:
```bash
./run_tests.sh
```
### Message Body Formats
#### Request Format
The request messages need to provide the fields `"dossierId"` and `"fileId"`. A request should look like this:
```json
{
"dossierId": "<string identifier>",
"fileId": "<string identifier>"
}
```
Any additional keys are ignored.
#### Response Format
Response bodies contain information about the identified class of the image, the confidence of the classification, the
position and size of the image as well as the results of additional convenience filters which can be configured through
environment variables. A response body looks like this:
```json
{
"dossierId": "debug",
"fileId": "13ffa9851740c8d20c4c7d1706d72f2a",
"data": [...]
}
```
An image metadata record (entry in `"data"` field of a response body) looks like this:
```json
{
"classification": {
"label": "logo",
"probabilities": {
"logo": 1.0,
"signature": 1.1599173226749333e-17,
"other": 2.994595513398207e-23,
"formula": 4.352109377281029e-31
}
},
"position": {
"x1": 475.95,
"x2": 533.4,
"y1": 796.47,
"y2": 827.62,
"pageNumber": 6
},
"geometry": {
"width": 57.44999999999999,
"height": 31.149999999999977
},
"alpha": false,
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.05975350599135938,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.8443017656500813,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
}
```
## Configuration
The configuration file is `config.yaml`. All relevant variables can also be set by
exporting the corresponding environment variables.
| __Environment Variable__ | Default | Description |
|------------------------------------|------------------------------------|----------------------------------------------------------------------------------------|
| __LOGGING_LEVEL_ROOT__ | "INFO" | Logging level for log file messages |
| __VERBOSE__ | *true* | Service prints document processing progress to stdout |
| __BATCH_SIZE__ | 16 | Number of images in memory simultaneously per service instance |
| __RUN_ID__ | "fabfb1f192c745369b88cab34471aba7" | The ID of the mlflow run to load the image classifier from |
| __MIN_REL_IMAGE_SIZE__ | 0.05 | Minimally permissible image size to page size ratio |
| __MAX_REL_IMAGE_SIZE__ | 0.75 | Maximally permissible image size to page size ratio |
| __MIN_IMAGE_FORMAT__ | 0.1 | Minimally permissible image width to height ratio |
| __MAX_IMAGE_FORMAT__ | 10 | Maximally permissible image width to height ratio |
See also: https://git.iqser.com/projects/RED/repos/helm/browse/redaction/templates/image-service-v2

40
bamboo-specs/pom.xml Normal file
View File

@ -0,0 +1,40 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>7.1.2</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<sonar.skip>true</sonar.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

View File

@ -0,0 +1,182 @@
package buildjob;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
/**
* Plan configuration for Bamboo.
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
*
* The plan consists of a "Build Stage" (jobs: Docker build, Sonar scan) followed by a "Licence Stage"
* (jobs: git tagging, and a disabled licence job).
*/
@BambooSpec
public class PlanSpec {
// Name of the service: used as the plan name, as script argument and as part of the linked repository name.
private static final String SERVICE_NAME = "image-prediction";
// Name of the base image; passed as second argument to the Docker build script.
private static final String SERVICE_NAME_BASE = "image-prediction-base";
// Plan key derived from the service name: upper-cased with all '-' and '_' removed.
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
/**
* Run main to publish plan on Bamboo
*/
public static void main(final String[] args) throws Exception {
//By default credentials are read from the '.credentials' file.
BambooServer bambooServer = new BambooServer("http://localhost:8085");
// Publish the plan first, then the permissions that refer to the plan's identifier.
Plan plan = new PlanSpec().createDockerBuildPlan();
bambooServer.publish(plan);
PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
bambooServer.publish(planPermission);
}
/**
* Builds the permissions for the plan with the given identifier: the user "atlbamboo" additionally gets ADMIN,
* the groups "research", "Development" and "QA" get EDIT/VIEW/CLONE/BUILD, and logged-in as well as anonymous
* users may view the plan.
*/
private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
Permissions permission = new Permissions()
.userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.loggedInUserPermissions(PermissionType.VIEW)
.anonymousUserPermissionView();
return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
}
/**
* The Bamboo project (name and key "RED") the plan is created in.
*/
private Project project() {
return new Project()
.name("RED")
.key(new BambooKey("RED"));
}
/**
* Assembles the complete build plan: stages, jobs and tasks, the linked repositories, the Bitbucket Server
* trigger and the plan branch management settings.
*/
public Plan createDockerBuildPlan() {
return new Plan(
project(),
SERVICE_NAME, new BambooKey(SERVICE_KEY))
.description("Docker build for image-prediction.")
// .variables()
.stages(new Stage("Build Stage")
.jobs(
// Job 1: checks out both repositories, sets up SSH access and runs the Docker build script.
new Job("Build Job", new BambooKey("BUILD"))
.tasks(
new CleanWorkingDirectoryTask()
.description("Clean working directory.")
.enabled(true),
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
// The research repository is checked out into the sub-directory "redai_image".
new VcsCheckoutTask()
.description("Checkout redai_image research repository.")
.checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
// Writes the base64-decoded agent key (appended via '>>') and an SSH config for vector.iqser.com
// into ~/.ssh and restricts both files to mode 600.
new ScriptTask()
.description("Set config and keys.")
.inlineBody("mkdir -p ~/.ssh\n" +
"echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
"echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
"echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
"chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
// The build script receives the service name and the base image name as one space-separated argument string.
new ScriptTask()
.description("Build Docker container.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
.argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
// The job runs inside the release_build image with the host's Docker socket mounted.
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
.volume("/var/run/docker.sock", "/var/run/docker.sock")),
// Job 2: same checkout and SSH setup as the build job, then runs the Sonarqube scan script.
new Job("Sonar Job", new BambooKey("SONAR"))
.tasks(
new CleanWorkingDirectoryTask()
.description("Clean working directory.")
.enabled(true),
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new VcsCheckoutTask()
.description("Checkout redai_image repository.")
.checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
new ScriptTask()
.description("Set config and keys.")
.inlineBody("mkdir -p ~/.ssh\n" +
"echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
"echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
"echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
"chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
new ScriptTask()
.description("Run Sonarqube scan.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
.argument(SERVICE_NAME))
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
.volume("/var/run/docker.sock", "/var/run/docker.sock"))),
new Stage("Licence Stage")
.jobs(
// Job 3: git-tag.sh writes the file "git.tag"; its content is injected as variables in namespace "g"
// and the resulting ${bamboo.g.gitTag} is used as tag name on the default repository.
new Job("Git Tag Job", new BambooKey("GITTAG"))
.tasks(
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build git tag.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/git-tag.sh"),
new InjectVariablesTask()
.description("Inject git tag.")
.path("git.tag")
.namespace("g")
.scope(InjectVariablesScope.LOCAL),
new VcsTagTask()
.description("${bamboo.g.gitTag}")
.tagName("${bamboo.g.gitTag}")
.defaultRepository())
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
// Job 4: licence creation; currently disabled via enabled(false).
new Job("Licence Job", new BambooKey("LICENCE"))
.enabled(false)
.tasks(
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build licence.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
.linkedRepositories("RR / " + SERVICE_NAME)
.linkedRepositories("RR / redai_image")
.triggers(new BitbucketServerTrigger())
// Plan branches are created for VCS branches and deleted after 14 days of inactivity in the repository.
.planBranchManagement(new PlanBranchManagement()
.createForVcsBranch()
.delete(new BranchCleanup()
.whenInactiveInRepositoryAfterDays(14))
.notificationForCommitters());
}
}

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Sets the Maven project version to the Bamboo version tag and deploys the build to the
# release repository. Nothing is done when the version tag is "dev".
set -e

if [[ "${bamboo_version_tag}" != "dev" ]]
then
    # Every expansion is quoted so that paths containing whitespace are not split into several words.
    mvn_bin="${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn"
    pom_file="${bamboo_build_working_directory}/pom.xml"

    "${mvn_bin}" \
        -f "${pom_file}" \
        versions:set \
        -DnewVersion="${bamboo_version_tag}"

    # NOTE(review): the three maven.wagon.http.ssl.* switches relax TLS checks for the upload;
    # kept as-is, confirm they are still required for nexus.iqser.com.
    "${mvn_bin}" \
        -f "${pom_file}" \
        -B clean deploy \
        -e -DdeployAtEnd=true \
        -Dmaven.wagon.http.ssl.insecure=true \
        -Dmaven.wagon.http.ssl.allowall=true \
        -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
        -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Pulls the DVC-tracked data, builds the base image and the service image and pushes the
# service image to the registry.
#
# Usage: docker-build.sh <service-name> <base-image-name>
set -e

# Fail early with a usage message instead of building images with an empty name.
SERVICE_NAME="${1:?usage: $0 <service-name> <base-image-name>}"
SERVICE_NAME_BASE="${2:?usage: $0 <service-name> <base-image-name>}"
REGISTRY="nexus.iqser.com:5001"

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
dvc pull

# NOTE(review): this appends the Nexus credentials to pip.conf inside the Docker build context;
# presumably the Dockerfiles consume it -- confirm the file does not end up in a published image layer.
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf

docker build -f Dockerfile_base -t "${REGISTRY}/red/${SERVICE_NAME_BASE}:${bamboo_version_tag}" .
docker build -f Dockerfile -t "${REGISTRY}/red/${SERVICE_NAME}:${bamboo_version_tag}" --build-arg VERSION_TAG="${bamboo_version_tag}" .

echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin "${REGISTRY}"

# Only the service image is pushed; the base image is built locally but not pushed by this script.
docker push "${REGISTRY}/red/${SERVICE_NAME}:${bamboo_version_tag}"

View File

@ -0,0 +1,9 @@
#!/bin/bash
# Writes the tag for this build to the file git.tag as "gitTag=<value>":
# "<branch>_<build number>" for "dev" builds, the version tag itself otherwise.
set -e

if [[ "${bamboo_version_tag}" = "dev" ]]; then
    git_tag="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
else
    git_tag="${bamboo_version_tag}"
fi

echo "gitTag=${git_tag}" > git.tag

View File

@ -0,0 +1,51 @@
#!/bin/bash
# Installs the package into a fresh virtual environment, runs dependency-check and then a
# Sonarqube scan -- as a branch scan, or as a pull-request scan when a PR key is set.
#
# Usage: sonar-scan.sh <service-name>
set -e

# Fail early with a usage message instead of scanning under the project key "RED_".
SERVICE_NAME="${1:?usage: $0 <service-name>}"

export JAVA_HOME=/usr/bin/sonar-scanner/jre

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

echo "dev setup for unit test and coverage 💖"
pip install -e .
pip install -r requirements.txt

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
    --disableAssembly -s . -o reports --project "${SERVICE_NAME}" --exclude ".git/**" --exclude "venv/**" \
    --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"

# Arguments shared by the branch scan and the pull-request scan.
# NOTE(review): reports/coverage.xml is not produced anywhere in this script -- confirm it is generated elsewhere.
scanner_args=(
    "-Dsonar.projectKey=RED_${SERVICE_NAME}"
    "-Dsonar.sources=image_prediction"
    "-Dsonar.host.url=https://sonarqube.iqser.com"
    "-Dsonar.login=${bamboo_sonarqube_api_token_secret}"
    "-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json"
    "-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml"
    "-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html"
    "-Dsonar.python.coverage.reportPaths=reports/coverage.xml"
)

if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    scanner_args+=("-Dsonar.branch.name=${bamboo_planRepository_1_branch}")
else
    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
    scanner_args+=(
        "-Dsonar.pullrequest.key=${bamboo_repository_pr_key}"
        "-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch}"
        "-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch}"
    )
fi

/usr/bin/sonar-scanner/bin/sonar-scanner "${scanner_args[@]}"

View File

@ -0,0 +1,16 @@
package buildjob;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;
/**
 * Offline validation of the Bamboo plan defined in {@link PlanSpec}.
 */
public class PlanSpecTest {

    /**
     * Builds the entity properties of the plan without contacting a Bamboo server; the build throws a
     * {@link PropertiesValidationException} if the plan definition is invalid.
     */
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        final PlanSpec spec = new PlanSpec();
        final Plan dockerBuildPlan = spec.createDockerBuildPlan();

        EntityPropertiesBuilders.build(dockerBuildPlan);
    }
}

33697
bom.json

File diff suppressed because it is too large Load Diff

28
config.yaml Normal file
View File

@ -0,0 +1,28 @@
webserver:
host: $SERVER_HOST|"127.0.0.1" # webserver address
port: $SERVER_PORT|5000 # webserver port
mode: $SERVER_MODE|production # webserver mode: {development, production}
service:
logging_level: INFO # Logging level for service logger
progressbar: True # Whether a progress bar over the pages of a document is displayed while processing
batch_size: $BATCH_SIZE|32 # Number of images in memory simultaneously
verbose: $VERBOSE|True # Service prints document processing progress to stdout
run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
# These variables control filters that are applied to either images, image metadata or service_estimator predictions. The filter
# result values are reported in the service responses. For convenience, the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
filters:
image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
image_width_to_height_quotient: # Image width to height ratio
min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence

View File

@ -1,68 +0,0 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true
[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"
[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"
[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"
[webserver]
host = "0.0.0.0"
port = 8080
[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too big, since queue interactions (like receiving new messages) only happen in these intervals
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "image_request_queue"
service_request_exchange_name = "image_request_exchange"
service_response_exchange_name = "image_response_exchange"
service_dlq_name = "image_dlq"
[storage]
backend = "s3"
[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"
[storage.azure]
container = "redaction"
connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
[kubernetes]
pod_name = "test_pod"

View File

@ -1,42 +0,0 @@
[logging]
level = "INFO"
[service]
# Print document processing progress to stdout
verbose = false
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5
# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75
[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0
# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10
# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4
[filters.overrides.logo.image_to_page_quotient]
min = 0.06

View File

@ -1,5 +1,5 @@
outs:
- md5: ad061d607f615afc149643f62dbf37cc.dir
size: 166952700
- md5: 4219c52caf5f87f5a94f1ae00c60fb91.dir
size: 166952679
nfiles: 179
path: mlruns

View File

@ -24,11 +24,10 @@ class Classifier:
self.__pipe = rcompose(self.__estimator_adapter, self.__label_mapper)
def predict(self, batch: Union[np.array, Tuple[Image]]) -> List[str]:
if isinstance(batch, np.ndarray) and batch.shape[0] == 0:
if not isinstance(batch, tuple) and batch.shape[0] == 0:
return []
return self.__pipe(batch)
return list(self.__pipe(batch))
def __call__(self, batch: np.array) -> List[str]:
logger.debug("Classifier.predict")

View File

@ -0,0 +1,40 @@
"""Implements a config object with dot-indexing syntax."""
from envyaml import EnvYAML
from image_prediction.locations import CONFIG_FILE
def _get_item_and_maybe_make_dotindexable(container, item):
    """Return `container[item]`, wrapping it in a `DotIndexable` if it is a dict."""
    ret = container[item]
    return DotIndexable(ret) if isinstance(ret, dict) else ret


class DotIndexable:
    """Read-only view on a mapping whose entries can be accessed as attributes (`obj.key`) or items (`obj[key]`).

    Nested dicts are wrapped on access, so lookups can be chained: `obj.a.b`.
    """

    def __init__(self, x):
        self.x = x

    def __getattr__(self, item):
        """Look up `item` in the wrapped mapping.

        Raises:
            AttributeError: For `x` itself and for dunder names. `__getattr__` is only called when regular
                attribute lookup failed, so reaching this point with `x` means the instance is not initialised
                (e.g. while being copied or unpickled); reading `self.x` then would recurse without end.
                Dunder names are protocol probes (`hasattr`, `copy`, `pickle`), not configuration keys.
            KeyError: If `item` is not a key of the wrapped mapping.
        """
        if item == "x" or (item.startswith("__") and item.endswith("__")):
            raise AttributeError(item)
        return _get_item_and_maybe_make_dotindexable(self.x, item)

    def __repr__(self):
        return repr(self.x)

    def __getitem__(self, item):
        # Goes to the mapping directly, so item access is not subject to the attribute-name guard above.
        return _get_item_and_maybe_make_dotindexable(self.x, item)
class Config:
    """Configuration loaded from a YAML file via `EnvYAML`, accessible as `config.key` or `config[key]`.

    Unknown keys yield `None`; dict values are returned wrapped for dot-indexing.
    """

    def __init__(self, config_path):
        self.__config = EnvYAML(config_path)

    def __getattr__(self, item):
        """Return the top-level config entry `item`, or `None` if there is no such entry.

        Raises:
            AttributeError: For the private storage attribute and for dunder names. `__getattr__` is only
                called when regular lookup failed, so being asked for the storage attribute means the instance
                is not initialised (e.g. during copy or unpickling) and reading it would recurse without end.
                Dunder names are protocol probes (`hasattr`, `copy`, `pickle`) and must not appear to exist.
        """
        if item == "_Config__config" or (item.startswith("__") and item.endswith("__")):
            raise AttributeError(item)
        return self[item]

    def __getitem__(self, item):
        if item in self.__config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)
        return None
CONFIG = Config(CONFIG_FILE)

View File

@ -3,17 +3,17 @@ from funcy import juxt
from image_prediction.classifier.classifier import Classifier
from image_prediction.classifier.image_classifier import ImageClassifier
from image_prediction.compositor.compositor import TransformerCompositor
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
from image_prediction.extractor_classifier.extractor_classifier import ExtractorClassifier
from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.transformer.transformers.response import ResponseTransformer
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
from image_prediction.label_mapper.mappers.probability import ProbabilityMapper
from image_prediction.model_loader.loader import ModelLoader
from image_prediction.model_loader.loaders.mlflow import MlflowConnector
from image_prediction.redai_adapter.mlflow import MlflowModelReader
from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
from image_prediction.transformer.transformers.response import ResponseTransformer
def get_mlflow_model_loader(mlruns_dir):
@ -32,12 +32,16 @@ def get_extractor(**kwargs):
return image_extractor
def get_extractor_classifier(model_loader, model_identifier, **kwargs):
extractor_classifier = ExtractorClassifier(
get_extractor(**kwargs), get_image_classifier(model_loader, model_identifier)
)
return extractor_classifier
def get_formatter():
formatter = TransformerCompositor(
PDFNetCoordinateTransformer(), EnumFormatter(), ResponseTransformer(), Snake2CamelCaseKeyFormatter()
)
return formatter
def get_encoder():
return HashEncoder()

View File

@ -32,11 +32,3 @@ class IntentionalTestException(RuntimeError):
class InvalidBox(Exception):
pass
class ParsingError(Exception):
pass
class BadXref(ValueError):
pass

View File

@ -0,0 +1,32 @@
from itertools import chain
from typing import Iterable
from funcy import chunks
from image_prediction.classifier.image_classifier import ImageClassifier
from image_prediction.image_extractor.extractor import ImageExtractor
class ExtractorClassifier:
    """This class is responsible for orchestrating the pairing of classifications and image metadata. It extracts images
    from an object and classifies them. Then it ties the classification together with the metadata. It returns an
    iterable of dictionaries, where each dictionary has a field 'classification' for the classification and possibly
    additional fields for metadata -- metadata could be void.
    """

    def __init__(self, image_extractor: ImageExtractor, image_classifier: ImageClassifier, batch_size: int = 16):
        """
        Args:
            image_extractor: Callable yielding (image, metadata) pairs for an object.
            image_classifier: Callable mapping a batch of images to one prediction per image.
            batch_size: Number of images that are classified together in one batch.

        Raises:
            ValueError: If `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}.")
        self.classifier = image_classifier
        self.extractor = image_extractor
        self.batch_size = batch_size

    def __process_batch(self, batch):
        # Split the (image, metadata) pairs into two parallel tuples, classify all images at once and
        # re-attach each prediction to the metadata of its image.
        images, metadata = zip(*batch)
        predictions = self.classifier(images)
        return ({"classification": prd, **mdt} for prd, mdt in zip(predictions, metadata))

    def __call__(self, obj, **kwargs) -> Iterable[dict]:
        """Lazily extract, batch and classify the images of `obj`; `kwargs` are forwarded to the extractor."""
        image_metadata_pairs = self.extractor(obj, **kwargs)
        batches = chunks(self.batch_size, image_metadata_pairs)
        return chain.from_iterable(map(self.__process_batch, batches))

View File

@ -1,20 +1,39 @@
import multiprocessing
import traceback
from typing import Callable
from flask import Flask, request, jsonify
from prometheus_client import generate_latest, CollectorRegistry, Summary
from image_prediction.utils import get_logger
from image_prediction.utils.process_wrapping import wrap_in_process
logger = get_logger()
def run_in_process(func):
p = multiprocessing.Process(target=func)
p.start()
p.join()
def wrap_in_process(func_to_wrap):
def build_function_and_run_in_process(*args, **kwargs):
def func():
try:
result = func_to_wrap(*args, **kwargs)
return_dict["result"] = result
except:
logger.error(traceback.format_exc())
manager = multiprocessing.Manager()
return_dict = manager.dict()
run_in_process(func)
return return_dict.get("result", None)
return build_function_and_run_in_process
def make_prediction_server(predict_fn: Callable):
app = Flask(__name__)
registry = CollectorRegistry(auto_describe=True)
metric = Summary(
f"redactmanager_imageClassification_seconds", f"Time spent on image-service classification.", registry=registry
)
@app.route("/ready", methods=["GET"])
def ready():
@ -34,8 +53,6 @@ def make_prediction_server(predict_fn: Callable):
return response
@app.route("/predict", methods=["POST"])
@app.route("/", methods=["POST"])
@metric.time()
def predict():
# Tensorflow does not free RAM. Workaround: Run prediction function (which instantiates a model) in sub-process.
@ -45,7 +62,7 @@ def make_prediction_server(predict_fn: Callable):
logger.info("Analysing...")
predictions = predict_fn_wrapped(request.data)
if predictions is not None:
if predictions:
response = jsonify(predictions)
logger.info("Analysis completed.")
return response
@ -53,8 +70,4 @@ def make_prediction_server(predict_fn: Callable):
logger.error("Analysis failed.")
return __failure()
@app.route("/prometheus", methods=["GET"])
def prometheus():
return generate_latest(registry=registry)
return app

View File

@ -0,0 +1,181 @@
import atexit
import io
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse, repeat
from operator import itemgetter
from typing import List
import fitz
from PIL import Image
from funcy import rcompose, merge, zipdict
from tqdm import tqdm
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
class ParsablePDFImageExtractor(ImageExtractor):
    """Extracts the images of a PDF together with their metadata and stitches adjacent images of a page."""

    def __init__(self, verbose=False, tolerance=0):
        """
        Args:
            verbose: Whether to show progressbar
            tolerance: Distance in pixels between images beyond which they are not stitched together
        """
        self.doc: fitz.fitz.Document = None
        self.verbose = verbose
        self.tolerance = tolerance

    def extract(self, pdf: bytes, page_range: range = None):
        """Yield the image-metadata pairs of the given PDF.

        Args:
            pdf: The PDF document as bytes.
            page_range: Pages to process; `None` processes the whole document.
        """
        self.doc = fitz.Document(stream=pdf)

        # Compare against None explicitly: an empty range is falsy, but it must select no pages rather than
        # silently fall back to processing the whole document.
        restrict_pages = page_range is not None
        pages = extract_pages(self.doc, page_range) if restrict_pages else self.doc

        image_metadata_pairs = chain.from_iterable(
            map(
                self.__process_images_on_page,
                tqdm(
                    pages,
                    desc="Extracting",
                    disable=not self.verbose,
                    total=len(page_range) if restrict_pages else None,
                ),
            )
        )

        yield from image_metadata_pairs

    def __process_images_on_page(self, page: fitz.fitz.Page):
        images = get_images_on_page(self.doc, page)
        metadata = get_metadata_for_images_on_page(self.doc, page)
        # Both streams above are still unconsumed here; clearing now drops what was cached for the previous page.
        clear_caches()

        # `filter(all, ...)` drops every pair in which the image or the metadata is missing (falsy).
        image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
        image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)

        yield from image_metadata_pairs
def extract_pages(doc, page_range):
    """Lazily load the pages of `doc` selected by `page_range`.

    Both bounds of the range are shifted up by one before loading; the step of the range is not used.
    """
    # NOTE(review): confirm that the +1 shift matches the indexing convention of the callers.
    first, last = page_range.start + 1, page_range.stop + 1
    return map(doc.load_page, range(first, last))
@lru_cache(maxsize=None)
def get_images_on_page(doc, page: fitz.Page):
    """Return a lazy stream with one entry per image info of `page`: the loaded image, or None if loading failed."""
    # NOTE(review): the cached value is a one-shot iterator, so a cache hit would hand out an already
    # consumed stream; this is only safe as long as the cache is cleared between calls for the same page.
    load_image = partial(xref_to_image, doc)
    return (load_image(info["xref"]) for info in get_image_infos(page))
def get_metadata_for_images_on_page(doc, page: fitz.Page):
    """Yield one entry per image info of `page`, in the order of the image infos.

    The entry is the metadata dict of the image (box coordinates, size, page metadata and alpha channel flag),
    or None for a tiny image.

    Tiny images must not simply be removed from the stream: the images of the page and the alpha channel flags
    are derived from the same image infos purely by position, so removing an entry would pair every following
    image with the metadata of its successor. The None placeholder keeps the positions aligned; pairs with a
    missing part are dropped later, when images and metadata are zipped and filtered with `all`.
    """
    page_metadata = get_page_metadata(page)

    for image_info in get_image_infos(page):
        metadata = validate_box_coords(get_image_metadata(image_info))

        if tiny(metadata):
            yield None
            continue

        metadata = validate_box_size(metadata)
        alpha = {Info.ALPHA: has_alpha_channel(doc, image_info["xref"])}
        # Same merge order as before: alpha flag first, then page metadata, then the image's own metadata.
        yield merge(alpha, merge(page_metadata, metadata))
@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    """Return the image infos of `page`, including the xref of each image; cached per page."""
    image_infos = page.get_image_info(xrefs=True)
    return image_infos
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
    """Open the image stored under `xref` in `doc`; returns None if no image could be extracted."""
    handle = load_image_handle_from_xref(doc, xref)
    if not handle:
        return None
    return Image.open(io.BytesIO(handle["image"]))
def get_image_metadata(image_info):
    """Derive the rounded box coordinates and the box width and height from the "bbox" entry of an image info."""
    x1, y1, x2, y2 = (rounder(coordinate) for coordinate in image_info["bbox"])

    return {
        Info.WIDTH: abs(x2 - x1),
        Info.HEIGHT: abs(y2 - y1),
        Info.X1: x1,
        Info.X2: x2,
        Info.Y1: y1,
        Info.Y2: y2,
    }
def validate_coords_and_passthrough(metadata):
    """Lazily run `validate_box_coords` on every metadata record and yield its result."""
    for record in metadata:
        yield validate_box_coords(record)
def filter_out_tiny_images(metadata):
    """Lazily drop all metadata records that describe a tiny image (see `tiny`)."""
    return (record for record in metadata if not tiny(record))
def validate_size_and_passthrough(metadata):
    """Lazily run `validate_box_size` on every metadata record and yield its result."""
    for record in metadata:
        yield validate_box_size(record)
def add_page_metadata(page, metadata):
    """Lazily merge the metadata of `page` into every record; entries of the record win on key clashes."""
    # The page metadata is computed once, up front, and shared by all records.
    page_metadata = get_page_metadata(page)
    return (merge(page_metadata, record) for record in metadata)
def add_alpha_channel_info(doc, page, metadata):
    """Lazily add the alpha channel flag (`Info.ALPHA`) to every metadata record.

    The n-th record receives the flag of the n-th image info of `page`.
    """
    # NOTE(review): the pairing is purely positional -- it assumes `metadata` still holds exactly one record
    # per image info of the page, in the same order; confirm for the callers.
    alpha_flags = (has_alpha_channel(doc, info["xref"]) for info in get_image_infos(page))
    alpha_records = ({Info.ALPHA: flag} for flag in alpha_flags)
    return (merge(alpha_record, record) for alpha_record, record in zip(alpha_records, metadata))
@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
    """Return what `doc.extract_image` yields for `xref`; cached per (doc, xref)."""
    image_handle = doc.extract_image(xref)
    return image_handle
rounder = rcompose(round, int)
def get_page_metadata(page):
    """Return the rounded media box width and height of `page` together with the page number."""
    rounded_width, rounded_height = (rounder(dimension) for dimension in page.mediabox_size)

    page_metadata = {
        Info.PAGE_WIDTH: rounded_width,
        Info.PAGE_HEIGHT: rounded_height,
        Info.PAGE_IDX: page.number,
    }
    return page_metadata
def has_alpha_channel(doc, xref):
    """Tell whether the image stored under `xref` has an alpha channel.

    If the extracted image refers to a soft mask ("smask"), the mask is inspected; otherwise the pixmap of the
    image itself is.
    """
    handle = load_image_handle_from_xref(doc, xref)
    smask = handle["smask"] if handle else None

    if not smask:
        return bool(fitz.Pixmap(doc, xref).alpha)

    # Both checks are always carried out, in this order, before the results are combined.
    smask_is_extractable = doc.extract_image(smask) is not None
    smask_has_alpha = bool(fitz.Pixmap(doc, smask).alpha)
    return smask_is_extractable or smask_has_alpha
def tiny(metadata):
    """Tell whether the box described by `metadata` has an area (width times height) of at most 4."""
    area = metadata[Info.WIDTH] * metadata[Info.HEIGHT]
    return area <= 4
def clear_caches():
    """Empty the `lru_cache`s of all cached helper functions of this module."""
    cached_functions = (
        get_image_infos,
        load_image_handle_from_xref,
        get_images_on_page,
        xref_to_image,
    )
    for cached_function in cached_functions:
        cached_function.cache_clear()
atexit.register(clear_caches)

View File

@ -12,4 +12,3 @@ class Info(Enum):
Y1 = "y1"
Y2 = "y2"
ALPHA = "alpha"
XREF = "xref"

View File

@ -0,0 +1,17 @@
"""Defines constant paths relative to the module root path."""
from pathlib import Path
# Directory that contains this module, and the directory one level above it.
MODULE_DIR = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]

# Files that are expected directly in the package root.
CONFIG_FILE = PACKAGE_ROOT_DIR.joinpath("config.yaml")
BANNER_FILE = PACKAGE_ROOT_DIR.joinpath("banner.txt")

# Data directories; MLRUNS_DIR is deliberately a plain string, all other constants are Path objects.
DATA_DIR = PACKAGE_ROOT_DIR.joinpath("data")
MLRUNS_DIR = str(DATA_DIR.joinpath("mlruns"))
TEST_DATA_DIR = PACKAGE_ROOT_DIR.joinpath("test", "data")

View File

@ -0,0 +1,26 @@
import os
from funcy import rcompose
from image_prediction.config import CONFIG
from image_prediction.default_objects import get_extractor_classifier, get_formatter, get_mlflow_model_loader
from image_prediction.locations import MLRUNS_DIR
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
def load_pipeline(**kwargs):
    """Build a `Pipeline` for the model identified by the configured run id, loaded from `MLRUNS_DIR`.

    All keyword arguments are forwarded to the `Pipeline` constructor.
    """
    return Pipeline(get_mlflow_model_loader(MLRUNS_DIR), CONFIG.service.run_id, **kwargs)
class Pipeline:
    """Composition of the extractor-classifier and the formatter, applied to a PDF in that order."""

    def __init__(self, model_loader, model_identifier, **kwargs):
        """Keyword arguments are forwarded to `get_extractor_classifier`."""
        extractor_classifier = get_extractor_classifier(model_loader, model_identifier, **kwargs)
        formatter = get_formatter()
        self.pipe = rcompose(extractor_classifier, formatter)

    def __call__(self, pdf: bytes, page_range: range = None):
        """Yield the results of the pipeline for `pdf`, optionally restricted to `page_range`."""
        results = self.pipe(pdf, page_range=page_range)
        yield from results

View File

@ -3,7 +3,7 @@ from functools import reduce
from typing import Iterable, Callable, List
from PIL import Image
from funcy import juxt, first, rest, rcompose, rpartial, complement, ilen
from funcy import juxt, first, rest, rcompose, rpartial
from image_prediction.image_extractor.extractor import ImageMetadataPair
from image_prediction.info import Info
@ -13,22 +13,8 @@ from image_prediction.stitching.utils import make_coord_getter, flatten_groups_o
from image_prediction.utils.generic import until
def make_merger_sentinel():
def no_new_mergers(pairs):
nonlocal number_of_pairs_so_far
number_of_pairs_now = len(pairs)
if number_of_pairs_now == number_of_pairs_so_far:
return True
else:
number_of_pairs_so_far = number_of_pairs_now
return False
number_of_pairs_so_far = -1
return no_new_mergers
def no_new_merges(pairs1, pairs2):
return len(pairs1) == len(pairs2)
def merge_along_both_axes(pairs: Iterable[ImageMetadataPair], tolerance=0) -> List[ImageMetadataPair]:
@ -86,8 +72,7 @@ def merge_group_horizontally(group: Iterable[ImageMetadataPair], tolerance=0):
def merge_group(group: Iterable[ImageMetadataPair], direction, tolerance=0):
reduce_group = make_merger_aggregator(direction, tolerance=tolerance)
no_new_mergers = make_merger_sentinel()
return until(no_new_mergers, reduce_group, group)
return until(no_new_merges, reduce_group, group)
def make_merger_aggregator(axis, tolerance=0) -> Callable[[Iterable[ImageMetadataPair]], Iterable[ImageMetadataPair]]:

View File

@ -3,13 +3,11 @@ from typing import Iterable, List
from funcy import rpartial
from image_prediction.image_extractor.extractor import ImageMetadataPair
from image_prediction.stitching.merging import merge_along_both_axes, make_merger_sentinel
from image_prediction.stitching.merging import merge_along_both_axes, no_new_merges
from image_prediction.utils.generic import until
def stitch_pairs(pairs: Iterable[ImageMetadataPair], tolerance=0) -> List[ImageMetadataPair]:
"""Given a collection of image-metadata pairs from the same pages, combines all pairs that constitute adjacent
images."""
no_new_mergers = make_merger_sentinel()
merge = rpartial(merge_along_both_axes, tolerance)
return until(no_new_mergers, merge, pairs)
return until(no_new_merges, rpartial(merge_along_both_axes, tolerance), pairs)

View File

@ -1,5 +1,4 @@
import math
from dynaconf import Dynaconf
from operator import itemgetter
from image_prediction.config import CONFIG
@ -16,45 +15,38 @@ class ResponseTransformer(Transformer):
def build_image_info(data: dict) -> dict:
page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
def compute_geometric_quotient():
page_area_sqrt = math.sqrt(abs(page_width * page_height))
image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
return image_area_sqrt / page_area_sqrt
page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
)(data)
classification = data["classification"]
label = classification["label"]
representation = data["representation"]
geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
min_image_to_page_quotient_breached = bool(
geometric_quotient < get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "min")
)
max_image_to_page_quotient_breached = bool(
geometric_quotient > get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "max")
)
quotient = round(compute_geometric_quotient(), 4)
min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
min_image_width_to_height_quotient_breached = bool(
width / height < get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "min")
width / height < CONFIG.filters.image_width_to_height_quotient.min
)
max_image_width_to_height_quotient_breached = bool(
width / height > get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "max")
width / height > CONFIG.filters.image_width_to_height_quotient.max
)
min_confidence_breached = bool(
max(classification["probabilities"].values())
< get_class_specific_filter_value(label, CONFIG, "confidence", "min")
)
classification = data["classification"]
min_confidence_breached = bool(max(classification["probabilities"].values()) < CONFIG.filters.min_confidence)
image_info = {
"classification": classification,
"representation": representation,
"position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": data["page_idx"] + 1},
"geometry": {"width": width, "height": height},
"alpha": alpha,
"filters": {
"geometry": {
"imageSize": {
"quotient": geometric_quotient,
"quotient": quotient,
"tooLarge": max_image_to_page_quotient_breached,
"tooSmall": min_image_to_page_quotient_breached,
},
@ -78,23 +70,3 @@ def build_image_info(data: dict) -> dict:
}
return image_info
def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1):
    """Return sqrt(image area) divided by sqrt(page area).

    The image area is taken from the box spanned by (x1, y1) and (x2, y2);
    absolute values are used throughout, so the order of the coordinates and
    the sign of the page dimensions do not matter.

    Args:
        page_width: Width of the page.
        page_height: Height of the page.
        x2, x1: Horizontal box coordinates.
        y2, y1: Vertical box coordinates.

    Returns:
        The quotient as a float.

    Raises:
        ZeroDivisionError: If the page area is zero.
    """
    page_area = abs(page_width * page_height)
    box_width, box_height = abs(x2 - x1), abs(y2 - y1)
    return math.sqrt(box_width * box_height) / math.sqrt(page_area)
def get_class_specific_filter_value(label: str, settings: Dynaconf, filter_type: str, bound: str = None):
    """Look up a filter value, preferring a label-specific override.

    The override is read from ``settings.filters.overrides[label][filter_type]``;
    if that lookup raises ``KeyError`` the general value under
    ``settings.filters[filter_type]`` is used instead.

    Args:
        label: Class label used as key into ``settings.filters.overrides``.
        settings: Settings object exposing a ``filters`` section.
        filter_type: Name of the filter entry, e.g. ``"confidence"``.
        bound: Optional sub-key of the filter entry, e.g. ``"min"`` or ``"max"``.
            When falsy, the whole filter entry is returned.

    Returns:
        The override value when one is configured for ``label``, otherwise the
        general value. Errors from the general lookup propagate to the caller.
    """
    try:
        value = settings.filters.overrides[label][filter_type]
        if bound:
            value = value[bound]
    except KeyError:
        # No override for this label/filter/bound: fall back to the general
        # setting. The bound is only applied when given, mirroring the
        # override branch (previously this indexed with ``None``).
        value = settings.filters[filter_type]
        if bound:
            value = value[bound]
    else:
        # Only reached when the override lookup succeeded.
        logger.warning(f"Using {label=} specific {bound=} {filter_type=} {value=}.")
    return value

View File

@ -4,7 +4,8 @@ from image_prediction.locations import BANNER_FILE
def show_banner():
banner = load_banner()
with open(BANNER_FILE) as f:
banner = "\n" + "".join(f.readlines()) + "\n"
logger = logging.getLogger(__name__)
logger.propagate = False
@ -18,9 +19,3 @@ def show_banner():
logger.addHandler(handler)
logger.info(banner)
def load_banner():
    """Read the banner file and return its text framed by newlines.

    Returns:
        The full content of ``BANNER_FILE`` with one newline prepended and
        one appended.
    """
    with open(BANNER_FILE) as banner_file:
        content = banner_file.read()
    return f"\n{content}\n"

View File

@ -0,0 +1,7 @@
from funcy import iterate, chunks
def until(cond, func, *args, **kwargs):
    """Apply ``func`` repeatedly until ``cond`` holds for a pair of results.

    The sequence of states is produced by ``iterate(func, *args, **kwargs)``
    and consumed two at a time.

    Args:
        cond: Callable taking two successive states; a truthy result stops
            the iteration.
        func: Callable producing the next state from the current one.
        *args: Forwarded to ``iterate`` (the initial state).
        **kwargs: Forwarded to ``iterate``.

    Returns:
        The first state of the first pair for which ``cond`` is truthy.
    """
    # NOTE(review): chunks(2, ...) is expected to yield disjoint pairs
    # (s0, s1), (s2, s3), ... rather than overlapping ones, so (s1, s2) is
    # never compared -- confirm that this is intended.
    states = iterate(func, *args, **kwargs)
    for previous, current in chunks(2, states):
        if cond(previous, current):
            return previous

View File

@ -0,0 +1,29 @@
import logging
from image_prediction.config import CONFIG
logging.basicConfig()
def make_logger_getter():
    """Configure the "imclf" logger once and return an accessor for it.

    The logger does not propagate to its ancestors and gets a single stream
    handler with a timestamped format. Both the handler and the logger use
    the level from ``CONFIG.service.logging_level``.

    Returns:
        A zero-argument function that returns the configured logger.
    """
    imclf_logger = logging.getLogger("imclf")
    imclf_logger.propagate = False

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(CONFIG.service.logging_level)
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    )

    imclf_logger.addHandler(stream_handler)
    imclf_logger.setLevel(CONFIG.service.logging_level)

    def get_logger():
        # Closes over the logger configured above.
        return imclf_logger

    return get_logger
get_logger = make_logger_getter()

View File

@ -56,8 +56,7 @@ def annotate_image(doc, image_info):
def init():
PDFNet.Initialize(
# "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
"Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
"Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
)

Some files were not shown because too many files have changed in this diff Show More