chore: formatting and linting

commit 5d13d8b3d0
parent 9e04693ee1
@@ -1,4 +1,5 @@
-# cv-analysis — Visual (CV-Based) Document Parsing
+# cv-analysis - Visual (CV-Based) Document Parsing
+
 parse_pdf()
 This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
 previous redactions in documents.
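For orientation, the entry points touched by this commit are the image-level parsers imported throughout the diff (`parse_tables`, `parse_layout`, `detect_figures`). A minimal usage sketch follows; it is based only on the import paths and signatures visible in this diff, and the return values and exact argument conventions beyond that are assumptions:

# Hypothetical sketch: run the CV parsers on one rendered page image.
# Only the import paths and parse_tables(image, show=...) are taken from this diff;
# everything else (file name, return handling) is illustrative.
import cv2

from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.table_parsing import parse_tables

page = cv2.imread("page_0.png")          # a document page exported as an image

tables = parse_tables(page, show=False)  # table regions / table lines
layout = parse_layout(page)              # coarse layout boxes
figures = detect_figures(page)           # large coherent structures such as figures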
poetry.lock (generated, 145 lines changed)
@@ -305,19 +305,17 @@ tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
 
 [[package]]
 name = "astroid"
-version = "2.15.8"
+version = "3.1.0"
 description = "An abstract syntax tree for Python with inference support."
 optional = false
-python-versions = ">=3.7.2"
+python-versions = ">=3.8.0"
 files = [
-    {file = "astroid-2.15.8-py3-none-any.whl", hash = "sha256:1aa149fc5c6589e3d0ece885b4491acd80af4f087baafa3fb5203b113e68cd3c"},
-    {file = "astroid-2.15.8.tar.gz", hash = "sha256:6c107453dffee9055899705de3c9ead36e74119cee151e5a9aaf7f0b0e020a6a"},
+    {file = "astroid-3.1.0-py3-none-any.whl", hash = "sha256:951798f922990137ac090c53af473db7ab4e70c770e6d7fae0cec59f74411819"},
+    {file = "astroid-3.1.0.tar.gz", hash = "sha256:ac248253bfa4bd924a0de213707e7ebeeb3138abeb48d798784ead1e56d419d4"},
 ]
 
 [package.dependencies]
-lazy-object-proxy = ">=1.4.0"
 typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
-wrapt = {version = ">=1.11,<2", markers = "python_version < \"3.11\""}
 
 [[package]]
 name = "asttokens"
@@ -539,13 +537,13 @@ files = [
 
 [[package]]
 name = "blinker"
-version = "1.7.0"
+version = "1.8.1"
 description = "Fast, simple object-to-object and broadcast signaling"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "blinker-1.7.0-py3-none-any.whl", hash = "sha256:c3f865d4d54db7abc53758a01601cf343fe55b84c1de4e3fa910e420b438d5b9"},
-    {file = "blinker-1.7.0.tar.gz", hash = "sha256:e6820ff6fa4e4d1d8e2747c2283749c3f547e4fee112b98555cdcdae32996182"},
+    {file = "blinker-1.8.1-py3-none-any.whl", hash = "sha256:5f1cdeff423b77c31b89de0565cd03e5275a03028f44b2b15f912632a58cced6"},
+    {file = "blinker-1.8.1.tar.gz", hash = "sha256:da44ec748222dcd0105ef975eed946da197d5bdf8bafb6aa92f5bc89da63fa25"},
 ]
 
 [[package]]
@@ -2407,52 +2405,6 @@ sqs = ["boto3 (>=1.26.143)", "pycurl (>=7.43.0.5)", "urllib3 (>=1.26.16)"]
 yaml = ["PyYAML (>=3.10)"]
 zookeeper = ["kazoo (>=2.8.0)"]
 
-[[package]]
-name = "lazy-object-proxy"
-version = "1.10.0"
-description = "A fast and thorough lazy object proxy."
-optional = false
-python-versions = ">=3.8"
-files = [
-    {file = "lazy-object-proxy-1.10.0.tar.gz", hash = "sha256:78247b6d45f43a52ef35c25b5581459e85117225408a4128a3daf8bf9648ac69"},
-    {file = "lazy_object_proxy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:855e068b0358ab916454464a884779c7ffa312b8925c6f7401e952dcf3b89977"},
-    {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab7004cf2e59f7c2e4345604a3e6ea0d92ac44e1c2375527d56492014e690c3"},
-    {file = "lazy_object_proxy-1.10.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc0d2fc424e54c70c4bc06787e4072c4f3b1aa2f897dfdc34ce1013cf3ceef05"},
-    {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e2adb09778797da09d2b5ebdbceebf7dd32e2c96f79da9052b2e87b6ea495895"},
-    {file = "lazy_object_proxy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b1f711e2c6dcd4edd372cf5dec5c5a30d23bba06ee012093267b3376c079ec83"},
-    {file = "lazy_object_proxy-1.10.0-cp310-cp310-win32.whl", hash = "sha256:76a095cfe6045c7d0ca77db9934e8f7b71b14645f0094ffcd842349ada5c5fb9"},
-    {file = "lazy_object_proxy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:b4f87d4ed9064b2628da63830986c3d2dca7501e6018347798313fcf028e2fd4"},
-    {file = "lazy_object_proxy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fec03caabbc6b59ea4a638bee5fce7117be8e99a4103d9d5ad77f15d6f81020c"},
-    {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02c83f957782cbbe8136bee26416686a6ae998c7b6191711a04da776dc9e47d4"},
-    {file = "lazy_object_proxy-1.10.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:009e6bb1f1935a62889ddc8541514b6a9e1fcf302667dcb049a0be5c8f613e56"},
-    {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:75fc59fc450050b1b3c203c35020bc41bd2695ed692a392924c6ce180c6f1dc9"},
-    {file = "lazy_object_proxy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:782e2c9b2aab1708ffb07d4bf377d12901d7a1d99e5e410d648d892f8967ab1f"},
-    {file = "lazy_object_proxy-1.10.0-cp311-cp311-win32.whl", hash = "sha256:edb45bb8278574710e68a6b021599a10ce730d156e5b254941754a9cc0b17d03"},
-    {file = "lazy_object_proxy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:e271058822765ad5e3bca7f05f2ace0de58a3f4e62045a8c90a0dfd2f8ad8cc6"},
-    {file = "lazy_object_proxy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e98c8af98d5707dcdecc9ab0863c0ea6e88545d42ca7c3feffb6b4d1e370c7ba"},
-    {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:952c81d415b9b80ea261d2372d2a4a2332a3890c2b83e0535f263ddfe43f0d43"},
-    {file = "lazy_object_proxy-1.10.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80b39d3a151309efc8cc48675918891b865bdf742a8616a337cb0090791a0de9"},
-    {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e221060b701e2aa2ea991542900dd13907a5c90fa80e199dbf5a03359019e7a3"},
-    {file = "lazy_object_proxy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:92f09ff65ecff3108e56526f9e2481b8116c0b9e1425325e13245abfd79bdb1b"},
-    {file = "lazy_object_proxy-1.10.0-cp312-cp312-win32.whl", hash = "sha256:3ad54b9ddbe20ae9f7c1b29e52f123120772b06dbb18ec6be9101369d63a4074"},
-    {file = "lazy_object_proxy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:127a789c75151db6af398b8972178afe6bda7d6f68730c057fbbc2e96b08d282"},
-    {file = "lazy_object_proxy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4ed0518a14dd26092614412936920ad081a424bdcb54cc13349a8e2c6d106a"},
-    {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ad9e6ed739285919aa9661a5bbed0aaf410aa60231373c5579c6b4801bd883c"},
-    {file = "lazy_object_proxy-1.10.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc0a92c02fa1ca1e84fc60fa258458e5bf89d90a1ddaeb8ed9cc3147f417255"},
-    {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0aefc7591920bbd360d57ea03c995cebc204b424524a5bd78406f6e1b8b2a5d8"},
-    {file = "lazy_object_proxy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5faf03a7d8942bb4476e3b62fd0f4cf94eaf4618e304a19865abf89a35c0bbee"},
-    {file = "lazy_object_proxy-1.10.0-cp38-cp38-win32.whl", hash = "sha256:e333e2324307a7b5d86adfa835bb500ee70bfcd1447384a822e96495796b0ca4"},
-    {file = "lazy_object_proxy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:cb73507defd385b7705c599a94474b1d5222a508e502553ef94114a143ec6696"},
-    {file = "lazy_object_proxy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:366c32fe5355ef5fc8a232c5436f4cc66e9d3e8967c01fb2e6302fd6627e3d94"},
-    {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2297f08f08a2bb0d32a4265e98a006643cd7233fb7983032bd61ac7a02956b3b"},
-    {file = "lazy_object_proxy-1.10.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18dd842b49456aaa9a7cf535b04ca4571a302ff72ed8740d06b5adcd41fe0757"},
-    {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:217138197c170a2a74ca0e05bddcd5f1796c735c37d0eee33e43259b192aa424"},
-    {file = "lazy_object_proxy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a3a87cf1e133e5b1994144c12ca4aa3d9698517fe1e2ca82977781b16955658"},
-    {file = "lazy_object_proxy-1.10.0-cp39-cp39-win32.whl", hash = "sha256:30b339b2a743c5288405aa79a69e706a06e02958eab31859f7f3c04980853b70"},
-    {file = "lazy_object_proxy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:a899b10e17743683b293a729d3a11f2f399e8a90c73b089e29f5d0fe3509f0dd"},
-    {file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
-]
-
 [[package]]
 name = "loguru"
 version = "0.7.2"
@@ -2817,6 +2769,64 @@ files = [
     {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"},
 ]
 
+[[package]]
+name = "mypy"
+version = "1.10.0"
+description = "Optional static typing for Python"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"},
+    {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"},
+    {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"},
+    {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"},
+    {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"},
+    {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"},
+    {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"},
+    {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"},
+    {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"},
+    {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"},
+    {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"},
+    {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"},
+    {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"},
+    {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"},
+    {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"},
+    {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"},
+    {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"},
+    {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"},
+    {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"},
+    {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"},
+    {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"},
+    {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"},
+    {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"},
+    {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"},
+    {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"},
+    {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"},
+    {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"},
+]
+
+[package.dependencies]
+mypy-extensions = ">=1.0.0"
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
+typing-extensions = ">=4.1.0"
+
+[package.extras]
+dmypy = ["psutil (>=4.0)"]
+install-types = ["pip"]
+mypyc = ["setuptools (>=50)"]
+reports = ["lxml"]
+
+[[package]]
+name = "mypy-extensions"
+version = "1.0.0"
+description = "Type system extensions for programs checked with the mypy type checker."
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
+    {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
+]
+
 [[package]]
 name = "nanotime"
 version = "0.5.2"
@@ -3924,20 +3934,20 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"]
 
 [[package]]
 name = "pylint"
-version = "2.17.7"
+version = "3.1.0"
 description = "python code static checker"
 optional = false
-python-versions = ">=3.7.2"
+python-versions = ">=3.8.0"
 files = [
-    {file = "pylint-2.17.7-py3-none-any.whl", hash = "sha256:27a8d4c7ddc8c2f8c18aa0050148f89ffc09838142193fdbe98f172781a3ff87"},
-    {file = "pylint-2.17.7.tar.gz", hash = "sha256:f4fcac7ae74cfe36bc8451e931d8438e4a476c20314b1101c458ad0f05191fad"},
+    {file = "pylint-3.1.0-py3-none-any.whl", hash = "sha256:507a5b60953874766d8a366e8e8c7af63e058b26345cfcb5f91f89d987fd6b74"},
+    {file = "pylint-3.1.0.tar.gz", hash = "sha256:6a69beb4a6f63debebaab0a3477ecd0f559aa726af4954fc948c51f7a2549e23"},
 ]
 
 [package.dependencies]
-astroid = ">=2.15.8,<=2.17.0-dev0"
+astroid = ">=3.1.0,<=3.2.0-dev0"
 colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
 dill = {version = ">=0.2", markers = "python_version < \"3.11\""}
-isort = ">=4.2.5,<6"
+isort = ">=4.2.5,<5.13.0 || >5.13.0,<6"
 mccabe = ">=0.6,<0.8"
 platformdirs = ">=2.2.0"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
@@ -4579,6 +4589,17 @@ files = [
 docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"]
 test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"]
 
+[[package]]
+name = "types-pillow"
+version = "10.2.0.20240423"
+description = "Typing stubs for Pillow"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "types-Pillow-10.2.0.20240423.tar.gz", hash = "sha256:696e68b9b6a58548fc307a8669830469237c5b11809ddf978ac77fafa79251cd"},
+    {file = "types_Pillow-10.2.0.20240423-py3-none-any.whl", hash = "sha256:bd12923093b96c91d523efcdb66967a307f1a843bcfaf2d5a529146c10a9ced3"},
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.11.0"
@@ -4933,4 +4954,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.11"
-content-hash = "67faa0bb4b0477691c9cbb10f5a9e4a5d30c1bba99802f1059e7be63fe94db7d"
+content-hash = "06b9635bd0acdb0cd78ee9ab0e6a8c75ef91648c36bbc1f78ac44a5c0671990b"
@@ -30,14 +30,17 @@ kn-utils = { version = "0.2.7", source = "gitlab-research" }
 pdf2img = { version = "0.7.0", source = "gitlab-red" }
 dvc-azure = "^2.21.2"
 pymupdf = "^1.24.1"
+types-pillow = "^10.2.0.20240423"
 
 [tool.poetry.group.test.dependencies]
 pytest = "^7.0.1"
-pylint = "^2.17.4"
+pylint = "^3.1"
 
 
 [tool.poetry.group.dev.dependencies]
 ipython = "^8.21.0"
+mypy = "^1.10.0"
+pylint = "^3.1.0"
 
 [tool.pytest.ini_options]
 testpaths = ["test"]
@@ -57,6 +60,12 @@ name = "gitlab-red"
 url = "https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi/simple"
 priority = "explicit"
 
+[tool.pylint]
+max-line-length = 120
+docstring-min-length=4
+extension-pkg-whitelist = ["cv2"]
+extension-pkg-allow-list = ["cv2"]
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
@@ -45,6 +45,7 @@ if __name__ == "__main__":
     elif args.type == "layout":
         from cv_analysis.layout_parsing import parse_layout as analyze
     elif args.type == "figure":
-        from cv_analysis.figure_detection.figure_detection import detect_figures
+        from cv_analysis.figure_detection.figure_detection import \
+            detect_figures
         analyze = detect_figures
     annotate_page(page, analyze, draw, name=name, show=args.show)
@@ -1,14 +1,16 @@
 import argparse
 import timeit
+from itertools import starmap
+from pathlib import Path
+
+from funcy import lmap
+from pdf2img.conversion import convert_pages_to_images
 from PIL import Image
+
 from cv_analysis.figure_detection.figure_detection import detect_figures
 from cv_analysis.layout_parsing import parse_layout
 from cv_analysis.table_parsing import parse_tables
 from cv_analysis.utils.draw import draw_rectangles
-from funcy import lmap
-from itertools import starmap
-from pathlib import Path
-from pdf2img.conversion import convert_pages_to_images
 
 
 def parse_args():
@@ -1,15 +1,15 @@
 import argparse
-from dataclasses import dataclass, asdict, field
+from dataclasses import asdict, dataclass, field
 from operator import truth
 from typing import List
 
 import cv2
 import numpy as np
-from funcy import lfilter, lmap, lflatten
+from funcy import lfilter, lflatten, lmap
+from pdf2img.conversion import convert_pages_to_images
 
 from cv_analysis.table_parsing import parse_tables
 from cv_analysis.utils.display import show_image_mpl
-from pdf2img.conversion import convert_pages_to_images
 
 
 def parse_args():
@@ -1,5 +1,5 @@
-from pyinfra.k8s_probes import startup
 from loguru import logger
+from pyinfra.k8s_probes import startup
 
 if __name__ == "__main__":
     logger.debug("running health check")
@@ -4,10 +4,9 @@ import json
 import os
 from pathlib import Path
 
-from tqdm import tqdm
-
 from pyinfra.config import get_config
 from pyinfra.storage.storage import get_s3_storage
+from tqdm import tqdm
 
 CONFIG = get_config()
 
@@ -8,7 +8,7 @@ import numpy as np
 from funcy import lmap
 from matplotlib import pyplot as plt
 
-from cv_analysis.server.pipeline import make_analysis_pipeline, get_analysis_fn
+from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline
 
 
 def parse_args():
@@ -2,7 +2,6 @@ import argparse
 import json
 
 import pika
-
 from pyinfra.config import get_config
 from pyinfra.storage.storage import get_s3_storage
 
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from pyinfra.config.loader import load_settings
+from pyinfra.config.loader import load_settings  # type: ignore
 
 
 def get_config():
@@ -5,11 +5,7 @@ import numpy as np
 
 from cv_analysis.figure_detection.figures import detect_large_coherent_structures
 from cv_analysis.figure_detection.text import remove_primary_text_regions
-from cv_analysis.utils.filters import (
-    is_large_enough,
-    has_acceptable_format,
-    is_not_too_large,
-)
+from cv_analysis.utils.filters import has_acceptable_format, is_large_enough, is_not_too_large
 from cv_analysis.utils.postprocessing import remove_included
 from cv_analysis.utils.structures import Rectangle
 
@@ -1,22 +1,17 @@
 import itertools
-from itertools import compress
-from itertools import starmap
+from itertools import compress, starmap
 from operator import __and__
 
 import cv2
 import numpy as np
 
 
 from cv_analysis.utils.connect_rects import connect_related_rects2
+from cv_analysis.utils.postprocessing import has_no_parent, remove_included, remove_overlapping
 from cv_analysis.utils.structures import Rectangle
-from cv_analysis.utils.postprocessing import (
-    remove_overlapping,
-    remove_included,
-    has_no_parent,
-)
 from cv_analysis.utils.visual_logging import vizlogger
 
-#could be dynamic parameter is the scan is noisy
+# could be dynamic parameter is the scan is noisy
 def is_likely_segment(rect, min_area=100):
     return cv2.contourArea(rect, False) > min_area
 
@@ -34,7 +29,7 @@ def find_segments(image):
 
 
 def dilate_page_components(image):
-    #if text is detected in words make kernel bigger
+    # if text is detected in words make kernel bigger
     image = cv2.GaussianBlur(image, (7, 7), 0)
     thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
@@ -49,7 +44,6 @@ def fill_in_component_area(image, rect):
     return ~image
 
 
-
 def parse_layout(image: np.array):
     image = image.copy()
     image_ = image.copy()
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 MODULE_PATH = Path(__file__).resolve().parents[0]
-PACKAGE_ROOT_PATH = MODULE_PATH.parents[0] # i.e. /Users/USERNAME/gitlab/cv-analysis-service/src
+PACKAGE_ROOT_PATH = MODULE_PATH.parents[0]  # i.e. /Users/USERNAME/gitlab/cv-analysis-service/src
 REPO_ROOT_PATH = PACKAGE_ROOT_PATH
 TEST_DIR_PATH = REPO_ROOT_PATH / "test"
 TEST_DATA_DVC = TEST_DIR_PATH / "test_data.dvc"
@@ -2,9 +2,9 @@ from functools import partial
 
 import cv2
 import numpy as np
-from iteration_utilities import starfilter, first
+from iteration_utilities import first, starfilter
 
-from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy
+from cv_analysis.utils.filters import is_boxy, is_filled, is_large_enough
 from cv_analysis.utils.visual_logging import vizlogger
 
 
@@ -1,6 +1,6 @@
 from dataclasses import asdict
 from operator import itemgetter, truth
-from typing import Generator, Callable
+from typing import Callable, Generator
 
 from funcy import flatten, lmap
 from pdf2img.conversion import convert_pages_to_images
@@ -48,6 +48,7 @@ def make_image_analysis_pipeline(
     images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output)
     # rel_bboxes = map()
     img_results = lmap(analysis_fn, images)
+
     def make_offsets():
         ...
 
@@ -2,12 +2,12 @@ from operator import itemgetter
 from pathlib import Path
 from typing import Callable, Optional, Tuple
 
-import cv2
 import matplotlib.pyplot as plt
 import numpy as np
-from kn_utils.logging import logger
+from cv2 import cv2
+from kn_utils.logging import logger  # type: ignore
 from numpy import ndarray as Array
-from scipy.stats import norm
+from scipy.stats import norm  # type: ignore
 
 
 def show_multiple(arrs: Tuple[Array], title: str = ""):
@@ -70,9 +70,7 @@ def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
 def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
     kernel_size += int(not kernel_size % 2)
     wing_size = int((kernel_size - 1) / 2)
-    kernel = np.array(
-        list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1)))
-    )
+    kernel = np.array(list(map(lambda x: float(-(x**2)), range(-wing_size, wing_size + 1))))
     maxval, minval = np.max(kernel), np.min(kernel)
     diff = maxval - minval
     kernel += diff / (1 - ratio)
@@ -80,17 +78,16 @@ def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
     return kernel
 
 
-def min_avg_for_interval(filtered: Array, interval: int) -> float:
+def min_avg_for_interval(filtered: Array, interval: int) -> tuple[float, int]:
     n = len(filtered)
-    avgs = [np.mean(filtered[range(start, n, interval)]) for start in range(interval)]
-    best = min(avgs)
+    avgs: list[float] = [np.mean(filtered[range(start, n, interval)]) for start in range(interval)]
+    best: float = min(avgs)
     return best, avgs.index(best)
 
 
 def search_intervals(filtered: Array, min_interval: int, max_interval: int):
     performance = [
-        (interval, *min_avg_for_interval(filtered, interval))
-        for interval in range(min_interval, max_interval + 1)
+        (interval, *min_avg_for_interval(filtered, interval)) for interval in range(min_interval, max_interval + 1)
     ]
     best = min(performance, key=lambda x: x[1])
     return best[0], best[2]
@@ -98,7 +95,7 @@ def search_intervals(filtered: Array, min_interval: int, max_interval: int):
 
 def filter_array(
     array: Array,
-    sum_filter: Array,
+    sum_filter: Array | None,
     padding: Optional[Array] = None,
     pad_value_function: Callable[[Array], float] = lambda x: 255.0,  # np.mean,
 ) -> Array:
@@ -123,7 +120,7 @@ COL_FILTER2_WIDTH = 70
 COL_FILTER2_SD = 12
 COL_FILTER3_WIDTH = 200
 COL_FILTER3_SD = 20
-FILTERS = {
+FILTERS: dict[str, dict[int, np.ndarray | None]] = {
     "row": {
         1: make_gaussian_kernel(ROW_FILTER1_WIDTH, ROW_FILTER1_SD),
         2: make_gaussian_kernel(ROW_FILTER2_WIDTH, ROW_FILTER2_SD),
@@ -140,20 +137,13 @@ FILTERS = {
 def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
     if not line_list:
         return []
-    centers = list(
-        np.where(
-            (filt_sums[1:-1] < filt_sums[:-2]) * (filt_sums[1:-1] < filt_sums[2:])
-        )[0]
-        + 1
-    )
+    centers = list(np.where((filt_sums[1:-1] < filt_sums[:-2]) * (filt_sums[1:-1] < filt_sums[2:]))[0] + 1)
 
     if line_list[0] > centers[0]:
         centers = centers[1:] + [len(filt_sums) - 1]
     mindiff = np.std(filt_sums)
     line_list = [
-        maxidx
-        for maxidx, minidx in zip(line_list, centers)
-        if (filt_sums[maxidx] - filt_sums[minidx]) > mindiff
+        maxidx for maxidx, minidx in zip(line_list, centers) if (filt_sums[maxidx] - filt_sums[minidx]) > mindiff
     ]
     return line_list
 
@@ -161,7 +151,6 @@ def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
 def get_lines_either(table_array: Array, horizontal=True) -> Array:
     key = "row" if horizontal else "col"
 
-    filters = FILTERS
     sums = np.mean(table_array, axis=int(horizontal))
     threshold = 0.3 * 255  # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
     predicate = 1000.0 * (sums < threshold)
@@ -174,11 +163,7 @@ def get_lines_either(table_array: Array, horizontal=True) -> Array:
     filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
 
     lines = list(
-        np.where(
-            (filtered_sums[1:-1] > filtered_sums[:-2])
-            * (filtered_sums[1:-1] > filtered_sums[2:])
-        )[0]
-        + 1
+        np.where((filtered_sums[1:-1] > filtered_sums[:-2]) * (filtered_sums[1:-1] > filtered_sums[2:]))[0] + 1
     )
     if not horizontal:
         lines = filter_fp_col_lines(lines, filtered_sums)
@@ -191,7 +176,7 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
     return img_np
 
 
-def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
+def infer_lines(img: Array) -> dict[str, list[dict[str, int]] | list[dict[str, int]]]:
     cv2.imwrite("/tmp/table.png", img)
     _, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
     cv2.imwrite("/tmp/table_bin.png", img)
@@ -1,19 +1,15 @@
-import cv2
 import numpy as np
-from funcy import lfilter, lmap
+from cv2 import cv2
+from funcy import lfilter, lmap  # type: ignore
 
 from cv_analysis.layout_parsing import parse_layout
-from cv_analysis.utils.postprocessing import (
-    remove_isolated,
-)  # xywh_to_vecs, xywh_to_vec_rect, adjacent1d
+from cv_analysis.utils.postprocessing import remove_isolated  # xywh_to_vecs, xywh_to_vec_rect, adjacent1d
 from cv_analysis.utils.structures import Rectangle
 from cv_analysis.utils.visual_logging import vizlogger
 
 
-def add_external_contours(image, image_h_w_lines_only):
-    contours, _ = cv2.findContours(
-        image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
-    )
+def add_external_contours(image: np.ndarray, image_h_w_lines_only):
+    contours, _ = cv2.findContours(image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
     for cnt in contours:
         x, y, w, h = cv2.boundingRect(cnt)
         cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
@@ -21,7 +17,7 @@ def add_external_contours(image, image_h_w_lines_only):
     return image
 
 
-def apply_motion_blur(image: np.array, angle, size=80):
+def apply_motion_blur(image: np.ndarray, angle, size=80):
     """Solidifies and slightly extends detected lines.
 
     Args:
@@ -33,19 +29,19 @@ def apply_motion_blur(image: np.array, angle, size=80):
         np.array
 
     """
-    k = np.zeros((size, size), dtype=np.float32)
-    vizlogger.debug(k, "tables08_blur_kernel1.png")
-    k[(size - 1) // 2, :] = np.ones(size, dtype=np.float32)
-    vizlogger.debug(k, "tables09_blur_kernel2.png")
-    k = cv2.warpAffine(
-        k,
+    kernel = np.zeros((size, size), dtype=np.float32)
+    vizlogger.debug(kernel, "tables08_blur_kernel1.png")
+    kernel[(size - 1) // 2, :] = np.ones(size, dtype=np.float32)
+    vizlogger.debug(kernel, "tables09_blur_kernel2.png")
+    new_kernel: np.ndarray = cv2.warpAffine(
+        kernel,
         cv2.getRotationMatrix2D((size / 2 - 0.5, size / 2 - 0.5), angle, 1.0),
         (size, size),
     )
-    vizlogger.debug(k, "tables10_blur_kernel3.png")
-    k = k * (1.0 / np.sum(k))
-    vizlogger.debug(k, "tables11_blur_kernel4.png")
-    blurred = cv2.filter2D(image, -1, k)
+    vizlogger.debug(new_kernel, "tables10_blur_kernel3.png")
+    new_kernel = new_kernel * (1.0 / np.sum(new_kernel))
+    vizlogger.debug(new_kernel, "tables11_blur_kernel4.png")
+    blurred = cv2.filter2D(image, -1, new_kernel)
     return blurred
 
 
@@ -57,7 +53,7 @@ def isolate_vertical_and_horizontal_components(img_bin):
         bounding_rects (list): list of layout boxes of the form (x, y, w, h), potentially containing tables
 
     Returns:
-        np.array
+        np.ndarray
     """
     line_min_width = 48
     kernel_h = np.ones((1, line_min_width), np.uint8)
@@ -78,16 +74,14 @@ def isolate_vertical_and_horizontal_components(img_bin):
     img_bin_extended = img_bin_h | img_bin_v
 
     _, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY)
-    img_bin_final = cv2.dilate(
-        img_bin_extended, np.ones((1, 1), np.uint8), iterations=1
-    )
+    img_bin_final = cv2.dilate(img_bin_extended, np.ones((1, 1), np.uint8), iterations=1)
     # add contours before lines are extended by blurring
     img_bin_final = add_external_contours(img_bin_final, img_lines_raw)
 
     return img_bin_final
 
 
-def find_table_layout_boxes(image: np.array):
+def find_table_layout_boxes(image: np.ndarray):
     def is_large_enough(box):
         (_, _, w, h) = box
         if w * h >= 100000:
@@ -98,29 +92,27 @@ def find_table_layout_boxes(image: np.array):
     return lmap(is_large_enough, layout_boxes)
 
 
-def preprocess(image: np.array):
+def preprocess(image: np.ndarray):
     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
     _, image = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
     return ~image
 
 
-def turn_connected_components_into_rects(image: np.array):
+def turn_connected_components_into_rects(image: np.ndarray):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
         return area > 2000 and w > 35 and h > 25
 
-    _, _, stats, _ = cv2.connectedComponentsWithStats(
-        ~image, connectivity=8, ltype=cv2.CV_32S
-    )
+    _, _, stats_list, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)
 
-    stats = lfilter(is_large_enough, stats)
-    if stats:
-        stats = np.vstack(stats)
+    rects_list: list[np.ndarray] = lfilter(is_large_enough, stats_list)
+    if rects_list:
+        stats: np.ndarray = np.vstack(rects_list)
         return stats[:, :-1][2:]
     return []
 
 
-def parse_tables(image: np.array, show=False):
+def parse_tables(image: np.ndarray, show=False):
     """Runs the full table parsing process.
 
     Args:
@@ -142,7 +134,7 @@ def parse_tables(image: np.array, show=False):
 # def make_lines(image: np.array, horizontal=True, kernel_length=40)
 
 
-def detect_horizontal_lines(image_bin: np.array, kernel_length=40):
+def detect_horizontal_lines(image_bin: np.ndarray, kernel_length=40):
     line_min_width = 48
     kernel_h = np.ones((1, line_min_width), np.uint8)
     img_bin_h = cv2.morphologyEx(image_bin, cv2.MORPH_OPEN, kernel_h)
@@ -154,7 +146,7 @@ def detect_horizontal_lines(image_bin: np.array, kernel_length=40):
     return img_bin_h
 
 
-def detect_vertical_lines(image_bin: np.array, kernel_length=40):
+def detect_vertical_lines(image_bin: np.ndarray, kernel_length=40):
    line_min_width = 48
     kernel_v = np.ones((line_min_width, 1), np.uint8)
     img_bin_v = cv2.morphologyEx(image_bin, cv2.MORPH_OPEN, kernel_v)
@@ -166,12 +158,8 @@ def detect_vertical_lines(image_bin: np.array, kernel_length=40):
     return img_bin_v
 
 
-def detect_endpoints(
-    image: np.array, is_horizontal: bool
-) -> list[tuple[int, int, int, int]]:
-    def are_collinear(
-        quad1: tuple[int, int, int, int], quad2: tuple[int, int, int, int], index: int
-    ) -> bool:
+def detect_endpoints(image: np.ndarray, is_horizontal: bool) -> list[tuple[int, int, int, int]]:
+    def are_collinear(quad1: tuple[int, int, int, int], quad2: tuple[int, int, int, int], index: int) -> bool:
         dist_a = abs(quad1[index] - quad2[index])
         dist_b = abs(quad1[index + 2] - quad2[index + 2])
         overlap = True if index else (quad1[1] >= quad2[3] or quad1[3] >= quad2[1])
@@ -218,7 +206,7 @@ def detect_endpoints(
     return corrected
 
 
-def parse_lines(image: np.array, show=False) -> list[dict[str, list[int]]]:
+def parse_lines(image: np.ndarray, show=False) -> list[dict[str, list[int]]]:
     image = preprocess(image)
     # kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
     # image = cv2.dilate(image, kernel, iterations=4)
@@ -228,9 +216,7 @@ def parse_lines(image: np.array, show=False) -> list[dict[str, list[int]]]:
     horizontal_endpoints = detect_endpoints(horizontal_line_img, is_horizontal=True)
     vertical_endpoints = detect_endpoints(vertical_line_img, is_horizontal=False)
 
-    def format_quad(
-        quad: tuple[int, int, int, int], max_x: int, max_y: int
-    ) -> tuple[int, int, int, int]:
+    def format_quad(quad: tuple[int, int, int, int], max_x: int, max_y: int) -> tuple[int, int, int, int]:
         x1, y1, x2, y2 = quad
         if x1 > (x2 + 5):
             x1, y1, x2, y2 = x2, y2, x1, y1
@@ -8,7 +8,9 @@ from kn_utils.logging import logger
 
 
 def annotate_pdf(
-    pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None
+    pdf: Union[str, bytes, Path],
+    annotations,
+    output_path: Union[str, Path, None] = None,
 ):
     pdf_bytes = provide_byte_stream(pdf)
     with fitz.open(stream=pdf_bytes) as pdf_handle:
@@ -1,4 +1,4 @@
-from itertools import combinations, starmap, product
+from itertools import combinations, product, starmap
 from typing import Iterable
 
 
@@ -6,10 +6,14 @@ def is_near_enough(rect_pair, max_gap=14):
     x1, y1, w1, h1 = rect_pair[0]
     x2, y2, w2, h2 = rect_pair[1]
 
-    return any([abs(x1 - (x2 + w2)) <= max_gap,
-                abs(x2 - (x1 + w1)) <= max_gap,
-                abs(y2 - (y1 + h1)) <= max_gap,
-                abs(y1 - (y2 + h2)) <= max_gap])
+    return any(
+        [
+            abs(x1 - (x2 + w2)) <= max_gap,
+            abs(x2 - (x1 + w1)) <= max_gap,
+            abs(y2 - (y1 + h1)) <= max_gap,
+            abs(y1 - (y2 + h2)) <= max_gap,
+        ]
+    )
 
 
 def is_overlapping(rect_pair):
@@ -23,28 +27,41 @@ def is_overlapping(rect_pair):
 def is_on_same_line(rect_pair):
     x1, y1, w1, h1 = rect_pair[0]
     x2, y2, w2, h2 = rect_pair[1]
-    return any([any([abs(y1 - y2) <= 10,
-                     abs(y1 + h1 - (y2 + h2)) <= 10]),
-                any([y2 <= y1 and y1 + h1 <= y2 + h2,
-                     y1 <= y2 and y2 + h2 <= y1 + h1])])
+    return any(
+        [
+            any([abs(y1 - y2) <= 10, abs(y1 + h1 - (y2 + h2)) <= 10]),
+            any([y2 <= y1 and y1 + h1 <= y2 + h2, y1 <= y2 and y2 + h2 <= y1 + h1]),
+        ]
+    )
 
 
 def has_correct_position1(rect_pair):
     x1, y1, w1, h1 = rect_pair[0]
     x2, y2, w2, h2 = rect_pair[1]
-    return any([any([abs(x1 - x2) <= 10,
-                     abs(y1 - y2) <= 10,
-                     abs(x1 + w1 - (x2 + w2)) <= 10,
-                     abs(y1 + h1 - (y2 + h2)) <= 10]),
-                any([y2 <= y1 and y1 + h1 <= y2 + h2,
-                     y1 <= y2 and y2 + h2 <= y1 + h1,
-                     x2 <= x1 and x1 + w1 <= x2 + w2,
-                     x1 <= x2 and x2 + w2 <= x1 + w1])])
+    return any(
+        [
+            any(
+                [
+                    abs(x1 - x2) <= 10,
+                    abs(y1 - y2) <= 10,
+                    abs(x1 + w1 - (x2 + w2)) <= 10,
+                    abs(y1 + h1 - (y2 + h2)) <= 10,
+                ]
+            ),
+            any(
+                [
+                    y2 <= y1 and y1 + h1 <= y2 + h2,
+                    y1 <= y2 and y2 + h2 <= y1 + h1,
+                    x2 <= x1 and x1 + w1 <= x2 + w2,
+                    x1 <= x2 and x2 + w2 <= x1 + w1,
+                ]
+            ),
+        ]
+    )
 
 
 def is_related(rect_pair):
-    return (is_near_enough(rect_pair) and has_correct_position1(rect_pair)) or is_overlapping(
-        rect_pair)
+    return (is_near_enough(rect_pair) and has_correct_position1(rect_pair)) or is_overlapping(rect_pair)
 
 
 def fuse_rects(rect1, rect2):
@@ -1,10 +1,12 @@
 import os
 
 import cv2
 from matplotlib import pyplot as plt
 
 if os.environ["USER"] == "isaac":
     import matplotlib
-    matplotlib.use('module://matplotlib-backend-wezterm')
+
+    matplotlib.use("module://matplotlib-backend-wezterm")
 
 
 def show_image_cv2(image, maxdim=700):
@@ -4,7 +4,6 @@ from cv_analysis.utils import copy_and_normalize_channels
 
 
 def draw_contours(image, contours, color=None, annotate=False):
-
     image = copy_and_normalize_channels(image)
 
     for cont in contours:
@ -29,9 +29,7 @@ def transform_image_coordinates_to_pdf_coordinates(
|
|||||||
transformation_matrix: fitz.Matrix,
|
transformation_matrix: fitz.Matrix,
|
||||||
dpi: int = None,
|
dpi: int = None,
|
||||||
) -> Tuple:
|
) -> Tuple:
|
||||||
x1, y1, x2, y2 = (
|
x1, y1, x2, y2 = map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox # Convert to points, can be done before
|
||||||
map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox
|
|
||||||
) # Convert to points, can be done before
|
|
||||||
rect = fitz.Rect(x1, y1, x2, y2)
|
rect = fitz.Rect(x1, y1, x2, y2)
|
||||||
rect = rect * rotation_matrix * transformation_matrix
|
rect = rect * rotation_matrix * transformation_matrix
|
||||||
|
|
||||||
@ -45,18 +43,12 @@ def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable
|
|||||||
pix_h, pix_w = page_info.image_height, page_info.image_width
|
pix_h, pix_w = page_info.image_height, page_info.image_width
|
||||||
ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w
|
ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w
|
||||||
round3 = lambda x: tuple(map(lambda y: round(y, 3), x))
|
round3 = lambda x: tuple(map(lambda y: round(y, 3), x))
|
||||||
ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3(
|
ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3((ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h))
|
||||||
(ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h)
|
new_bbox = round3((bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h))
|
||||||
)
|
|
||||||
new_bbox = round3(
|
|
||||||
(bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h)
|
|
||||||
)
|
|
||||||
return new_bbox
|
return new_bbox
|
||||||
|
|
||||||
|
|
||||||
def transform_table_lines_by_page_info(
|
def transform_table_lines_by_page_info(bboxes: dict, offsets: tuple, page_info: PageInfo) -> dict:
|
||||||
bboxes: dict, offsets: tuple, page_info: PageInfo
|
|
||||||
) -> dict:
|
|
||||||
transform = partial(rescale_to_pdf, page_info=page_info)
|
transform = partial(rescale_to_pdf, page_info=page_info)
|
||||||
logger.debug(f"{offsets=}")
|
logger.debug(f"{offsets=}")
|
||||||
|
|
||||||
@ -74,9 +66,7 @@ def transform_table_lines_by_page_info(
|
|||||||
|
|
||||||
table_lines = bboxes.get("tableLines", [])
|
table_lines = bboxes.get("tableLines", [])
|
||||||
transformed_lines = list(map(convert, table_lines))
|
transformed_lines = list(map(convert, table_lines))
|
||||||
bboxes[
|
bboxes["tableLines"] = transformed_lines # lfilter(lambda b: b['y1']==b['y2'], transformed_lines)
|
||||||
"tableLines"
|
|
||||||
] = transformed_lines # lfilter(lambda b: b['y1']==b['y2'], transformed_lines)
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
for i in range(len(table_lines)):
|
for i in range(len(table_lines)):
|
||||||
@@ -126,11 +116,7 @@ def extract_images_from_pdf(
     # current_page_info object to include the derotation_matrix.
     rect = rect * page.transformation_matrix * page.rotation_matrix
     pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
-    shape = (
-        (pixmap.h, pixmap.w, pixmap.n)
-        if pixmap.n > 1
-        else (pixmap.h, pixmap.w)
-    )
+    shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
     image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
 
     table_images.append(image)
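The reshaping above turns a PyMuPDF pixmap buffer into a NumPy array, adding the channel axis only when the pixmap has more than one component. A standalone sketch (file name, clip region and dpi are assumptions):

    import fitz
    import numpy as np

    doc = fitz.open("example.pdf")
    page = doc[0]

    clip = fitz.Rect(72, 72, 300, 200)  # region of interest in PDF points
    pixmap = page.get_pixmap(clip=clip, dpi=300, colorspace=fitz.csGRAY)

    shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
    image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)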
@@ -13,17 +13,11 @@ def open_pdf(pdf, first_page=0, last_page=None):
         if pdf.lower().endswith((".png", ".jpg", ".jpeg")):
             pages = [Image.open(pdf)]
         elif pdf.lower().endswith(".pdf"):
-            pages = pdf2image.convert_from_path(
-                pdf, first_page=first_page, last_page=last_page
-            )
+            pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
         else:
-            raise IOError(
-                "Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
-            )
+            raise IOError("Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf")
     elif type(pdf) == bytes:
-        pages = pdf2image.convert_from_bytes(
-            pdf, first_page=first_page, last_page=last_page
-        )
+        pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
     elif type(pdf) in {list, ndarray}:
         return pdf
 
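As the branches show, open_pdf accepts an image path, a PDF path, raw PDF bytes, or an already-decoded list/ndarray of pages, and relies on pdf2image (which needs poppler installed) for the PDF cases. A hedged usage sketch (paths are assumptions; the function presumably returns the rendered pages further down, which is not shown in this hunk):

    from cv_analysis.utils.open_pdf import open_pdf

    # From a PDF on disk.
    pages = open_pdf("scan.pdf", first_page=0, last_page=2)

    # From raw bytes, e.g. a downloaded payload.
    with open("scan.pdf", "rb") as fh:
        pages_from_bytes = open_pdf(fh.read(), first_page=0, last_page=2)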
@@ -1,7 +1,8 @@
 from collections import namedtuple
 from functools import partial
-from itertools import starmap, compress
+from itertools import compress, starmap
 from typing import Iterable, List
+
 from cv_analysis.utils.structures import Rectangle
 
 
@@ -1,5 +1,5 @@
-from numpy import frombuffer, ndarray
 import cv2
+from numpy import frombuffer, ndarray
 
 
 def preprocess_page_array(page):
@@ -10,7 +10,6 @@ def preprocess_page_array(page):
 
 
 def page2image(page):
-
     if type(page) == bytes:
         page = frombuffer(page)
     elif type(page) == ndarray:
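page2image falls back to numpy.frombuffer for raw byte input. As a standalone reminder of what frombuffer does (dtype and shape here are assumptions; the buffer itself carries neither):

    import numpy as np

    original = np.arange(12, dtype=np.uint8).reshape(3, 4)
    raw = original.tobytes()

    restored = np.frombuffer(raw, dtype=np.uint8).reshape(3, 4)
    assert (restored == original).all()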
@@ -1,12 +1,23 @@
 from json import dumps
-
 from typing import Iterable
+
 import numpy as np
 from funcy import identity
 
 
 class Rectangle:
-    def __init__(self, x1=None, y1=None, w=None, h=None, x2=None, y2=None, indent=4, format="xywh", discrete=True):
+    def __init__(
+        self,
+        x1=None,
+        y1=None,
+        w=None,
+        h=None,
+        x2=None,
+        y2=None,
+        indent=4,
+        format="xywh",
+        discrete=True,
+    ):
         make_discrete = int if discrete else identity
 
         try:
@@ -111,7 +122,13 @@ class Rectangle:
 
     @classmethod
     def from_dict_xywh(cls, xywh_dict, discrete=True):
-        return cls(x1=xywh_dict["x"], y1=xywh_dict["y"], w=xywh_dict["width"], h=xywh_dict["height"], discrete=discrete)
+        return cls(
+            x1=xywh_dict["x"],
+            y1=xywh_dict["y"],
+            w=xywh_dict["width"],
+            h=xywh_dict["height"],
+            discrete=discrete,
+        )
 
     def __str__(self):
         return dumps(self.json(), indent=self.indent)
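Both hunks only re-wrap the signatures, so the keywords stay the same. A hedged construction sketch (anything beyond the keywords shown in the diff is assumed):

    from cv_analysis.utils.structures import Rectangle

    # Directly via x1/y1 plus width/height keywords.
    r1 = Rectangle(x1=10, y1=20, w=100, h=50)

    # From a dict in the {"x", "y", "width", "height"} convention.
    r2 = Rectangle.from_dict_xywh({"x": 10, "y": 20, "width": 100, "height": 50})

    print(r2)  # __str__ serializes the rectangle via dumps(self.json(), indent=...)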
@@ -1,5 +1,7 @@
 from typing import Iterable
+
 import numpy as np
+
 from cv_analysis.utils.structures import Rectangle
 
 
@@ -26,7 +28,6 @@ def compute_page_iou(results_boxes: Iterable[Rectangle], ground_truth_boxes: Ite
 
 
 def compute_document_score(results_dict, annotation_dict):
-
     page_weights = np.array([len(page["cells"]) for page in annotation_dict["pages"]])
     page_weights = page_weights / sum(page_weights)
 
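compute_document_score weights each page by its share of annotated cells. A toy sketch of that weighting (cell counts and per-page scores are made up, and how the function actually combines them is not shown in this hunk):

    import numpy as np

    cells_per_page = np.array([10, 30, 60])   # len(page["cells"]) per page
    page_weights = cells_per_page / cells_per_page.sum()
    # -> array([0.1, 0.3, 0.6])

    page_scores = np.array([0.9, 0.8, 0.7])   # assumed per-page scores
    document_score = float((page_weights * page_scores).sum())  # ≈ 0.75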
@@ -1,9 +1,8 @@
-from numpy import generic
 import cv2
+from numpy import generic
 
 
 def copy_and_normalize_channels(image):
-
     image = image.copy()
     try:
         image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
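copy_and_normalize_channels copies the input and promotes grayscale to 3-channel BGR inside the try block (the except branch, presumably for already-colored input, is outside this hunk). A standalone sketch of the cv2 call it relies on:

    import cv2
    import numpy as np

    gray = np.zeros((100, 200), dtype=np.uint8)    # single-channel image
    bgr = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)   # becomes (100, 200, 3)
    assert bgr.shape == (100, 200, 3)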
@@ -1,4 +1,5 @@
 import os
+
 from pyinfra.config.loader import load_settings
 
 from cv_analysis.config import get_config
@@ -1,9 +1,9 @@
 from sys import stdout
 from typing import Union
 
-from kn_utils.logging import logger
-from pyinfra.examples import start_standard_queue_consumer
-from pyinfra.queue.callback import make_download_process_upload_callback
+from kn_utils.logging import logger  # type: ignore
+from pyinfra.examples import start_standard_queue_consumer  # type: ignore
+from pyinfra.queue.callback import make_download_process_upload_callback  # type: ignore
 
 from cv_analysis.config import get_config
 from cv_analysis.server.pipeline import get_analysis_pipeline
test/fixtures/figure_detection.py (vendored, 4 changed lines)
@@ -5,9 +5,7 @@ import numpy as np
 import pytest
 from lorem_text import lorem
 
-from cv_analysis.figure_detection.figure_detection import (
-    detect_figures,
-)
+from cv_analysis.figure_detection.figure_detection import detect_figures
 
 
 @pytest.fixture
test/fixtures/server.py (vendored, 2 changed lines)
@@ -1,8 +1,8 @@
 import gzip
 import io
 
-import numpy as np
 import cv2
+import numpy as np
 import pytest
 from funcy import first
 
test/fixtures/table_parsing.py (vendored, 2 changed lines)
@@ -1,5 +1,6 @@
 import json
 from os.path import join
+from test.fixtures.figure_detection import paste_text
 
 import cv2
 import pytest
@@ -11,7 +12,6 @@ from cv_analysis.config import get_config
 from cv_analysis.locations import REPO_ROOT_PATH, TEST_DATA_DVC
 from cv_analysis.utils.draw import draw_rectangles
 from cv_analysis.utils.open_pdf import open_pdf
-from test.fixtures.figure_detection import paste_text
 
 settings = get_config()
 
@@ -1,10 +1,9 @@
 from math import prod
+from test.utils.utils import powerset
 
 import cv2
 import pytest
 
-from test.utils.utils import powerset
-
 
 @pytest.mark.parametrize("background_color", [255, 220])
 class TestFindPrimaryTextRegions:
@@ -1,12 +1,11 @@
+from test.utils.utils import powerset
+
 import cv2
 import numpy as np
 import pytest
 
-from cv_analysis.figure_detection.text import (
-    remove_primary_text_regions,
-    apply_threshold_to_image,
-)
-from test.utils.utils import powerset
+from cv_analysis.figure_detection.text import (apply_threshold_to_image,
+                                               remove_primary_text_regions)
 
 
 @pytest.mark.parametrize("error_tolerance", [0.07])
@@ -2,11 +2,9 @@ import fitz
 import numpy as np
 import pytest
 
-from cv_analysis.server.pipeline import (
-    figure_detection_formatter,
-    make_analysis_pipeline,
-    table_parsing_formatter,
-)
+from cv_analysis.server.pipeline import (figure_detection_formatter,
+                                         make_analysis_pipeline,
+                                         table_parsing_formatter)
 from cv_analysis.utils.structures import Rectangle
 
 