diff --git a/config/proto/DocumentPositionData.proto b/config/proto/DocumentPositionData.proto index 8dc8493..5353924 100644 --- a/config/proto/DocumentPositionData.proto +++ b/config/proto/DocumentPositionData.proto @@ -20,9 +20,6 @@ message DocumentPositionData { // Definition of a BoundingBox that contains x, y, width, and height. message Position { - float x = 1; - float y = 2; - float width = 3; - float height = 4; + repeated float value = 1; } } diff --git a/poetry.lock b/poetry.lock index a3e4d8b..5b4f965 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1555,13 +1555,13 @@ tests = ["Pygments (==2.10.0)", "collective.checkdocs (==0.2)", "dvc[testing]", [[package]] name = "dvc-data" -version = "3.16.5" +version = "3.16.6" description = "DVC's data management subsystem" optional = false python-versions = ">=3.9" files = [ - {file = "dvc_data-3.16.5-py3-none-any.whl", hash = "sha256:d7004964eef4442521957ccf3ed3f2f6add3c4dfe7b830773ebe68875c481a55"}, - {file = "dvc_data-3.16.5.tar.gz", hash = "sha256:ba0d9ac53a1809622f59ff86b24d07639fce7c98d0c26ef9406eed309d04c856"}, + {file = "dvc_data-3.16.6-py3-none-any.whl", hash = "sha256:ffba21e2d0e420c427257a984213c94ba51db2fb76f2227ab23d9cb978613684"}, + {file = "dvc_data-3.16.6.tar.gz", hash = "sha256:598c10e5ded098f5145ede5535f2b5b964af42dc900486bc9bb739191f8189fd"}, ] [package.dependencies] @@ -1579,8 +1579,8 @@ tqdm = ">=4.63.1,<5" [package.extras] all = ["dvc-data[cli]"] cli = ["rich (>=10.11.0,<14.0.0)", "typer (>=0.6)"] -dev = ["blake3 (>=0.3.1)", "dvc-data[all]", "dvc-data[tests]", "mypy (==1.11.1)", "types-tqdm"] -tests = ["pytest (>=7,<9)", "pytest-benchmark (>=4)", "pytest-cov (>=4.1.0)", "pytest-mock", "pytest-servers[s3] (==0.5.6)", "pytest-sugar"] +dev = ["blake3 (>=0.3.1)", "dvc-data[all]", "dvc-data[tests]", "mypy (==1.11.2)", "types-tqdm"] +tests = ["pytest (>=7,<9)", "pytest-benchmark (>=4)", "pytest-cov (>=4.1.0)", "pytest-mock", "pytest-servers[s3] (==0.5.7)", "pytest-sugar"] [[package]] name = 
"dvc-http" @@ -3735,13 +3735,13 @@ twisted = ["twisted"] [[package]] name = "prompt-toolkit" -version = "3.0.47" +version = "3.0.48" description = "Library for building powerful interactive command lines in Python" optional = false python-versions = ">=3.7.0" files = [ - {file = "prompt_toolkit-3.0.47-py3-none-any.whl", hash = "sha256:0d7bfa67001d5e39d02c224b663abc33687405033a8c422d0d675a5a13361d10"}, - {file = "prompt_toolkit-3.0.47.tar.gz", hash = "sha256:1e1b29cb58080b1e69f207c893a1a7bf16d127a5c30c9d17a25a5d77792e5360"}, + {file = "prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e"}, + {file = "prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90"}, ] [package.dependencies] @@ -3749,33 +3749,22 @@ wcwidth = "*" [[package]] name = "protobuf" -version = "3.20.3" -description = "Protocol Buffers" +version = "4.25.5" +description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "protobuf-3.20.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99"}, - {file = "protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e"}, - {file = "protobuf-3.20.3-cp310-cp310-win32.whl", hash = "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c"}, - {file = "protobuf-3.20.3-cp310-cp310-win_amd64.whl", hash = "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7"}, - {file = "protobuf-3.20.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469"}, - {file = "protobuf-3.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4"}, - {file = 
"protobuf-3.20.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4"}, - {file = "protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454"}, - {file = "protobuf-3.20.3-cp37-cp37m-win32.whl", hash = "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905"}, - {file = "protobuf-3.20.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c"}, - {file = "protobuf-3.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7"}, - {file = "protobuf-3.20.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee"}, - {file = "protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050"}, - {file = "protobuf-3.20.3-cp38-cp38-win32.whl", hash = "sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86"}, - {file = "protobuf-3.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9"}, - {file = "protobuf-3.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b"}, - {file = "protobuf-3.20.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b"}, - {file = "protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402"}, - {file = "protobuf-3.20.3-cp39-cp39-win32.whl", hash = "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480"}, - {file = "protobuf-3.20.3-cp39-cp39-win_amd64.whl", hash = 
"sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7"}, - {file = "protobuf-3.20.3-py2.py3-none-any.whl", hash = "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db"}, - {file = "protobuf-3.20.3.tar.gz", hash = "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2"}, + {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, + {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, + {file = "protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"}, + {file = "protobuf-4.25.5-cp38-cp38-win32.whl", hash = "sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1"}, + {file = "protobuf-4.25.5-cp38-cp38-win_amd64.whl", hash = "sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a"}, + {file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"}, + {file = "protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"}, + {file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"}, + {file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"}, ] [[package]] @@ -4031,13 +4020,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydot" -version = "3.0.1" +version = "3.0.2" description = "Python 
interface to Graphviz's Dot" optional = false python-versions = ">=3.8" files = [ - {file = "pydot-3.0.1-py3-none-any.whl", hash = "sha256:43f1e878dc1ff7c1c2e3470a6999d4e9e97771c5c862440c2f0af0ba844c231f"}, - {file = "pydot-3.0.1.tar.gz", hash = "sha256:e18cf7f287c497d77b536a3d20a46284568fea390776dface6eabbdf1b1b5efc"}, + {file = "pydot-3.0.2-py3-none-any.whl", hash = "sha256:99cedaa55d04abb0b2bc56d9981a6da781053dd5ac75c428e8dd53db53f90b14"}, + {file = "pydot-3.0.2.tar.gz", hash = "sha256:9180da540b51b3aa09fbf81140b3edfbe2315d778e8589a7d0a4a69c41332bae"}, ] [package.dependencies] @@ -4046,7 +4035,7 @@ pyparsing = ">=3.0.9" [package.extras] dev = ["chardet", "parameterized", "ruff"] release = ["zest.releaser[recommended]"] -tests = ["chardet", "parameterized", "ruff", "tox", "unittest-parallel"] +tests = ["chardet", "parameterized", "pytest", "pytest-cov", "pytest-xdist[psutil]", "ruff", "tox"] [[package]] name = "pygit2" @@ -5435,4 +5424,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "85cc4f846ba584e4b4d41a1e07ad11eeea96ce3377f3137cf5efaf8de4716050" +content-hash = "102e413da0ff757a914290f881301e4182477c0bb08efdb31db5bc8d2a198289" diff --git a/pyinfra/proto/DocumentPage_pb2.py b/pyinfra/proto/DocumentPage_pb2.py index 6562adf..54cfaee 100644 --- a/pyinfra/proto/DocumentPage_pb2.py +++ b/pyinfra/proto/DocumentPage_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: DocumentPage.proto -# Protobuf Python Version: 4.25.4 +# Protobuf Python Version: 4.25.5 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool diff --git a/pyinfra/proto/DocumentPositionData_pb2.py b/pyinfra/proto/DocumentPositionData_pb2.py index c018ae4..e45a857 100644 --- a/pyinfra/proto/DocumentPositionData_pb2.py +++ b/pyinfra/proto/DocumentPositionData_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: DocumentPositionData.proto -# Protobuf Python Version: 4.25.4 +# Protobuf Python Version: 4.25.5 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool @@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1a\x44ocumentPositionData.proto"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData"\xb6\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a?\n\x08Position\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x12\r\n\x05width\x18\x03 \x01(\x02\x12\x0e\n\x06height\x18\x04 \x01(\x02\x62\x06proto3' + b'\n\x1a\x44ocumentPositionData.proto"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData"\x90\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a\x19\n\x08Position\x12\r\n\x05value\x18\x01 \x03(\x02\x62\x06proto3' ) _globals = globals() @@ -25,7 +25,7 @@ if _descriptor._USE_C_DESCRIPTORS == False: 
_globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_start = 30 _globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_end = 108 _globals["_DOCUMENTPOSITIONDATA"]._serialized_start = 111 - _globals["_DOCUMENTPOSITIONDATA"]._serialized_end = 293 + _globals["_DOCUMENTPOSITIONDATA"]._serialized_end = 255 _globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_start = 230 - _globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_end = 293 + _globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_end = 255 # @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/DocumentStructure_pb2.py b/pyinfra/proto/DocumentStructure_pb2.py index 398b9f9..2d2245d 100644 --- a/pyinfra/proto/DocumentStructure_pb2.py +++ b/pyinfra/proto/DocumentStructure_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: DocumentStructure.proto -# Protobuf Python Version: 4.25.4 +# Protobuf Python Version: 4.25.5 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool diff --git a/pyinfra/proto/DocumentTextData_pb2.py b/pyinfra/proto/DocumentTextData_pb2.py index 2d59444..9b685b9 100644 --- a/pyinfra/proto/DocumentTextData_pb2.py +++ b/pyinfra/proto/DocumentTextData_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: DocumentTextData.proto -# Protobuf Python Version: 4.25.4 +# Protobuf Python Version: 4.25.5 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool diff --git a/pyinfra/proto/EntryData_pb2.py b/pyinfra/proto/EntryData_pb2.py index c35da81..2e70d23 100644 --- a/pyinfra/proto/EntryData_pb2.py +++ b/pyinfra/proto/EntryData_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: EntryData.proto -# Protobuf Python Version: 4.25.4 +# Protobuf Python Version: 4.25.5 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool diff --git a/pyinfra/proto/LayoutEngine_pb2.py b/pyinfra/proto/LayoutEngine_pb2.py index 8223864..b00344d 100644 --- a/pyinfra/proto/LayoutEngine_pb2.py +++ b/pyinfra/proto/LayoutEngine_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! # source: LayoutEngine.proto -# Protobuf Python Version: 4.25.4 +# Protobuf Python Version: 4.25.5 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool diff --git a/pyinfra/proto/NodeType_pb2.py b/pyinfra/proto/NodeType_pb2.py index fb315d7..fa7d58b 100644 --- a/pyinfra/proto/NodeType_pb2.py +++ b/pyinfra/proto/NodeType_pb2.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: NodeType.proto -# Protobuf Python Version: 4.25.4 +# Protobuf Python Version: 4.25.5 """Generated protocol buffer code.""" from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool diff --git a/pyinfra/storage/proto_data_loader.py b/pyinfra/storage/proto_data_loader.py index 32f2978..2e9cc58 100644 --- a/pyinfra/storage/proto_data_loader.py +++ b/pyinfra/storage/proto_data_loader.py @@ -64,6 +64,8 @@ class ProtoDataLoader: message.ParseFromString(data) message_dict = MessageToDict(message, including_default_value_fields=True) message_dict = convert_int64_fields(message_dict) + if document_type == "POSITION": + message_dict = transform_positions_to_list(message_dict) return self._unpack(message_dict) @@ -85,11 +87,41 @@ class ProtoDataLoader: def convert_int64_fields(obj): # FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python) + + # we skip the following keys because the values are expected to be of type str + skip_keys = ["col", "row", "numberOfCols", "numberOfRows"] if isinstance(obj, dict): for key, value in obj.items(): + if key in skip_keys: + continue obj[key] = convert_int64_fields(value) elif isinstance(obj, list): return [convert_int64_fields(item) for item in obj] elif isinstance(obj, str) and obj.isdigit(): return int(obj) return obj + + +def transform_positions_to_list(obj: dict | list) -> dict: + """Transforms the repeated fields 'positions' to a lists of lists of floats + as expected by DocumentReader. 
+ + Args: + obj (dict | list): Proto message dict + + Returns: + dict: Proto message dict + """ + if isinstance(obj, dict): + # Check if 'positions' is in the dictionary and reshape it as list of lists of floats + if "positions" in obj and isinstance(obj["positions"], list): + obj["positions"] = [pos["value"] for pos in obj["positions"] if isinstance(pos, dict) and "value" in pos] + + # Recursively apply to all nested dictionaries + for key, value in obj.items(): + obj[key] = transform_positions_to_list(value) + elif isinstance(obj, list): + # Recursively apply to all items in the list + obj = [transform_positions_to_list(item) for item in obj] + + return obj diff --git a/pyproject.toml b/pyproject.toml index 3627dd7..a39299b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyinfra" -version = "3.2.9" +version = "3.2.10" description = "" authors = ["Team Research "] license = "All rights reseverd" @@ -34,7 +34,9 @@ opentelemetry-instrumentation-requests = "^0.46b0" opentelemetry-instrumentation-fastapi = "^0.46b0" wcwidth = "<=0.2.12" azure-monitor-opentelemetry = "^1.6.0" -protobuf = "^3.20" +# We set protobuf to this range because the image classification service depends on a protobuf version <4, but does not use proto files. +# Therefore, we allow latest possible protobuf version in the services which use proto files. 
As soon as the dependency issue is fixed set this to the latest possible protobuf version +protobuf = ">=3.20 <5.0.0" aio-pika = "^9.4.2" aiohttp = "^3.9.5" tenacity = "^8.5.0" diff --git a/tests/data.dvc b/tests/data.dvc index 7162874..d477c63 100644 --- a/tests/data.dvc +++ b/tests/data.dvc @@ -1,6 +1,6 @@ outs: -- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir - size: 23067894 - nfiles: 8 +- md5: 75cc98b7c8fcf782a7d4941594e6bc12.dir + size: 134913 + nfiles: 9 hash: md5 path: data diff --git a/tests/unit_test/proto_data_loader_test.py b/tests/unit_test/proto_data_loader_test.py index e8dc9c1..ad762f5 100644 --- a/tests/unit_test/proto_data_loader_test.py +++ b/tests/unit_test/proto_data_loader_test.py @@ -2,11 +2,13 @@ import gzip import json from pathlib import Path + import pytest from deepdiff import DeepDiff from pyinfra.storage.proto_data_loader import ProtoDataLoader +enum = 1 @pytest.fixture def test_data_dir(): @@ -16,8 +18,10 @@ def test_data_dir(): @pytest.fixture def document_data(request, test_data_dir) -> (str, bytes, dict | list): doc_type = request.param - input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz" - target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz" + + # Search for relevant doc_type file pairs - there should be one proto and one json file per document type + input_file_path = next(test_data_dir.glob(f"*.{doc_type}.proto.gz"), None) + target_file_path = next(test_data_dir.glob(f"*.{doc_type}.json.gz"), None) input_data = input_file_path.read_bytes() target_data = json.loads(gzip.decompress(target_file_path.read_bytes())) @@ -49,10 +53,10 @@ def should_match(): @pytest.mark.xfail( - reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data." + reason="FIXME: The test is not stable, but has to work before we can deploy the code! 
def _load(document_data, proto_data_loader):
    """Decompress the fixture payload and run it through the proto data loader."""
    file_path, data, _ = document_data
    return proto_data_loader(file_path, gzip.decompress(data))


@pytest.mark.parametrize("document_data", ["DOCUMENT_PAGES"], indirect=True)
def test_document_page_types(document_data, proto_data_loader):
    """Every page entry is a dict whose values are all ints."""
    # types from document reader
    # number: int
    # height: int
    # width: int
    # rotation: int
    loaded_data = _load(document_data, proto_data_loader)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    # since all values need to be int anyway we can summarize it
    assert all(all(isinstance(value, int) for value in entry.values()) for entry in loaded_data)


@pytest.mark.parametrize("document_data", ["DOCUMENT_POSITION"], indirect=True)
def test_document_position_data_types(document_data, proto_data_loader):
    """Position entries carry an int id, int indices and lists of float lists."""
    # types from document reader
    # id: int
    # stringIdxToPositionIdx: list[int]
    # positions: list[list[float]]
    loaded_data = _load(document_data, proto_data_loader)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["stringIdxToPositionIdx"], list)
        # the element type is part of the documented contract as well
        assert all(isinstance(index, int) for index in entry["stringIdxToPositionIdx"])
        assert isinstance(entry["positions"], list)
        assert all(isinstance(position, list) for position in entry["positions"])
        assert all(all(isinstance(coordinate, float) for coordinate in position) for position in entry["positions"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE"], indirect=True)
def test_document_structure_types(document_data, proto_data_loader):
    """The structure root exposes the container and element types checked below."""
    # types from document reader for DocumentStructure
    # root: dict

    # types from document reader for EntryData
    # type: str
    # tree_id: list[int]
    # atomic_block_ids: list[int]
    # page_numbers: list[int]
    # properties: dict[str, str]
    # children: list[dict]
    loaded_data = _load(document_data, proto_data_loader)

    assert isinstance(loaded_data, dict)
    root = loaded_data["root"]
    assert isinstance(root, dict)
    assert isinstance(root["type"], str)
    assert isinstance(root["treeId"], list)
    assert isinstance(root["atomicBlockIds"], list)
    assert isinstance(root["pageNumbers"], list)
    assert isinstance(root["children"], list)

    assert all(isinstance(value, int) for value in root["treeId"])
    assert all(isinstance(value, int) for value in root["atomicBlockIds"])
    assert all(isinstance(value, int) for value in root["pageNumbers"])
    # NOTE(review): the type list above says properties is dict[str, str], but
    # the assertions require nested dicts - confirm which one is intended.
    assert all(isinstance(value, dict) for value in root["properties"].values())
    assert all(all(isinstance(value, dict) for value in entry.values()) for entry in root["properties"].values())
    assert all(isinstance(value, dict) for value in root["children"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_TEXT"], indirect=True)
def test_document_text_data_types(document_data, proto_data_loader):
    """Text entries carry int offsets, a str search text and int line breaks."""
    # types from document reader
    # id: int
    # page: int
    # search_text: str
    # number_on_page: int
    # start: int
    # end: int
    # lineBreaks: list[int]
    loaded_data = _load(document_data, proto_data_loader)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["page"], int)
        assert isinstance(entry["searchText"], str)
        assert isinstance(entry["numberOnPage"], int)
        assert isinstance(entry["start"], int)
        assert isinstance(entry["end"], int)
        # check the container first so a non-list value cannot pass by accident
        assert isinstance(entry["lineBreaks"], list)
        assert all(isinstance(value, int) for value in entry["lineBreaks"])