Merge branch 'feature/RES-856-test-proto-format' into 'master'

RES-856: add type tests for proto format

See merge request knecon/research/pyinfra!99
This commit is contained in:
Jonathan Kössler 2024-09-26 10:07:29 +02:00
commit b70b16c541
13 changed files with 202 additions and 65 deletions

View File

@ -20,9 +20,6 @@ message DocumentPositionData {
// Definition of a BoundingBox that contains x, y, width, and height.
message Position {
float x = 1;
float y = 2;
float width = 3;
float height = 4;
repeated float value = 1;
}
}

65
poetry.lock generated
View File

@ -1555,13 +1555,13 @@ tests = ["Pygments (==2.10.0)", "collective.checkdocs (==0.2)", "dvc[testing]",
[[package]]
name = "dvc-data"
version = "3.16.5"
version = "3.16.6"
description = "DVC's data management subsystem"
optional = false
python-versions = ">=3.9"
files = [
{file = "dvc_data-3.16.5-py3-none-any.whl", hash = "sha256:d7004964eef4442521957ccf3ed3f2f6add3c4dfe7b830773ebe68875c481a55"},
{file = "dvc_data-3.16.5.tar.gz", hash = "sha256:ba0d9ac53a1809622f59ff86b24d07639fce7c98d0c26ef9406eed309d04c856"},
{file = "dvc_data-3.16.6-py3-none-any.whl", hash = "sha256:ffba21e2d0e420c427257a984213c94ba51db2fb76f2227ab23d9cb978613684"},
{file = "dvc_data-3.16.6.tar.gz", hash = "sha256:598c10e5ded098f5145ede5535f2b5b964af42dc900486bc9bb739191f8189fd"},
]
[package.dependencies]
@ -1579,8 +1579,8 @@ tqdm = ">=4.63.1,<5"
[package.extras]
all = ["dvc-data[cli]"]
cli = ["rich (>=10.11.0,<14.0.0)", "typer (>=0.6)"]
dev = ["blake3 (>=0.3.1)", "dvc-data[all]", "dvc-data[tests]", "mypy (==1.11.1)", "types-tqdm"]
tests = ["pytest (>=7,<9)", "pytest-benchmark (>=4)", "pytest-cov (>=4.1.0)", "pytest-mock", "pytest-servers[s3] (==0.5.6)", "pytest-sugar"]
dev = ["blake3 (>=0.3.1)", "dvc-data[all]", "dvc-data[tests]", "mypy (==1.11.2)", "types-tqdm"]
tests = ["pytest (>=7,<9)", "pytest-benchmark (>=4)", "pytest-cov (>=4.1.0)", "pytest-mock", "pytest-servers[s3] (==0.5.7)", "pytest-sugar"]
[[package]]
name = "dvc-http"
@ -3735,13 +3735,13 @@ twisted = ["twisted"]
[[package]]
name = "prompt-toolkit"
version = "3.0.47"
version = "3.0.48"
description = "Library for building powerful interactive command lines in Python"
optional = false
python-versions = ">=3.7.0"
files = [
{file = "prompt_toolkit-3.0.47-py3-none-any.whl", hash = "sha256:0d7bfa67001d5e39d02c224b663abc33687405033a8c422d0d675a5a13361d10"},
{file = "prompt_toolkit-3.0.47.tar.gz", hash = "sha256:1e1b29cb58080b1e69f207c893a1a7bf16d127a5c30c9d17a25a5d77792e5360"},
{file = "prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e"},
{file = "prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90"},
]
[package.dependencies]
@ -3749,33 +3749,22 @@ wcwidth = "*"
[[package]]
name = "protobuf"
version = "3.20.3"
description = "Protocol Buffers"
version = "4.25.5"
description = ""
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "protobuf-3.20.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:f4bd856d702e5b0d96a00ec6b307b0f51c1982c2bf9c0052cf9019e9a544ba99"},
{file = "protobuf-3.20.3-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:9aae4406ea63d825636cc11ffb34ad3379335803216ee3a856787bcf5ccc751e"},
{file = "protobuf-3.20.3-cp310-cp310-win32.whl", hash = "sha256:28545383d61f55b57cf4df63eebd9827754fd2dc25f80c5253f9184235db242c"},
{file = "protobuf-3.20.3-cp310-cp310-win_amd64.whl", hash = "sha256:67a3598f0a2dcbc58d02dd1928544e7d88f764b47d4a286202913f0b2801c2e7"},
{file = "protobuf-3.20.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:899dc660cd599d7352d6f10d83c95df430a38b410c1b66b407a6b29265d66469"},
{file = "protobuf-3.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e64857f395505ebf3d2569935506ae0dfc4a15cb80dc25261176c784662cdcc4"},
{file = "protobuf-3.20.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:d9e4432ff660d67d775c66ac42a67cf2453c27cb4d738fc22cb53b5d84c135d4"},
{file = "protobuf-3.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:74480f79a023f90dc6e18febbf7b8bac7508420f2006fabd512013c0c238f454"},
{file = "protobuf-3.20.3-cp37-cp37m-win32.whl", hash = "sha256:b6cc7ba72a8850621bfec987cb72623e703b7fe2b9127a161ce61e61558ad905"},
{file = "protobuf-3.20.3-cp37-cp37m-win_amd64.whl", hash = "sha256:8c0c984a1b8fef4086329ff8dd19ac77576b384079247c770f29cc8ce3afa06c"},
{file = "protobuf-3.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:de78575669dddf6099a8a0f46a27e82a1783c557ccc38ee620ed8cc96d3be7d7"},
{file = "protobuf-3.20.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:f4c42102bc82a51108e449cbb32b19b180022941c727bac0cfd50170341f16ee"},
{file = "protobuf-3.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:44246bab5dd4b7fbd3c0c80b6f16686808fab0e4aca819ade6e8d294a29c7050"},
{file = "protobuf-3.20.3-cp38-cp38-win32.whl", hash = "sha256:c02ce36ec760252242a33967d51c289fd0e1c0e6e5cc9397e2279177716add86"},
{file = "protobuf-3.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:447d43819997825d4e71bf5769d869b968ce96848b6479397e29fc24c4a5dfe9"},
{file = "protobuf-3.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:398a9e0c3eaceb34ec1aee71894ca3299605fa8e761544934378bbc6c97de23b"},
{file = "protobuf-3.20.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:bf01b5720be110540be4286e791db73f84a2b721072a3711efff6c324cdf074b"},
{file = "protobuf-3.20.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:daa564862dd0d39c00f8086f88700fdbe8bc717e993a21e90711acfed02f2402"},
{file = "protobuf-3.20.3-cp39-cp39-win32.whl", hash = "sha256:819559cafa1a373b7096a482b504ae8a857c89593cf3a25af743ac9ecbd23480"},
{file = "protobuf-3.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:03038ac1cfbc41aa21f6afcbcd357281d7521b4157926f30ebecc8d4ea59dcb7"},
{file = "protobuf-3.20.3-py2.py3-none-any.whl", hash = "sha256:a7ca6d488aa8ff7f329d4c545b2dbad8ac31464f1d8b1c87ad1346717731e4db"},
{file = "protobuf-3.20.3.tar.gz", hash = "sha256:2e3427429c9cffebf259491be0af70189607f365c2f41c7c3764af6f337105f2"},
{file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"},
{file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"},
{file = "protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"},
{file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"},
{file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"},
{file = "protobuf-4.25.5-cp38-cp38-win32.whl", hash = "sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1"},
{file = "protobuf-4.25.5-cp38-cp38-win_amd64.whl", hash = "sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a"},
{file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"},
{file = "protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"},
{file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"},
{file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"},
]
[[package]]
@ -4031,13 +4020,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydot"
version = "3.0.1"
version = "3.0.2"
description = "Python interface to Graphviz's Dot"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydot-3.0.1-py3-none-any.whl", hash = "sha256:43f1e878dc1ff7c1c2e3470a6999d4e9e97771c5c862440c2f0af0ba844c231f"},
{file = "pydot-3.0.1.tar.gz", hash = "sha256:e18cf7f287c497d77b536a3d20a46284568fea390776dface6eabbdf1b1b5efc"},
{file = "pydot-3.0.2-py3-none-any.whl", hash = "sha256:99cedaa55d04abb0b2bc56d9981a6da781053dd5ac75c428e8dd53db53f90b14"},
{file = "pydot-3.0.2.tar.gz", hash = "sha256:9180da540b51b3aa09fbf81140b3edfbe2315d778e8589a7d0a4a69c41332bae"},
]
[package.dependencies]
@ -4046,7 +4035,7 @@ pyparsing = ">=3.0.9"
[package.extras]
dev = ["chardet", "parameterized", "ruff"]
release = ["zest.releaser[recommended]"]
tests = ["chardet", "parameterized", "ruff", "tox", "unittest-parallel"]
tests = ["chardet", "parameterized", "pytest", "pytest-cov", "pytest-xdist[psutil]", "ruff", "tox"]
[[package]]
name = "pygit2"
@ -5435,4 +5424,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.11"
content-hash = "85cc4f846ba584e4b4d41a1e07ad11eeea96ce3377f3137cf5efaf8de4716050"
content-hash = "102e413da0ff757a914290f881301e4182477c0bb08efdb31db5bc8d2a198289"

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentPage.proto
# Protobuf Python Version: 4.25.4
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentPositionData.proto
# Protobuf Python Version: 4.25.4
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
@ -14,7 +14,7 @@ _sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x1a\x44ocumentPositionData.proto"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData"\xb6\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a?\n\x08Position\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x12\r\n\x05width\x18\x03 \x01(\x02\x12\x0e\n\x06height\x18\x04 \x01(\x02\x62\x06proto3'
b'\n\x1a\x44ocumentPositionData.proto"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData"\x90\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a\x19\n\x08Position\x12\r\n\x05value\x18\x01 \x03(\x02\x62\x06proto3'
)
_globals = globals()
@ -25,7 +25,7 @@ if _descriptor._USE_C_DESCRIPTORS == False:
_globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_start = 30
_globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_end = 108
_globals["_DOCUMENTPOSITIONDATA"]._serialized_start = 111
_globals["_DOCUMENTPOSITIONDATA"]._serialized_end = 293
_globals["_DOCUMENTPOSITIONDATA"]._serialized_end = 255
_globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_start = 230
_globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_end = 293
_globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_end = 255
# @@protoc_insertion_point(module_scope)

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentStructure.proto
# Protobuf Python Version: 4.25.4
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentTextData.proto
# Protobuf Python Version: 4.25.4
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: EntryData.proto
# Protobuf Python Version: 4.25.4
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: LayoutEngine.proto
# Protobuf Python Version: 4.25.4
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: NodeType.proto
# Protobuf Python Version: 4.25.4
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool

View File

@ -64,6 +64,8 @@ class ProtoDataLoader:
message.ParseFromString(data)
message_dict = MessageToDict(message, including_default_value_fields=True)
message_dict = convert_int64_fields(message_dict)
if document_type == "POSITION":
message_dict = transform_positions_to_list(message_dict)
return self._unpack(message_dict)
@ -85,11 +87,41 @@ class ProtoDataLoader:
def convert_int64_fields(obj):
    """Recursively convert digit-only strings to ints in a nested structure.

    Protobuf's ``MessageToDict`` renders int64 fields as str in Python, so
    this walks the decoded structure and casts every all-digit string back
    to int, except for keys whose values must remain str.

    Args:
        obj: Arbitrarily nested dict / list / scalar structure produced by
            ``MessageToDict``.

    Returns:
        The structure with digit-only strings converted to int. Dicts are
        mutated in place; lists are rebuilt.
    """
    # FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python)
    # we skip the following keys because the values are expected to be of type str
    # NOTE(review): str.isdigit() misses negative numbers ("-3" stays a str) —
    # confirm all int64 fields here are non-negative.
    skip_keys = ["col", "row", "numberOfCols", "numberOfRows"]
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key in skip_keys:
                continue
            obj[key] = convert_int64_fields(value)
    elif isinstance(obj, list):
        return [convert_int64_fields(item) for item in obj]
    elif isinstance(obj, str) and obj.isdigit():
        return int(obj)
    return obj
def transform_positions_to_list(obj: dict | list) -> dict:
"""Transforms the repeated fields 'positions' to a lists of lists of floats
as expected by DocumentReader.
Args:
obj (dict | list): Proto message dict
Returns:
dict: Proto message dict
"""
if isinstance(obj, dict):
# Check if 'positions' is in the dictionary and reshape it as list of lists of floats
if "positions" in obj and isinstance(obj["positions"], list):
obj["positions"] = [pos["value"] for pos in obj["positions"] if isinstance(pos, dict) and "value" in pos]
# Recursively apply to all nested dictionaries
for key, value in obj.items():
obj[key] = transform_positions_to_list(value)
elif isinstance(obj, list):
# Recursively apply to all items in the list
obj = [transform_positions_to_list(item) for item in obj]
return obj

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "pyinfra"
version = "3.2.9"
version = "3.2.10"
description = ""
authors = ["Team Research <research@knecon.com>"]
license = "All rights reserved"
@ -34,7 +34,9 @@ opentelemetry-instrumentation-requests = "^0.46b0"
opentelemetry-instrumentation-fastapi = "^0.46b0"
wcwidth = "<=0.2.12"
azure-monitor-opentelemetry = "^1.6.0"
protobuf = "^3.20"
# We set protobuf to this range because the image classification service depends on a protobuf version <4 but does not use proto files.
# Therefore, we allow the latest possible protobuf version in the services which do use proto files. As soon as the dependency issue is fixed, pin this to the latest possible protobuf version.
protobuf = ">=3.20 <5.0.0"
aio-pika = "^9.4.2"
aiohttp = "^3.9.5"
tenacity = "^8.5.0"

View File

@ -1,6 +1,6 @@
outs:
- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir
size: 23067894
nfiles: 8
- md5: 75cc98b7c8fcf782a7d4941594e6bc12.dir
size: 134913
nfiles: 9
hash: md5
path: data

View File

@ -2,11 +2,13 @@ import gzip
import json
from pathlib import Path
import pytest
from deepdiff import DeepDiff
from pyinfra.storage.proto_data_loader import ProtoDataLoader
enum = 1
@pytest.fixture
def test_data_dir():
@ -16,8 +18,10 @@ def test_data_dir():
@pytest.fixture
def document_data(request, test_data_dir) -> (str, bytes, dict | list):
doc_type = request.param
input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz"
target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz"
# Search for relevant doc_type file pairs - there should be one proto and one json file per document type
input_file_path = next(test_data_dir.glob(f"*.{doc_type}.proto.gz"), None)
target_file_path = next(test_data_dir.glob(f"*.{doc_type}.json.gz"), None)
input_data = input_file_path.read_bytes()
target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))
@ -49,10 +53,10 @@ def should_match():
@pytest.mark.xfail(
reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
reason="FIXME: The test is not stable, but has to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
)
# As DOCUMENT_POSITION is a very large file, the test takes forever. If you want to test it, add "DOCUMENT_POSITION" to the list below.
@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES"], indirect=True)
# As DOCUMENT_POSITION is a very large file, the test takes forever. It is now included in the list below by default; remove "DOCUMENT_POSITION" if the run is too slow.
@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES", "DOCUMENT_POSITION"], indirect=True)
def test_proto_data_loader_end2end(document_data, proto_data_loader):
file_path, data, target = document_data
data = gzip.decompress(data)
@ -61,12 +65,21 @@ def test_proto_data_loader_end2end(document_data, proto_data_loader):
loaded_data_str = json.dumps(loaded_data, sort_keys=True)
target_str = json.dumps(target, sort_keys=True)
diff = DeepDiff(sorted(loaded_data_str), sorted(target_str), ignore_order=True)
# If you want to look at the files in more detail uncomment code below
# global enum
# with open(f"input-{enum}.json", "w") as f:
# json.dump(target, f, sort_keys=True, indent=4)
# with open(f"output-{enum}.json", "w") as f:
# json.dump(loaded_data, f, sort_keys=True, indent=4)
# enum += 1
diff = DeepDiff(loaded_data_str, target_str, ignore_order=True)
# FIXME: remove this block when the test is stable
# if diff:
# with open("/tmp/diff.json", "w") as f:
# f.write(diff.to_json(indent=2))
# with open(f"diff_test.json", "w") as f:
# f.write(diff.to_json(indent=4))
assert not diff
@ -78,3 +91,107 @@ def test_proto_data_loader_unknown_document_type(proto_data_loader):
def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
for file_name in should_match:
assert proto_data_loader._match(file_name) is not None
@pytest.mark.parametrize("document_data", ["DOCUMENT_PAGES"], indirect=True)
def test_document_page_types(document_data, proto_data_loader):
    """Every decoded DOCUMENT_PAGES entry must expose int-only values.

    DocumentReader expects the fields number, height, width and rotation,
    all of type int, so a single sweep over each entry's values suffices.
    """
    file_path, raw, _ = document_data
    pages = proto_data_loader(file_path, gzip.decompress(raw))

    assert isinstance(pages, list)
    for page in pages:
        assert isinstance(page, dict)
        for field_value in page.values():
            assert isinstance(field_value, int)
@pytest.mark.parametrize("document_data", ["DOCUMENT_POSITION"], indirect=True)
def test_document_position_data_types(document_data, proto_data_loader):
    """Decoded DOCUMENT_POSITION entries must match DocumentReader's types:
    id: int, stringIdxToPositionIdx: list[int], positions: list[list[float]].
    """
    file_path, raw, _ = document_data
    entries = proto_data_loader(file_path, gzip.decompress(raw))

    assert isinstance(entries, list)
    for entry in entries:
        assert isinstance(entry, dict)
        assert isinstance(entry["id"], int)
        assert isinstance(entry["stringIdxToPositionIdx"], list)
        assert isinstance(entry["positions"], list)
        for position in entry["positions"]:
            assert isinstance(position, list)
            for coordinate in position:
                assert isinstance(coordinate, float)
@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE"], indirect=True)
def test_document_structure_types(document_data, proto_data_loader):
    """Decoded DOCUMENT_STRUCTURE must match DocumentReader's EntryData shape:
    root is a dict with type: str, treeId/atomicBlockIds/pageNumbers: list[int],
    properties: nested dicts, children: list[dict].
    """
    file_path, raw, _ = document_data
    structure = proto_data_loader(file_path, gzip.decompress(raw))

    assert isinstance(structure, dict)
    root = structure["root"]
    assert isinstance(root, dict)
    assert isinstance(root["type"], str)

    # The three int-list fields share one check.
    for list_field in ("treeId", "atomicBlockIds", "pageNumbers"):
        assert isinstance(root[list_field], list)
        for item in root[list_field]:
            assert isinstance(item, int)

    # NOTE(review): the DocumentReader comment describes properties as
    # dict[str, str], but these assertions require dicts two levels deep —
    # confirm which schema is correct.
    for prop in root["properties"].values():
        assert isinstance(prop, dict)
        for nested in prop.values():
            assert isinstance(nested, dict)

    assert isinstance(root["children"], list)
    for child in root["children"]:
        assert isinstance(child, dict)
@pytest.mark.parametrize("document_data", ["DOCUMENT_TEXT"], indirect=True)
def test_document_text_data_types(document_data, proto_data_loader):
    """Decoded DOCUMENT_TEXT entries must match DocumentReader's types:
    id/page/numberOnPage/start/end: int, searchText: str, lineBreaks: list[int].
    """
    file_path, raw, _ = document_data
    entries = proto_data_loader(file_path, gzip.decompress(raw))

    assert isinstance(entries, list)
    for entry in entries:
        assert isinstance(entry, dict)
        for int_field in ("id", "page", "numberOnPage", "start", "end"):
            assert isinstance(entry[int_field], int)
        assert isinstance(entry["searchText"], str)
        for line_break in entry["lineBreaks"]:
            assert isinstance(line_break, int)