feat: update data loader tests

We now compare the output proto json conversion to expected json files.
This revealed multiple differences between the files.

FIXED: int64 values were converted to strings in Python. We now get proper
integers.

TODO: Empty fields are omitted by proto, but the JSON files contain them and the
services implementing pyinfra might expect them. We have to test this
behaviour and adjust the tests accordingly.
This commit is contained in:
Julius Unverfehrt 2024-07-18 12:36:26 +02:00
parent f935056fa9
commit 70d3a210a1
3 changed files with 40 additions and 39 deletions

View File

@ -58,6 +58,7 @@ class ProtoDataLoader:
message = schema()
message.ParseFromString(data)
message_dict = MessageToDict(message)
message_dict = convert_int64_fields(message_dict)
return self._unpack(message_dict)
@ -75,3 +76,15 @@ class ProtoDataLoader:
return message_dict[key]
return message_dict
def convert_int64_fields(obj):
    """Recursively convert integer-looking strings in ``obj`` to ``int``.

    Intended for the dict produced from a proto message, where int64 values
    show up as strings instead of integers.

    Args:
        obj: A dict, list, string or any other value. Dicts and lists are
            walked recursively.

    Returns:
        For a dict, the same dict object with its values converted in place.
        For a list, a new list with converted items. For a string that is an
        optionally negative run of ASCII digits, the corresponding ``int``.
        Any other value is returned unchanged.

    Note:
        This is a heuristic: a genuine string field whose content happens to
        be all digits is converted as well.
    """
    # FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python)
    if isinstance(obj, dict):
        # Only existing keys are reassigned, so mutating during iteration is safe.
        for key, value in obj.items():
            obj[key] = convert_int64_fields(value)
        return obj
    if isinstance(obj, list):
        return [convert_int64_fields(item) for item in obj]
    if isinstance(obj, str):
        # Signed int64 values carry a leading "-"; strip it before the digit check.
        digits = obj[1:] if obj.startswith("-") else obj
        # isascii() guards against characters like "²", for which isdigit()
        # is True but int() raises ValueError.
        if digits.isascii() and digits.isdigit():
            return int(obj)
    return obj

View File

@ -1,6 +1,6 @@
outs:
- md5: 014d60e407398cd6b15ced3a27235f08.dir
size: 10409309
nfiles: 4
- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir
size: 23067894
nfiles: 8
hash: md5
path: data

View File

@ -1,7 +1,9 @@
import gzip
import json
from pathlib import Path
import pytest
from deepdiff import DeepDiff
from pyinfra.storage.proto_data_loader import ProtoDataLoader
@ -12,31 +14,15 @@ def test_data_dir():
@pytest.fixture
def document_structure_document(test_data_dir) -> (str, bytes):
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_STRUCTURE.proto.gz"
data = file_path.read_bytes()
return file_path, data
def document_data(request, test_data_dir) -> (str, bytes, dict | list):
doc_type = request.param
input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz"
target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz"
input_data = input_file_path.read_bytes()
target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))
@pytest.fixture
def document_text_document(test_data_dir) -> (str, bytes):
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_TEXT.proto.gz"
data = file_path.read_bytes()
return file_path, data
@pytest.fixture
def document_pages_document(test_data_dir) -> (str, bytes):
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_PAGES.proto.gz"
data = file_path.read_bytes()
return file_path, data
@pytest.fixture
def document_position_document(test_data_dir) -> (str, bytes):
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_POSITION.proto.gz"
data = file_path.read_bytes()
return file_path, data
return input_file_path, input_data, target_data
@pytest.fixture
@ -62,23 +48,25 @@ def should_match():
]
@pytest.mark.parametrize(
"document_fixture",
[
"document_structure_document",
"document_text_document",
"document_pages_document",
"document_position_document",
],
@pytest.mark.xfail(
reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
)
def test_proto_data_loader_end2end(document_fixture, request, proto_data_loader):
file_path, data = request.getfixturevalue(document_fixture)
@pytest.mark.parametrize(
"document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_POSITION", "DOCUMENT_PAGES"], indirect=True
)
def test_proto_data_loader_end2end(document_data, proto_data_loader):
file_path, data, target = document_data
data = gzip.decompress(data)
loaded_data = proto_data_loader(file_path, data)
# TODO: Right now, we don't have access to proto-json pairs to compare the loaded data with the expected data.
# If this becomes available, please update this test to compare the loaded data with the expected data.
assert isinstance(loaded_data, dict) or isinstance(loaded_data, list)
diff = DeepDiff(loaded_data, target, ignore_order=True)
# FIXME: remove this block when the test is stable
if diff:
with open("/tmp/diff.json", "w") as f:
f.write(diff.to_json(indent=2))
assert not diff
def test_proto_data_loader_unknown_document_type(proto_data_loader):