feat: update data loader tests
We now compare the output proto json conversion to expected json files. This revealed multiple differences between the file. FIXED: int64 type was cast into string in python. We now get proper integers TODO: Empty fields are omitted by proto, but the jsons have them and the pyinfra implementing services might expect them. We have to test this behaviour and adjusts the tests accordingly.
This commit is contained in:
parent
f935056fa9
commit
70d3a210a1
@ -58,6 +58,7 @@ class ProtoDataLoader:
|
||||
message = schema()
|
||||
message.ParseFromString(data)
|
||||
message_dict = MessageToDict(message)
|
||||
message_dict = convert_int64_fields(message_dict)
|
||||
|
||||
return self._unpack(message_dict)
|
||||
|
||||
@ -75,3 +76,15 @@ class ProtoDataLoader:
|
||||
return message_dict[key]
|
||||
|
||||
return message_dict
|
||||
|
||||
|
||||
def convert_int64_fields(obj):
|
||||
# FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python)
|
||||
if isinstance(obj, dict):
|
||||
for key, value in obj.items():
|
||||
obj[key] = convert_int64_fields(value)
|
||||
elif isinstance(obj, list):
|
||||
return [convert_int64_fields(item) for item in obj]
|
||||
elif isinstance(obj, str) and obj.isdigit():
|
||||
return int(obj)
|
||||
return obj
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
outs:
|
||||
- md5: 014d60e407398cd6b15ced3a27235f08.dir
|
||||
size: 10409309
|
||||
nfiles: 4
|
||||
- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir
|
||||
size: 23067894
|
||||
nfiles: 8
|
||||
hash: md5
|
||||
path: data
|
||||
|
||||
@ -1,7 +1,9 @@
|
||||
import gzip
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from deepdiff import DeepDiff
|
||||
|
||||
from pyinfra.storage.proto_data_loader import ProtoDataLoader
|
||||
|
||||
@ -12,31 +14,15 @@ def test_data_dir():
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def document_structure_document(test_data_dir) -> (str, bytes):
|
||||
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_STRUCTURE.proto.gz"
|
||||
data = file_path.read_bytes()
|
||||
return file_path, data
|
||||
def document_data(request, test_data_dir) -> (str, bytes, dict | list):
|
||||
doc_type = request.param
|
||||
input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz"
|
||||
target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz"
|
||||
|
||||
input_data = input_file_path.read_bytes()
|
||||
target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))
|
||||
|
||||
@pytest.fixture
|
||||
def document_text_document(test_data_dir) -> (str, bytes):
|
||||
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_TEXT.proto.gz"
|
||||
data = file_path.read_bytes()
|
||||
return file_path, data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def document_pages_document(test_data_dir) -> (str, bytes):
|
||||
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_PAGES.proto.gz"
|
||||
data = file_path.read_bytes()
|
||||
return file_path, data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def document_position_document(test_data_dir) -> (str, bytes):
|
||||
file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_POSITION.proto.gz"
|
||||
data = file_path.read_bytes()
|
||||
return file_path, data
|
||||
return input_file_path, input_data, target_data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -62,23 +48,25 @@ def should_match():
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"document_fixture",
|
||||
[
|
||||
"document_structure_document",
|
||||
"document_text_document",
|
||||
"document_pages_document",
|
||||
"document_position_document",
|
||||
],
|
||||
@pytest.mark.xfail(
|
||||
reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
|
||||
)
|
||||
def test_proto_data_loader_end2end(document_fixture, request, proto_data_loader):
|
||||
file_path, data = request.getfixturevalue(document_fixture)
|
||||
@pytest.mark.parametrize(
|
||||
"document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_POSITION", "DOCUMENT_PAGES"], indirect=True
|
||||
)
|
||||
def test_proto_data_loader_end2end(document_data, proto_data_loader):
|
||||
file_path, data, target = document_data
|
||||
data = gzip.decompress(data)
|
||||
loaded_data = proto_data_loader(file_path, data)
|
||||
|
||||
# TODO: Right now, we don't have access to proto-json pairs to compare the loaded data with the expected data.
|
||||
# If this becomes available, please update this test to compare the loaded data with the expected data.
|
||||
assert isinstance(loaded_data, dict) or isinstance(loaded_data, list)
|
||||
diff = DeepDiff(loaded_data, target, ignore_order=True)
|
||||
|
||||
# FIXME: remove this block when the test is stable
|
||||
if diff:
|
||||
with open("/tmp/diff.json", "w") as f:
|
||||
f.write(diff.to_json(indent=2))
|
||||
|
||||
assert not diff
|
||||
|
||||
|
||||
def test_proto_data_loader_unknown_document_type(proto_data_loader):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user