From 70d3a210a12af54108524f52ca0c03ccf5fbe811 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 18 Jul 2024 12:36:26 +0200 Subject: [PATCH] feat: update data loader tests We now compare the output proto json conversion to expected json files. This revealed multiple differences between the files. FIXED: int64 type was cast into string in Python. We now get proper integers. TODO: Empty fields are omitted by proto, but the jsons have them and the pyinfra implementing services might expect them. We have to test this behaviour and adjust the tests accordingly. --- pyinfra/storage/proto_data_loader.py | 13 +++++ tests/data.dvc | 6 +-- tests/unit_test/proto_data_loader_test.py | 60 +++++++++-------------- 3 files changed, 40 insertions(+), 39 deletions(-) diff --git a/pyinfra/storage/proto_data_loader.py b/pyinfra/storage/proto_data_loader.py index d5bbbac..0b2a9ba 100644 --- a/pyinfra/storage/proto_data_loader.py +++ b/pyinfra/storage/proto_data_loader.py @@ -58,6 +58,7 @@ class ProtoDataLoader: message = schema() message.ParseFromString(data) message_dict = MessageToDict(message) + message_dict = convert_int64_fields(message_dict) return self._unpack(message_dict) @@ -75,3 +76,15 @@ class ProtoDataLoader: return message_dict[key] return message_dict + + +def convert_int64_fields(obj): + # FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python) + if isinstance(obj, dict): + for key, value in obj.items(): + obj[key] = convert_int64_fields(value) + elif isinstance(obj, list): + return [convert_int64_fields(item) for item in obj] + elif isinstance(obj, str) and obj.isdigit(): + return int(obj) + return obj diff --git a/tests/data.dvc b/tests/data.dvc index 2b3d0d0..7162874 100644 --- a/tests/data.dvc +++ b/tests/data.dvc @@ -1,6 +1,6 @@ outs: -- md5: 014d60e407398cd6b15ced3a27235f08.dir - size: 10409309 - nfiles: 4 +- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir + size: 23067894 + nfiles: 8 hash: md5 path: data diff --git 
a/tests/unit_test/proto_data_loader_test.py b/tests/unit_test/proto_data_loader_test.py index 5ea0dae..c100943 100644 --- a/tests/unit_test/proto_data_loader_test.py +++ b/tests/unit_test/proto_data_loader_test.py @@ -1,7 +1,9 @@ import gzip +import json from pathlib import Path import pytest +from deepdiff import DeepDiff from pyinfra.storage.proto_data_loader import ProtoDataLoader @@ -12,31 +14,15 @@ def test_data_dir(): @pytest.fixture -def document_structure_document(test_data_dir) -> (str, bytes): - file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_STRUCTURE.proto.gz" - data = file_path.read_bytes() - return file_path, data +def document_data(request, test_data_dir) -> (str, bytes, dict | list): + doc_type = request.param + input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz" + target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz" + input_data = input_file_path.read_bytes() + target_data = json.loads(gzip.decompress(target_file_path.read_bytes())) -@pytest.fixture -def document_text_document(test_data_dir) -> (str, bytes): - file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_TEXT.proto.gz" - data = file_path.read_bytes() - return file_path, data - - -@pytest.fixture -def document_pages_document(test_data_dir) -> (str, bytes): - file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_PAGES.proto.gz" - data = file_path.read_bytes() - return file_path, data - - -@pytest.fixture -def document_position_document(test_data_dir) -> (str, bytes): - file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_POSITION.proto.gz" - data = file_path.read_bytes() - return file_path, data + return input_file_path, input_data, target_data @pytest.fixture @@ -62,23 +48,25 @@ def should_match(): ] -@pytest.mark.parametrize( - "document_fixture", - [ - "document_structure_document", - "document_text_document", - "document_pages_document", - 
"document_position_document", - ], +@pytest.mark.xfail( + reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data." ) -def test_proto_data_loader_end2end(document_fixture, request, proto_data_loader): - file_path, data = request.getfixturevalue(document_fixture) +@pytest.mark.parametrize( + "document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_POSITION", "DOCUMENT_PAGES"], indirect=True +) +def test_proto_data_loader_end2end(document_data, proto_data_loader): + file_path, data, target = document_data data = gzip.decompress(data) loaded_data = proto_data_loader(file_path, data) - # TODO: Right now, we don't have access to proto-json pairs to compare the loaded data with the expected data. - # If this becomes available, please update this test to compare the loaded data with the expected data. - assert isinstance(loaded_data, dict) or isinstance(loaded_data, list) + diff = DeepDiff(loaded_data, target, ignore_order=True) + + # FIXME: remove this block when the test is stable + if diff: + with open("/tmp/diff.json", "w") as f: + f.write(diff.to_json(indent=2)) + + assert not diff def test_proto_data_loader_unknown_document_type(proto_data_loader):