feat: update data loader tests

We now compare the output of the proto-to-JSON conversion to expected JSON files. This revealed multiple differences between the files. FIXED: int64 fields were cast to strings in Python; we now get proper integers. TODO: Empty fields are omitted by proto, but the JSON files contain them, and the pyinfra-implementing services might expect them. We have to test this behaviour and adjust the tests accordingly.
2024-07-18 12:36:26 +02:00 · 2024-07-18 12:36:26 +02:00 · 70d3a210a1
commit 70d3a210a1
parent f935056fa9
3 changed files with 40 additions and 39 deletions
--- a/pyinfra/storage/proto_data_loader.py
+++ b/pyinfra/storage/proto_data_loader.py
@ -58,6 +58,7 @@ class ProtoDataLoader:
        message = schema()
        message.ParseFromString(data)
        message_dict = MessageToDict(message)
+        message_dict = convert_int64_fields(message_dict)

        return self._unpack(message_dict)

@ -75,3 +76,15 @@ class ProtoDataLoader:
                return message_dict[key]

        return message_dict
+
+
+def convert_int64_fields(obj):
+    # FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python)
+    if isinstance(obj, dict):
+        for key, value in obj.items():
+            obj[key] = convert_int64_fields(value)
+    elif isinstance(obj, list):
+        return [convert_int64_fields(item) for item in obj]
+    elif isinstance(obj, str) and obj.isdigit():
+        return int(obj)
+    return obj
--- a/tests/data.dvc
+++ b/tests/data.dvc
@ -1,6 +1,6 @@
 outs:
- md5: 014d60e407398cd6b15ced3a27235f08.dir
-  size: 10409309
-  nfiles: 4
+- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir
+  size: 23067894
+  nfiles: 8
  hash: md5
  path: data
--- a/tests/unit_test/proto_data_loader_test.py
+++ b/tests/unit_test/proto_data_loader_test.py
@ -1,7 +1,9 @@
 import gzip
+import json
 from pathlib import Path

 import pytest
+from deepdiff import DeepDiff

 from pyinfra.storage.proto_data_loader import ProtoDataLoader

@ -12,31 +14,15 @@ def test_data_dir():


@pytest.fixture
-def document_structure_document(test_data_dir) -> (str, bytes):
-    file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_STRUCTURE.proto.gz"
-    data = file_path.read_bytes()
-    return file_path, data
+def document_data(request, test_data_dir) -> (str, bytes, dict | list):
+    doc_type = request.param
+    input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz"
+    target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz"

+    input_data = input_file_path.read_bytes()
+    target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))

-@pytest.fixture
-def document_text_document(test_data_dir) -> (str, bytes):
-    file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_TEXT.proto.gz"
-    data = file_path.read_bytes()
-    return file_path, data
-
-
-@pytest.fixture
-def document_pages_document(test_data_dir) -> (str, bytes):
-    file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_PAGES.proto.gz"
-    data = file_path.read_bytes()
-    return file_path, data
-
-
-@pytest.fixture
-def document_position_document(test_data_dir) -> (str, bytes):
-    file_path = test_data_dir / "72ea04dfdbeb277f37b9eb127efb0896.DOCUMENT_POSITION.proto.gz"
-    data = file_path.read_bytes()
-    return file_path, data
+    return input_file_path, input_data, target_data


@pytest.fixture
@ -62,23 +48,25 @@ def should_match():
    ]


-@pytest.mark.parametrize(
-    "document_fixture",
-    [
-        "document_structure_document",
-        "document_text_document",
-        "document_pages_document",
-        "document_position_document",
-    ],
+@pytest.mark.xfail(
+    reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
 )
-def test_proto_data_loader_end2end(document_fixture, request, proto_data_loader):
-    file_path, data = request.getfixturevalue(document_fixture)
+@pytest.mark.parametrize(
+    "document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_POSITION", "DOCUMENT_PAGES"], indirect=True
+)
+def test_proto_data_loader_end2end(document_data, proto_data_loader):
+    file_path, data, target = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

-    # TODO: Right now, we don't have access to proto-json pairs to compare the loaded data with the expected data.
-    #  If this becomes available, please update this test to compare the loaded data with the expected data.
-    assert isinstance(loaded_data, dict) or isinstance(loaded_data, list)
+    diff = DeepDiff(loaded_data, target, ignore_order=True)
+
+    # FIXME: remove this block when the test is stable
+    if diff:
+        with open("/tmp/diff.json", "w") as f:
+            f.write(diff.to_json(indent=2))
+
+    assert not diff


 def test_proto_data_loader_unknown_document_type(proto_data_loader):