pyinfra/tests/unit_test/proto_data_loader_test.py

import gzip
import json
import difflib
from pathlib import Path

from google.protobuf import json_format

import pytest
from deepdiff import DeepDiff

from pyinfra.storage.proto_data_loader import ProtoDataLoader


@pytest.fixture
def test_data_dir():
    return Path(__file__).parents[1] / "data"


@pytest.fixture
def document_data(request, test_data_dir) -> (str, bytes, dict | list):
    doc_type = request.param
    input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz"
    # input_file_path = test_data_dir / f"6ff38b030fa131e8e39bf5598513f981.{doc_type}.proto.gz" # new proto schema
    # input_file_path = test_data_dir / f"8d1e6798a2c5dc14869e5b3ad8ae501f.{doc_type}.proto.gz"
    target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz"

    input_data = input_file_path.read_bytes()
    target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))

    return input_file_path, input_data, target_data


@pytest.fixture
def proto_data_loader():
    return ProtoDataLoader()


@pytest.fixture
def should_match():
    return [
        "a.DOCUMENT_STRUCTURE.proto.gz",
        "a.DOCUMENT_TEXT.proto.gz",
        "a.DOCUMENT_PAGES.proto.gz",
        "a.DOCUMENT_POSITION.proto.gz",
        "b.DOCUMENT_STRUCTURE.proto",
        "b.DOCUMENT_TEXT.proto",
        "b.DOCUMENT_PAGES.proto",
        "b.DOCUMENT_POSITION.proto",
        "c.STRUCTURE.proto.gz",
        "c.TEXT.proto.gz",
        "c.PAGES.proto.gz",
        "c.POSITION.proto.gz",
    ]


@pytest.mark.xfail(
    reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
)
# As DOCUMENT_POSITION is a very large file, the test takes forever. If you want to test it, add "DOCUMENT_POSITION" to the list below.
@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES"], indirect=True)
def test_proto_data_loader_end2end(document_data, proto_data_loader):
    file_path, data, target = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    # proto_json = json_format.MessageToJson(loaded_data)

    loaded_data_str = json.dumps(loaded_data, sort_keys=True)
    target_str = json.dumps(target, sort_keys=True)

    # diff = difflib.unified_diff(loaded_data_str.splitlines(), target_str.splitlines())

    # for line in diff:
    #     print(line)

    # diff = DeepDiff(loaded_data, target, ignore_order=True)
    # print(diff)

    diff = DeepDiff(sorted(loaded_data_str), sorted(target_str), ignore_order=True)

    # FIXME: remove this block when the test is stable
    # if diff:
    #     print(diff.to_json(indent=2))
    #     with open(f"diff_{file_path}.json", "w") as f:
    #         f.write(diff.to_json(indent=2))

    assert not diff


def test_proto_data_loader_unknown_document_type(proto_data_loader):
    assert not proto_data_loader("unknown_document_type.proto", b"")


def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
    for file_name in should_match:
        assert proto_data_loader._match(file_name) is not None


@pytest.mark.parametrize("document_data", ["DOCUMENT_PAGES"], indirect=True)
def test_document_page_types(document_data, proto_data_loader):
    # types from document reader
    # number: int
    # height: int
    # width: int
    # rotation: int

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    # since all values need to be int anyway we can summarize it
    assert all(all(isinstance(value, int) for value in entry.values()) for entry in loaded_data)


@pytest.mark.parametrize("document_data", ["DOCUMENT_POSITION"], indirect=True)
def test_document_position_data_types(document_data, proto_data_loader):
    # types from document reader
    # id: int
    # stringIdxToPositionIdx: list[int]
    # positions: list[list[float]]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["stringIdxToPositionIdx"], list)
        assert isinstance(entry["positions"], list)
        assert all(isinstance(position, list) for position in entry["positions"])
        assert all(all(isinstance(coordinate, float) for coordinate in position) for position in entry["positions"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE"], indirect=True)
def test_document_structure_types(document_data, proto_data_loader):
    # types from document reader for DocumentStructure
    # root: dict

    # types from document reader for EntryData
    # type: str
    # tree_id: list[int]
    # atomic_block_ids: list[int]
    # page_numbers: list[int]
    # properties: dict[str, str]
    # children: list[dict]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, dict)
    assert isinstance(loaded_data["root"], dict)
    assert isinstance(loaded_data["root"]["type"], str)
    assert isinstance(loaded_data["root"]["treeId"], list)
    assert isinstance(loaded_data["root"]["atomicBlockIds"], list)
    assert isinstance(loaded_data["root"]["pageNumbers"], list)
    assert isinstance(loaded_data["root"]["children"], list)

    assert all(isinstance(value, int) for value in loaded_data["root"]["treeId"])
    assert all(isinstance(value, int) for value in loaded_data["root"]["atomicBlockIds"])
    assert all(isinstance(value, int) for value in loaded_data["root"]["pageNumbers"])
    assert all(isinstance(value, dict) for value in loaded_data["root"]["properties"].values())
    assert all(
        all(isinstance(value, dict) for value in entry.values()) for entry in loaded_data["root"]["properties"].values()
    )
    assert all(isinstance(value, dict) for value in loaded_data["root"]["children"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_TEXT"], indirect=True)
def test_document_text_data_types(document_data, proto_data_loader):
    # types from document reader
    # id: int
    # page: int
    # search_text: str
    # number_on_page: int
    # start: int
    # end: int
    # lineBreaks: list[int]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["page"], int)
        assert isinstance(entry["searchText"], str)
        assert isinstance(entry["numberOnPage"], int)
        assert isinstance(entry["start"], int)
        assert isinstance(entry["end"], int)
        assert all(isinstance(value, int) for value in entry["lineBreaks"])