"""Tests for ``ProtoDataLoader``: end-to-end parity with JSON targets and output types."""

import gzip
import json
from pathlib import Path

import pytest
from deepdiff import DeepDiff

from pyinfra.storage.proto_data_loader import ProtoDataLoader

# Counter used only by the optional debug dump in test_proto_data_loader_end2end.
enum = 1


@pytest.fixture
def test_data_dir():
    """Return the ``data`` directory located in the parent of this file's directory."""
    return Path(__file__).parents[1] / "data"


@pytest.fixture
def document_data(request, test_data_dir) -> tuple[Path, bytes, dict | list]:
    """Provide the proto input and JSON target for the parametrized document type.

    The document type is taken from ``request.param`` (indirect parametrization).

    Returns:
        A tuple of (path of the ``.proto.gz`` file, its raw gzip-compressed bytes,
        the decompressed and parsed content of the matching ``.json.gz`` file).

    Raises:
        FileNotFoundError: If either the proto or the json file is missing.
    """
    doc_type = request.param

    # Search for relevant doc_type file pairs - there should be one proto and one json file per document type
    input_file_path = next(test_data_dir.glob(f"*.{doc_type}.proto.gz"), None)
    target_file_path = next(test_data_dir.glob(f"*.{doc_type}.json.gz"), None)

    # Fail with a clear message instead of an AttributeError on None.
    if input_file_path is None:
        raise FileNotFoundError(f"No '*.{doc_type}.proto.gz' file found in {test_data_dir}")
    if target_file_path is None:
        raise FileNotFoundError(f"No '*.{doc_type}.json.gz' file found in {test_data_dir}")

    input_data = input_file_path.read_bytes()
    target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))

    return input_file_path, input_data, target_data


@pytest.fixture
def proto_data_loader():
    """Return a fresh ``ProtoDataLoader`` instance."""
    return ProtoDataLoader()


@pytest.fixture
def should_match():
    """Return file names that the loader's ``_match`` is expected to recognise."""
    return [
        "a.DOCUMENT_STRUCTURE.proto.gz",
        "a.DOCUMENT_TEXT.proto.gz",
        "a.DOCUMENT_PAGES.proto.gz",
        "a.DOCUMENT_POSITION.proto.gz",
        "b.DOCUMENT_STRUCTURE.proto",
        "b.DOCUMENT_TEXT.proto",
        "b.DOCUMENT_PAGES.proto",
        "b.DOCUMENT_POSITION.proto",
        "c.STRUCTURE.proto.gz",
        "c.TEXT.proto.gz",
        "c.PAGES.proto.gz",
        "c.POSITION.proto.gz",
    ]


def _load_document(document_data, proto_data_loader):
    """Decompress the proto bytes from ``document_data`` and run them through the loader."""
    file_path, compressed, _ = document_data
    return proto_data_loader(file_path, gzip.decompress(compressed))


@pytest.mark.xfail(
    reason="FIXME: The test is not stable, but has to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
)
# DOCUMENT_POSITION is a very large file, so this test is slow for that document type.
# It used to be opt-in; it is now part of the parametrization by default.
@pytest.mark.parametrize(
    "document_data",
    ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES", "DOCUMENT_POSITION"],
    indirect=True,
)
def test_proto_data_loader_end2end(document_data, proto_data_loader):
    """Check that the loaded proto data serialises to the same JSON as the target file."""
    _, _, target = document_data
    loaded_data = _load_document(document_data, proto_data_loader)

    # Serialise with sorted keys so that dict key order cannot cause a mismatch.
    loaded_data_str = json.dumps(loaded_data, sort_keys=True)
    target_str = json.dumps(target, sort_keys=True)

    # If you want to look at the files in more detail uncomment code below
    # global enum
    # with open(f"input-{enum}.json", "w") as f:
    #     json.dump(target, f, sort_keys=True, indent=4)
    # with open(f"output-{enum}.json", "w") as f:
    #     json.dump(loaded_data, f, sort_keys=True, indent=4)
    # enum += 1

    # NOTE(review): both arguments are JSON strings, so ignore_order presumably has no
    # effect here - consider diffing the parsed structures instead; confirm intent.
    diff = DeepDiff(loaded_data_str, target_str, ignore_order=True)

    # FIXME: remove this block when the test is stable
    # if diff:
    #     with open(f"diff_test.json", "w") as f:
    #         f.write(diff.to_json(indent=4))

    assert not diff


def test_proto_data_loader_unknown_document_type(proto_data_loader):
    """An unrecognised file name with empty data must yield a falsy result."""
    assert not proto_data_loader("unknown_document_type.proto", b"")


def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
    """Every name in ``should_match`` must produce a non-None ``_match`` result."""
    for file_name in should_match:
        assert proto_data_loader._match(file_name) is not None, f"{file_name} was not matched"


@pytest.mark.parametrize("document_data", ["DOCUMENT_PAGES"], indirect=True)
def test_document_page_types(document_data, proto_data_loader):
    """Loaded page data must be a list of dicts whose values are all ints."""
    # types from document reader
    # number: int
    # height: int
    # width: int
    # rotation: int

    loaded_data = _load_document(document_data, proto_data_loader)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    # since all values need to be int anyway we can summarize it
    assert all(all(isinstance(value, int) for value in entry.values()) for entry in loaded_data)


@pytest.mark.parametrize("document_data", ["DOCUMENT_POSITION"], indirect=True)
def test_document_position_data_types(document_data, proto_data_loader):
    """Loaded position data must be a list of dicts with the expected field types."""
    # types from document reader
    # id: int
    # stringIdxToPositionIdx: list[int]
    # positions: list[list[float]]

    loaded_data = _load_document(document_data, proto_data_loader)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["stringIdxToPositionIdx"], list)
        assert isinstance(entry["positions"], list)
        assert all(isinstance(position, list) for position in entry["positions"])
        assert all(all(isinstance(coordinate, float) for coordinate in position) for position in entry["positions"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE"], indirect=True)
def test_document_structure_types(document_data, proto_data_loader):
    """Loaded structure data must be a dict whose ``root`` entry has the expected field types."""
    # types from document reader for DocumentStructure
    # root: dict

    # types from document reader for EntryData
    # type: str
    # tree_id: list[int]
    # atomic_block_ids: list[int]
    # page_numbers: list[int]
    # properties: dict[str, str]
    # children: list[dict]

    loaded_data = _load_document(document_data, proto_data_loader)

    assert isinstance(loaded_data, dict)
    root = loaded_data["root"]
    assert isinstance(root, dict)
    assert isinstance(root["type"], str)
    assert isinstance(root["treeId"], list)
    assert isinstance(root["atomicBlockIds"], list)
    assert isinstance(root["pageNumbers"], list)
    assert isinstance(root["children"], list)

    assert all(isinstance(value, int) for value in root["treeId"])
    assert all(isinstance(value, int) for value in root["atomicBlockIds"])
    assert all(isinstance(value, int) for value in root["pageNumbers"])
    # NOTE(review): the type listing above says properties is dict[str, str], but the
    # assertions below require nested dicts - confirm which one is intended.
    assert all(isinstance(value, dict) for value in root["properties"].values())
    assert all(
        all(isinstance(value, dict) for value in entry.values()) for entry in root["properties"].values()
    )
    assert all(isinstance(value, dict) for value in root["children"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_TEXT"], indirect=True)
def test_document_text_data_types(document_data, proto_data_loader):
    """Loaded text data must be a list of dicts with the expected field types."""
    # types from document reader
    # id: int
    # page: int
    # search_text: str
    # number_on_page: int
    # start: int
    # end: int
    # lineBreaks: list[int]

    loaded_data = _load_document(document_data, proto_data_loader)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["page"], int)
        assert isinstance(entry["searchText"], str)
        assert isinstance(entry["numberOnPage"], int)
        assert isinstance(entry["start"], int)
        assert isinstance(entry["end"], int)
        assert all(isinstance(value, int) for value in entry["lineBreaks"])