import gzip
import json
from pathlib import Path

import pytest
from deepdiff import DeepDiff

from pyinfra.storage.proto_data_loader import ProtoDataLoader

# Counter for the commented-out debug dumps in test_proto_data_loader_end2end below.
enum = 1


@pytest.fixture
def test_data_dir():
    return Path(__file__).parents[1] / "data"


@pytest.fixture
def document_data(request, test_data_dir) -> tuple[Path, bytes, dict | list]:
    doc_type = request.param

    # Search for the file pair belonging to this doc_type - there should be exactly one proto
    # and one json file per document type (see the layout note after this fixture).
    input_file_path = next(test_data_dir.glob(f"*.{doc_type}.proto.gz"), None)
    target_file_path = next(test_data_dir.glob(f"*.{doc_type}.json.gz"), None)

    # Fail with a clear message instead of an AttributeError if a fixture file is missing.
    if input_file_path is None or target_file_path is None:
        pytest.fail(f"Missing test data for document type {doc_type!r} in {test_data_dir}")

    input_data = input_file_path.read_bytes()
    target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))

    return input_file_path, input_data, target_data
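
# Assumed test-data layout, inferred from the glob patterns above: for every document type
# exercised here the data directory holds one gzipped proto input and one gzipped JSON
# reference with the same stem, e.g. a.DOCUMENT_TEXT.proto.gz next to a.DOCUMENT_TEXT.json.gz.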


@pytest.fixture
def proto_data_loader():
    return ProtoDataLoader()


@pytest.fixture
def should_match():
    # File names the loader is expected to recognize (the naming convention is noted below).
    return [
        "a.DOCUMENT_STRUCTURE.proto.gz",
        "a.DOCUMENT_TEXT.proto.gz",
        "a.DOCUMENT_PAGES.proto.gz",
        "a.DOCUMENT_POSITION.proto.gz",
        "b.DOCUMENT_STRUCTURE.proto",
        "b.DOCUMENT_TEXT.proto",
        "b.DOCUMENT_PAGES.proto",
        "b.DOCUMENT_POSITION.proto",
        "c.STRUCTURE.proto.gz",
        "c.TEXT.proto.gz",
        "c.PAGES.proto.gz",
        "c.POSITION.proto.gz",
    ]
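
# Naming convention covered above: both plain .proto and gzipped .proto.gz suffixes, and
# document types both with and without the DOCUMENT_ prefix (e.g. b.DOCUMENT_TEXT.proto
# alongside c.TEXT.proto.gz).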


@pytest.mark.xfail(
    reason="FIXME: The test is not yet stable, but it has to pass before we can deploy the code. "
    "Right now we do not have parity between the proto and the json data."
)
# DOCUMENT_POSITION is a very large file, so this test takes a long time. It is included by
# default; remove it from the list below or deselect it (e.g. pytest -k "not DOCUMENT_POSITION")
# if you need a faster run.
@pytest.mark.parametrize(
    "document_data",
    ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES", "DOCUMENT_POSITION"],
    indirect=True,
)
def test_proto_data_loader_end2end(document_data, proto_data_loader):
    file_path, data, target = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    loaded_data_str = json.dumps(loaded_data, sort_keys=True)
    target_str = json.dumps(target, sort_keys=True)

    # If you want to look at the files in more detail, uncomment the code below.
    # global enum
    # with open(f"input-{enum}.json", "w") as f:
    #     json.dump(target, f, sort_keys=True, indent=4)

    # with open(f"output-{enum}.json", "w") as f:
    #     json.dump(loaded_data, f, sort_keys=True, indent=4)
    # enum += 1

    # Compare the JSON round-tripped structures rather than the raw strings so that
    # ignore_order can actually take effect.
    diff = DeepDiff(json.loads(loaded_data_str), json.loads(target_str), ignore_order=True)

    # FIXME: remove this block when the test is stable
    # if diff:
    #     with open("diff_test.json", "w") as f:
    #         f.write(diff.to_json(indent=4))

    assert not diff


def test_proto_data_loader_unknown_document_type(proto_data_loader):
    # The loader is expected to return a falsy result for unrecognized document types.
    assert not proto_data_loader("unknown_document_type.proto", b"")


def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
    for file_name in should_match:
        assert proto_data_loader._match(file_name) is not None


@pytest.mark.parametrize("document_data", ["DOCUMENT_PAGES"], indirect=True)
def test_document_page_types(document_data, proto_data_loader):
    # types from document reader:
    # number: int
    # height: int
    # width: int
    # rotation: int

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    # Since every field is expected to be an int anyway, we can check all values in one sweep.
    assert all(all(isinstance(value, int) for value in entry.values()) for entry in loaded_data)


@pytest.mark.parametrize("document_data", ["DOCUMENT_POSITION"], indirect=True)
def test_document_position_data_types(document_data, proto_data_loader):
    # types from document reader:
    # id: int
    # stringIdxToPositionIdx: list[int]
    # positions: list[list[float]]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["stringIdxToPositionIdx"], list)
        assert isinstance(entry["positions"], list)
        assert all(isinstance(position, list) for position in entry["positions"])
        assert all(
            all(isinstance(coordinate, float) for coordinate in position) for position in entry["positions"]
        )


@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE"], indirect=True)
def test_document_structure_types(document_data, proto_data_loader):
    # types from document reader for DocumentStructure:
    # root: dict

    # types from document reader for EntryData:
    # type: str
    # tree_id: list[int]
    # atomic_block_ids: list[int]
    # page_numbers: list[int]
    # properties: dict[str, str]
    # children: list[dict]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, dict)
    assert isinstance(loaded_data["root"], dict)
    assert isinstance(loaded_data["root"]["type"], str)
    assert isinstance(loaded_data["root"]["treeId"], list)
    assert isinstance(loaded_data["root"]["atomicBlockIds"], list)
    assert isinstance(loaded_data["root"]["pageNumbers"], list)
    assert isinstance(loaded_data["root"]["children"], list)

    assert all(isinstance(value, int) for value in loaded_data["root"]["treeId"])
    assert all(isinstance(value, int) for value in loaded_data["root"]["atomicBlockIds"])
    assert all(isinstance(value, int) for value in loaded_data["root"]["pageNumbers"])
    assert all(isinstance(value, dict) for value in loaded_data["root"]["properties"].values())
    assert all(
        all(isinstance(value, dict) for value in entry.values())
        for entry in loaded_data["root"]["properties"].values()
    )
    assert all(isinstance(value, dict) for value in loaded_data["root"]["children"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_TEXT"], indirect=True)
def test_document_text_data_types(document_data, proto_data_loader):
    # types from document reader:
    # id: int
    # page: int
    # search_text: str
    # number_on_page: int
    # start: int
    # end: int
    # lineBreaks: list[int]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["page"], int)
        assert isinstance(entry["searchText"], str)
        assert isinstance(entry["numberOnPage"], int)
        assert isinstance(entry["start"], int)
        assert isinstance(entry["end"], int)
        assert all(isinstance(value, int) for value in entry["lineBreaks"])
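

# Illustrative only, not used by the tests: a minimal sketch of the entry shapes the type
# tests above assert for the loader output. The class names and the use of typing.TypedDict
# are assumptions; keys and value types are taken from the assertions and the
# "types from document reader" comments (note that the loader output uses camelCase keys
# such as searchText, while the reader comments list snake_case names).
from typing import TypedDict  # kept local to this sketch; would normally live with the top-level imports


class PageEntry(TypedDict):
    number: int
    height: int
    width: int
    rotation: int


class PositionEntry(TypedDict):
    id: int
    stringIdxToPositionIdx: list[int]
    positions: list[list[float]]


class TextEntry(TypedDict):
    id: int
    page: int
    searchText: str
    numberOnPage: int
    start: int
    end: int
    lineBreaks: list[int]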