"""Tests for ``ProtoDataLoader`` using gzipped proto sample files."""
import gzip
from pathlib import Path
from typing import Tuple

import pytest

from pyinfra.storage.proto_data_loader import ProtoDataLoader

# File-name stem shared by all sample proto files used in this module.
_SAMPLE_DOCUMENT_ID = "72ea04dfdbeb277f37b9eb127efb0896"


def _read_sample(data_dir: Path, document_type: str) -> Tuple[Path, bytes]:
    """Return the path and raw (still gzipped) bytes of one sample file.

    Args:
        data_dir: Directory that holds the sample files.
        document_type: Document type token embedded in the file name,
            e.g. ``"DOCUMENT_TEXT"``.

    Returns:
        A ``(file_path, data)`` pair where ``data`` is the file content
        exactly as stored on disk.
    """
    file_path = data_dir / f"{_SAMPLE_DOCUMENT_ID}.{document_type}.proto.gz"
    return file_path, file_path.read_bytes()


@pytest.fixture
def test_data_dir() -> Path:
    """Return the ``data`` directory next to this file's parent directory."""
    return Path(__file__).parents[1] / "data"


@pytest.fixture
def document_structure_document(test_data_dir) -> Tuple[Path, bytes]:
    """Path and gzipped bytes of the DOCUMENT_STRUCTURE sample."""
    return _read_sample(test_data_dir, "DOCUMENT_STRUCTURE")


@pytest.fixture
def document_text_document(test_data_dir) -> Tuple[Path, bytes]:
    """Path and gzipped bytes of the DOCUMENT_TEXT sample."""
    return _read_sample(test_data_dir, "DOCUMENT_TEXT")


@pytest.fixture
def document_pages_document(test_data_dir) -> Tuple[Path, bytes]:
    """Path and gzipped bytes of the DOCUMENT_PAGES sample."""
    return _read_sample(test_data_dir, "DOCUMENT_PAGES")


@pytest.fixture
def document_position_document(test_data_dir) -> Tuple[Path, bytes]:
    """Path and gzipped bytes of the DOCUMENT_POSITION sample."""
    return _read_sample(test_data_dir, "DOCUMENT_POSITION")


@pytest.fixture
def proto_data_loader():
    """Return a fresh ``ProtoDataLoader`` instance."""
    return ProtoDataLoader()


@pytest.fixture
def should_match():
    """File names that the loader's ``_match`` is expected to recognise."""
    return [
        "a.DOCUMENT_STRUCTURE.proto.gz",
        "a.DOCUMENT_TEXT.proto.gz",
        "a.DOCUMENT_PAGES.proto.gz",
        "a.DOCUMENT_POSITION.proto.gz",
        "b.DOCUMENT_STRUCTURE.proto",
        "b.DOCUMENT_TEXT.proto",
        "b.DOCUMENT_PAGES.proto",
        "b.DOCUMENT_POSITION.proto",
        "c.STRUCTURE.proto.gz",
        "c.TEXT.proto.gz",
        "c.PAGES.proto.gz",
        "c.POSITION.proto.gz",
    ]


@pytest.mark.parametrize(
    "document_fixture",
    [
        "document_structure_document",
        "document_text_document",
        "document_pages_document",
        "document_position_document",
    ],
)
def test_proto_data_loader_end2end(document_fixture, request, proto_data_loader):
    """Each sample file loads into a ``dict`` or a ``list``."""
    # Fixtures cannot be passed to ``parametrize`` directly, so they are
    # resolved by name at run time.
    file_path, compressed = request.getfixturevalue(document_fixture)
    # The loader is handed the decompressed payload together with the
    # original (``.gz``) file path.
    payload = gzip.decompress(compressed)

    loaded_data = proto_data_loader(file_path, payload)

    # TODO: Right now, we don't have access to proto-json pairs to compare the loaded data with the expected data.
    # If this becomes available, please update this test to compare the loaded data with the expected data.
    assert isinstance(loaded_data, (dict, list))


def test_proto_data_loader_unknown_document_type(proto_data_loader):
    """A file name without a known document type raises ``ValueError``."""
    with pytest.raises(ValueError):
        proto_data_loader("unknown_document_type.proto", b"")


def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
    """Every name in ``should_match`` is recognised by ``_match``."""
    for file_name in should_match:
        # The message identifies the offending name when the loop fails.
        assert proto_data_loader._match(file_name) is not None, (
            f"{file_name!r} was not matched"
        )