Merge branch 'feat/RES-757-protobuffer' into 'master'

feat: add protobuffer

See merge request knecon/research/pyinfra!87
This commit is contained in:
Jonathan Kössler 2024-08-06 09:44:01 +02:00
commit 789f6a7f7c
27 changed files with 14211 additions and 2296 deletions

2
.dvc/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/config.local
/cache

5
.dvc/config Normal file
View File

@ -0,0 +1,5 @@
[core]
remote = azure
['remote "azure"']
url = azure://pyinfra-dvc
connection_string =

3
.dvcignore Normal file
View File

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

View File

@ -6,6 +6,7 @@
4. [ Module Installation ](#module-installation)
5. [ Scripts ](#scripts)
6. [ Tests ](#tests)
7. [ Protobuf ](#protobuf)
## About
@ -200,3 +201,49 @@ $ python scripts/send_request.py
Tests require a running minio and rabbitmq container, meaning you have to run `docker compose up` in the tests folder
before running the tests.
## Protobuf
### Opentelemetry Compatibility Issue
**Note**: Status: 31/07/2024, the currently used `opentelemetry-exporter-otlp-proto-http` version `1.25.0` requires
a `protobuf` version < `5.x.x` and is not compatible with the latest protobuf version `5.27.x`. This is an [open issue](https://github.com/open-telemetry/opentelemetry-python/issues/3958) in opentelemetry, because [support for 4.25.x ends in Q2 '25](https://protobuf.dev/support/version-support/#python). Therefore, we should keep this in mind and update the dependency once opentelemetry includes support for `protobuf 5.27.x`.
### Install Protobuf Compiler
**Linux**
1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protobuf-25.4.zip`
2. Extract the files under `$HOME/.local` or another directory of your choice
```bash
unzip protoc-<version>-linux-x86_64.zip -d $HOME/.local
```
3. Ensure that the `bin` directory is in your `PATH` by adding the following line to your `.bashrc` or `.zshrc`:
```bash
export PATH="$PATH:$HOME/.local/bin"
```
**MacOS**
1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protoc-25.4-osx-universal_binary.zip`
2. Extract the files to a directory of your choice
3. Copy the executable bin `protoc` to `/usr/local/bin`
```bash
sudo cp /Users/you/location-of-unzipped-dir/bin/protoc /usr/local/bin/
```
4. Open `protoc` in `/usr/local/bin/` via Finder to make it executable, now it should be also on your `PATH`
### Compile Protobuf Files
1. Ensure that the protobuf compiler is installed on your system. You can check this by running:
```bash
protoc --version
```
2. Compile proto files:
```bash
protoc --proto_path=./config/proto --python_out=./pyinfra/proto ./config/proto/*.proto
```
3. Manually adjust import statements in the generated files to match the package structure, e.g.:
`import EntryData_pb2 as EntryData__pb2` -> `import pyinfra.proto.EntryData_pb2 as EntryData__pb2`.
This does not work automatically because the generated files are not in the same directory as the proto files.

13933
bom.json

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
syntax = "proto3";
message AllDocumentPages {
repeated DocumentPage documentPages = 1;
}
message DocumentPage {
// The page number, starting with 1.
int32 number = 1;
// The page height in PDF user units.
int32 height = 2;
// The page width in PDF user units.
int32 width = 3;
// The page rotation as specified by the PDF.
int32 rotation = 4;
}

View File

@ -0,0 +1,28 @@
syntax = "proto3";
message AllDocumentPositionData {
repeated DocumentPositionData documentPositionData = 1;
}
message DocumentPositionData {
// Identifier of the text block.
int64 id = 1;
// For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate.
// This is required due to the text and position coordinates not being equal.
repeated int32 stringIdxToPositionIdx = 2;
// The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block.
// The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner.
// In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.
repeated Position positions = 3;
// Definition of a BoundingBox that contains x, y, width, and height.
message Position {
float x = 1;
float y = 2;
float width = 3;
float height = 4;
}
}

View File

@ -0,0 +1,8 @@
syntax = "proto3";
import "EntryData.proto";
message DocumentStructure {
// The root EntryData represents the Document.
EntryData root = 1;
}

View File

@ -0,0 +1,29 @@
syntax = "proto3";
message AllDocumentTextData {
repeated DocumentTextData documentTextData = 1;
}
message DocumentTextData {
// Identifier of the text block.
int64 id = 1;
// The page the text block occurs on.
int64 page = 2;
// The text of the text block.
string searchText = 3;
// Each text block is assigned a number on a page, starting from 0.
int32 numberOnPage = 4;
// The text blocks are ordered, this number represents the start of the text block as a string offset.
int32 start = 5;
// The text blocks are ordered, this number represents the end of the text block as a string offset.
int32 end = 6;
// The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.
repeated int32 lineBreaks = 7;
}

View File

@ -0,0 +1,27 @@
syntax = "proto3";
import "LayoutEngine.proto";
import "NodeType.proto";
message EntryData {
// Type of the semantic node.
NodeType type = 1;
// Specifies the position in the parsed tree structure.
repeated int32 treeId = 2;
// Specifies the text block IDs associated with this semantic node.
repeated int64 atomicBlockIds = 3;
// Specifies the pages this semantic node appears on.
repeated int64 pageNumbers = 4;
// Some semantic nodes have additional information, this information is stored in this Map.
map<string, string> properties = 5;
// All child Entries of this Entry.
repeated EntryData children = 6;
// Describes the origin of the semantic node.
repeated LayoutEngine engines = 7;
}

View File

@ -0,0 +1,7 @@
syntax = "proto3";
enum LayoutEngine {
ALGORITHM = 0;
AI = 1;
OUTLINE = 2;
}

View File

@ -0,0 +1,14 @@
syntax = "proto3";
enum NodeType {
DOCUMENT = 0;
SECTION = 1;
SUPER_SECTION = 2;
HEADLINE = 3;
PARAGRAPH = 4;
TABLE = 5;
TABLE_CELL = 6;
IMAGE = 7;
HEADER = 8;
FOOTER = 9;
}

1944
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentPage.proto
# Protobuf Python Version: 4.25.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x12\x44ocumentPage.proto"8\n\x10\x41llDocumentPages\x12$\n\rdocumentPages\x18\x01 \x03(\x0b\x32\r.DocumentPage"O\n\x0c\x44ocumentPage\x12\x0e\n\x06number\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x10\n\x08rotation\x18\x04 \x01(\x05\x62\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentPage_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_ALLDOCUMENTPAGES"]._serialized_start = 22
_globals["_ALLDOCUMENTPAGES"]._serialized_end = 78
_globals["_DOCUMENTPAGE"]._serialized_start = 80
_globals["_DOCUMENTPAGE"]._serialized_end = 159
# @@protoc_insertion_point(module_scope)

View File

@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentPositionData.proto
# Protobuf Python Version: 4.25.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x1a\x44ocumentPositionData.proto"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData"\xb6\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a?\n\x08Position\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x12\r\n\x05width\x18\x03 \x01(\x02\x12\x0e\n\x06height\x18\x04 \x01(\x02\x62\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentPositionData_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_start = 30
_globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_end = 108
_globals["_DOCUMENTPOSITIONDATA"]._serialized_start = 111
_globals["_DOCUMENTPOSITIONDATA"]._serialized_end = 293
_globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_start = 230
_globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_end = 293
# @@protoc_insertion_point(module_scope)

View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentStructure.proto
# Protobuf Python Version: 4.25.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
import pyinfra.proto.EntryData_pb2 as EntryData__pb2
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x17\x44ocumentStructure.proto\x1a\x0f\x45ntryData.proto"-\n\x11\x44ocumentStructure\x12\x18\n\x04root\x18\x01 \x01(\x0b\x32\n.EntryDatab\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentStructure_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_DOCUMENTSTRUCTURE"]._serialized_start = 44
_globals["_DOCUMENTSTRUCTURE"]._serialized_end = 89
# @@protoc_insertion_point(module_scope)

View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentTextData.proto
# Protobuf Python Version: 4.25.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x16\x44ocumentTextData.proto"B\n\x13\x41llDocumentTextData\x12+\n\x10\x64ocumentTextData\x18\x01 \x03(\x0b\x32\x11.DocumentTextData"\x86\x01\n\x10\x44ocumentTextData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x0c\n\x04page\x18\x02 \x01(\x03\x12\x12\n\nsearchText\x18\x03 \x01(\t\x12\x14\n\x0cnumberOnPage\x18\x04 \x01(\x05\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\x12\x12\n\nlineBreaks\x18\x07 \x03(\x05\x62\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentTextData_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_ALLDOCUMENTTEXTDATA"]._serialized_start = 26
_globals["_ALLDOCUMENTTEXTDATA"]._serialized_end = 92
_globals["_DOCUMENTTEXTDATA"]._serialized_start = 95
_globals["_DOCUMENTTEXTDATA"]._serialized_end = 229
# @@protoc_insertion_point(module_scope)

View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: EntryData.proto
# Protobuf Python Version: 4.25.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
import pyinfra.proto.LayoutEngine_pb2 as LayoutEngine__pb2
import pyinfra.proto.NodeType_pb2 as NodeType__pb2
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x0f\x45ntryData.proto\x1a\x12LayoutEngine.proto\x1a\x0eNodeType.proto"\x82\x02\n\tEntryData\x12\x17\n\x04type\x18\x01 \x01(\x0e\x32\t.NodeType\x12\x0e\n\x06treeId\x18\x02 \x03(\x05\x12\x16\n\x0e\x61tomicBlockIds\x18\x03 \x03(\x03\x12\x13\n\x0bpageNumbers\x18\x04 \x03(\x03\x12.\n\nproperties\x18\x05 \x03(\x0b\x32\x1a.EntryData.PropertiesEntry\x12\x1c\n\x08\x63hildren\x18\x06 \x03(\x0b\x32\n.EntryData\x12\x1e\n\x07\x65ngines\x18\x07 \x03(\x0e\x32\r.LayoutEngine\x1a\x31\n\x0fPropertiesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "EntryData_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_ENTRYDATA_PROPERTIESENTRY"]._options = None
_globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_options = b"8\001"
_globals["_ENTRYDATA"]._serialized_start = 56
_globals["_ENTRYDATA"]._serialized_end = 314
_globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_start = 265
_globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_end = 314
# @@protoc_insertion_point(module_scope)

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: LayoutEngine.proto
# Protobuf Python Version: 4.25.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b"\n\x12LayoutEngine.proto*2\n\x0cLayoutEngine\x12\r\n\tALGORITHM\x10\x00\x12\x06\n\x02\x41I\x10\x01\x12\x0b\n\x07OUTLINE\x10\x02\x62\x06proto3"
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "LayoutEngine_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_LAYOUTENGINE"]._serialized_start = 22
_globals["_LAYOUTENGINE"]._serialized_end = 72
# @@protoc_insertion_point(module_scope)

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: NodeType.proto
# Protobuf Python Version: 4.25.4
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b"\n\x0eNodeType.proto*\x93\x01\n\x08NodeType\x12\x0c\n\x08\x44OCUMENT\x10\x00\x12\x0b\n\x07SECTION\x10\x01\x12\x11\n\rSUPER_SECTION\x10\x02\x12\x0c\n\x08HEADLINE\x10\x03\x12\r\n\tPARAGRAPH\x10\x04\x12\t\n\x05TABLE\x10\x05\x12\x0e\n\nTABLE_CELL\x10\x06\x12\t\n\x05IMAGE\x10\x07\x12\n\n\x06HEADER\x10\x08\x12\n\n\x06\x46OOTER\x10\tb\x06proto3"
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "NodeType_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_NODETYPE"]._serialized_start = 19
_globals["_NODETYPE"]._serialized_end = 166
# @@protoc_insertion_point(module_scope)

View File

View File

@ -0,0 +1,95 @@
import re
from enum import Enum
from pathlib import Path
from google.protobuf.json_format import MessageToDict
from kn_utils.logging import logger
from pyinfra.proto import (
DocumentPage_pb2,
DocumentPositionData_pb2,
DocumentStructure_pb2,
DocumentTextData_pb2,
)
class ProtoDataLoader:
"""Loads proto data from a file and returns it as a dictionary or list.
The loader is a singleton and should be used as a callable. The file name and byte data are passed as arguments.
The document type is determined based on the file name and the data is returned as a dictionary or list, depending
on the document type.
The DocumentType enum contains all supported document types and their corresponding proto schema.
KEYS_TO_UNPACK contains the keys that should be unpacked from the message dictionary. Keys are unpacked if the
message dictionary contains only one key. This behaviour is necessary since lists are wrapped in a dictionary.
"""
_instance = None
_pattern = None
class DocumentType(Enum):
STRUCTURE = (DocumentStructure_pb2.DocumentStructure, "DocumentStructure")
TEXT = (DocumentTextData_pb2.AllDocumentTextData, "AllDocumentTextData")
PAGES = (DocumentPage_pb2.AllDocumentPages, "AllDocumentPages")
POSITION = (DocumentPositionData_pb2.AllDocumentPositionData, "AllDocumentPositionData")
KEYS_TO_UNPACK = ["documentTextData", "documentPages", "documentPositionData"]
@classmethod
def _build_pattern(cls) -> re.Pattern:
types = "|".join([dt.name for dt in cls.DocumentType])
return re.compile(rf"\..*({types}).*\.proto.*")
def __new__(cls, *args, **kwargs):
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._pattern = cls._build_pattern()
return cls._instance
def __call__(self, file_name: str | Path, data: bytes) -> dict:
return self._load(file_name, data)
def _load(self, file_name: str | Path, data: bytes) -> dict | list:
file_name = str(file_name)
document_type = self._match(file_name)
if not document_type:
logger.error(f"Unknown document type: {file_name}, supported types: {self.DocumentType}")
return {}
logger.debug(f"Loading document type: {document_type}")
schema, _ = self.DocumentType[document_type].value
message = schema()
message.ParseFromString(data)
message_dict = MessageToDict(message, including_default_value_fields=True)
message_dict = convert_int64_fields(message_dict)
return self._unpack(message_dict)
def _match(self, file_name: str) -> str | None:
match = self._pattern.search(file_name)
return match.group(1) if match else None
def _unpack(self, message_dict: dict) -> list | dict:
if len(message_dict) > 1:
return message_dict
for key in self.KEYS_TO_UNPACK:
if key in message_dict:
logger.debug(f"Unpacking key: {key}")
return message_dict[key]
return message_dict
def convert_int64_fields(obj):
# FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python)
if isinstance(obj, dict):
for key, value in obj.items():
obj[key] = convert_int64_fields(value)
elif isinstance(obj, list):
return [convert_int64_fields(item) for item in obj]
elif isinstance(obj, str) and obj.isdigit():
return int(obj)
return obj

View File

@ -6,6 +6,7 @@ from typing import Union
from kn_utils.logging import logger
from pydantic import BaseModel, ValidationError
from pyinfra.storage.proto_data_loader import ProtoDataLoader
from pyinfra.storage.storages.storage import Storage
@ -104,7 +105,14 @@ def _download_single_file(file_path: str, storage: Storage) -> bytes:
data = storage.get_object(file_path)
data = gzip.decompress(data) if ".gz" in file_path else data
data = json.loads(data.decode("utf-8")) if ".json" in file_path else data
if ".json" in file_path:
data = json.loads(data.decode("utf-8"))
elif ".proto" in file_path:
data = ProtoDataLoader()(file_path, data)
else:
pass # identity for other file types
logger.info(f"Downloaded {file_path} from storage.")
return data

View File

@ -34,6 +34,7 @@ opentelemetry-instrumentation-requests = "^0.46b0"
opentelemetry-instrumentation-fastapi = "^0.46b0"
wcwidth = "<=0.2.12"
azure-monitor-opentelemetry = "^1.6.0"
protobuf = "^4.25.3" # FIXME: update to ^5.27.2 after opentelemetry is updated (see README.md/Protobuf)
aio-pika = "^9.4.2"
aiohttp = "^3.9.5"
tenacity = "^8.5.0"
@ -47,6 +48,9 @@ coverage = "^7.3"
requests = "^2.31"
pre-commit = "^3.6.0"
cyclonedx-bom = "^4.1.1"
dvc = "^3.51.2"
dvc-azure = "^3.1.0"
deepdiff = "^7.0.1"
[tool.pytest.ini_options]
minversion = "6.0"

6
tests/data.dvc Normal file
View File

@ -0,0 +1,6 @@
outs:
- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir
size: 23067894
nfiles: 8
hash: md5
path: data

View File

@ -11,10 +11,18 @@ def exporter(settings):
return get_exporter(settings)
class TestOpenTelemetry:
def test_queue_messages_are_traced(self, queue_manager, input_message, stop_message, settings, exporter):
setup_trace(settings, exporter=exporter)
@pytest.fixture(autouse=True)
def setup_test_trace(settings, exporter, tracing_type):
settings.tracing.type = tracing_type
setup_trace(settings, exporter=exporter)
class TestOpenTelemetry:
@pytest.mark.xfail(
reason="Azure Monitor requires a connection string. Therefore the test is allowed to fail in this case."
)
@pytest.mark.parametrize("tracing_type", ["opentelemetry", "azure_monitor"])
def test_queue_messages_are_traced(self, queue_manager, input_message, stop_message, settings, exporter):
instrument_pika()
queue_manager.purge_queues()
@ -31,21 +39,3 @@ class TestOpenTelemetry:
assert (
exported_trace["resource"]["attributes"]["service.name"] == settings.tracing.opentelemetry.service_name
)
# def test_webserver_requests_are_traced(self, settings):
# settings.tracing.opentelemetry.exporter = "console"
# settings.tracing.enabled = True
#
# app = FastAPI()
#
# @app.get("/test")
# def test():
# return {"test": "test"}
#
# thread = create_webserver_thread_from_settings(app, settings)
# thread.start()
# sleep(1)
#
# requests.get(f"http://{settings.webserver.host}:{settings.webserver.port}/test")
#
# thread.join(timeout=1)

View File

@ -0,0 +1,80 @@
import gzip
import json
from pathlib import Path
import pytest
from deepdiff import DeepDiff
from pyinfra.storage.proto_data_loader import ProtoDataLoader
@pytest.fixture
def test_data_dir():
return Path(__file__).parents[1] / "data"
@pytest.fixture
def document_data(request, test_data_dir) -> (str, bytes, dict | list):
doc_type = request.param
input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz"
target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz"
input_data = input_file_path.read_bytes()
target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))
return input_file_path, input_data, target_data
@pytest.fixture
def proto_data_loader():
return ProtoDataLoader()
@pytest.fixture
def should_match():
return [
"a.DOCUMENT_STRUCTURE.proto.gz",
"a.DOCUMENT_TEXT.proto.gz",
"a.DOCUMENT_PAGES.proto.gz",
"a.DOCUMENT_POSITION.proto.gz",
"b.DOCUMENT_STRUCTURE.proto",
"b.DOCUMENT_TEXT.proto",
"b.DOCUMENT_PAGES.proto",
"b.DOCUMENT_POSITION.proto",
"c.STRUCTURE.proto.gz",
"c.TEXT.proto.gz",
"c.PAGES.proto.gz",
"c.POSITION.proto.gz",
]
@pytest.mark.xfail(
reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
)
# As DOCUMENT_POSITION is a very large file, the test takes forever. If you want to test it, add "DOCUMENT_POSITION" to the list below.
@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES"], indirect=True)
def test_proto_data_loader_end2end(document_data, proto_data_loader):
file_path, data, target = document_data
data = gzip.decompress(data)
loaded_data = proto_data_loader(file_path, data)
loaded_data_str = json.dumps(loaded_data, sort_keys=True)
target_str = json.dumps(target, sort_keys=True)
diff = DeepDiff(sorted(loaded_data_str), sorted(target_str), ignore_order=True)
# FIXME: remove this block when the test is stable
# if diff:
# with open("/tmp/diff.json", "w") as f:
# f.write(diff.to_json(indent=2))
assert not diff
def test_proto_data_loader_unknown_document_type(proto_data_loader):
assert not proto_data_loader("unknown_document_type.proto", b"")
def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
for file_name in should_match:
assert proto_data_loader._match(file_name) is not None