diff --git a/README.md b/README.md index b510837..5dcbd10 100755 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ 4. [ Module Installation ](#module-installation) 5. [ Scripts ](#scripts) 6. [ Tests ](#tests) -7. [ Protobuf ](#protobuf) +7. [ OpenTelemetry Protobuf Dependency Hell ](#opentelemetry-protobuf-dependency-hell) ## About @@ -213,48 +213,8 @@ $ python scripts/send_request.py Tests require a running minio and rabbitmq container, meaning you have to run `docker compose up` in the tests folder before running the tests. -## Protobuf +## OpenTelemetry Protobuf Dependency Hell -### Opentelemetry Compatibility Issue - -**Note**: Status: 31/07/2024, the currently used `opentelemetry-exporter-otlp-proto-http` version `1.25.0` requires -a `protobuf` version < `5.x.x` and is not compatible with the latest protobuf version `5.27.x`. This is an [open issue](https://github.com/open-telemetry/opentelemetry-python/issues/3958) in opentelemetry, because [support for 4.25.x ends in Q2 '25](https://protobuf.dev/support/version-support/#python). Therefore, we should keep this in mind and update the dependency once opentelemetry includes support for `protobuf 5.27.x`. - - -### Install Protobuf Compiler - -**Linux** - -1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protobuf-25.4.zip` -2. Extract the files under `$HOME/.local` or another directory of your choice - ```bash - unzip protoc--linux-x86_64.zip -d $HOME/.local - ``` -3. Ensure that the `bin` directory is in your `PATH` by adding the following line to your `.bashrc` or `.zshrc`: - ```bash - export PATH="$PATH:$HOME/.local/bin" - ``` - -**MacOS** - -1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protoc-25.4-osx-universal_binary.zip` -2. 
Extract the files to a directory of your choice -3. Copy the executable bin `protoc` to `/usr/local/bin` - ```bash - sudo cp /Users/you/location-of-unzipped-dir/bin/protoc /usr/local/bin/ - ``` -4. Open `protoc` in `/usr/local/bin/` via Finder to make it executable, now it should be also on your `PATH` - -### Compile Protobuf Files - -1. Ensure that the protobuf compiler is installed on your system. You can check this by running: - ```bash - protoc --version - ``` -2. Compile proto files: - ```bash - protoc --proto_path=./config/proto --python_out=./pyinfra/proto ./config/proto/*.proto - ``` -3. Manually adjust import statements in the generated files to match the package structure, e.g.: - `import EntryData_pb2 as EntryData__pb2` -> `import pyinfra.proto.EntryData_pb2 as EntryData__pb2`. - This does not work automatically because the generated files are not in the same directory as the proto files. +**Note**: Status 2025/01/09: the currently used `opentelemetry-exporter-otlp-proto-http` version `1.25.0` requires +a `protobuf` version < `5.x.x` and is not compatible with the latest protobuf version `5.27.x`. This is an [open issue](https://github.com/open-telemetry/opentelemetry-python/issues/3958) in opentelemetry, because [support for 4.25.x ends in Q2 '25](https://protobuf.dev/support/version-support/#python). +Therefore, we should keep this in mind and update the dependency once opentelemetry includes support for `protobuf 5.27.x`. diff --git a/config/proto/DocumentPage.proto b/config/proto/DocumentPage.proto deleted file mode 100644 index a3d518b..0000000 --- a/config/proto/DocumentPage.proto +++ /dev/null @@ -1,21 +0,0 @@ -syntax = "proto3"; - -message AllDocumentPages { - - repeated DocumentPage documentPages = 1; -} - -message DocumentPage { - // The page number, starting with 1. - int32 number = 1; - - // The page height in PDF user units. - int32 height = 2; - - // The page width in PDF user units. 
- int32 width = 3; - - // The page rotation as specified by the PDF. - int32 rotation = 4; -} - diff --git a/config/proto/DocumentPositionData.proto b/config/proto/DocumentPositionData.proto deleted file mode 100644 index 5353924..0000000 --- a/config/proto/DocumentPositionData.proto +++ /dev/null @@ -1,25 +0,0 @@ -syntax = "proto3"; - -message AllDocumentPositionData { - - repeated DocumentPositionData documentPositionData = 1; -} - -message DocumentPositionData { - // Identifier of the text block. - int64 id = 1; - - // For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate. - // This is required due to the text and position coordinates not being equal. - repeated int32 stringIdxToPositionIdx = 2; - - // The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block. - // The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner. - // In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates. - repeated Position positions = 3; - - // Definition of a BoundingBox that contains x, y, width, and height. - message Position { - repeated float value = 1; - } -} diff --git a/config/proto/DocumentStructure.proto b/config/proto/DocumentStructure.proto deleted file mode 100644 index fc7aea6..0000000 --- a/config/proto/DocumentStructure.proto +++ /dev/null @@ -1,8 +0,0 @@ -syntax = "proto3"; - -import "EntryData.proto"; - -message DocumentStructure { - // The root EntryData represents the Document. 
- EntryData root = 1; -} diff --git a/config/proto/DocumentTextData.proto b/config/proto/DocumentTextData.proto deleted file mode 100644 index 9f187ce..0000000 --- a/config/proto/DocumentTextData.proto +++ /dev/null @@ -1,29 +0,0 @@ -syntax = "proto3"; - -message AllDocumentTextData { - - repeated DocumentTextData documentTextData = 1; -} - -message DocumentTextData { - // Identifier of the text block. - int64 id = 1; - - // The page the text block occurs on. - int64 page = 2; - - // The text of the text block. - string searchText = 3; - - // Each text block is assigned a number on a page, starting from 0. - int32 numberOnPage = 4; - - // The text blocks are ordered, this number represents the start of the text block as a string offset. - int32 start = 5; - - // The text blocks are ordered, this number represents the end of the text block as a string offset. - int32 end = 6; - - // The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak. - repeated int32 lineBreaks = 7; -} diff --git a/config/proto/EntryData.proto b/config/proto/EntryData.proto deleted file mode 100644 index 09e7851..0000000 --- a/config/proto/EntryData.proto +++ /dev/null @@ -1,27 +0,0 @@ -syntax = "proto3"; - -import "LayoutEngine.proto"; -import "NodeType.proto"; - -message EntryData { - // Type of the semantic node. - NodeType type = 1; - - // Specifies the position in the parsed tree structure. - repeated int32 treeId = 2; - - // Specifies the text block IDs associated with this semantic node. - repeated int64 atomicBlockIds = 3; - - // Specifies the pages this semantic node appears on. - repeated int64 pageNumbers = 4; - - // Some semantic nodes have additional information, this information is stored in this Map. - map properties = 5; - - // All child Entries of this Entry. - repeated EntryData children = 6; - - // Describes the origin of the semantic node. 
- repeated LayoutEngine engines = 7; -} diff --git a/config/proto/LayoutEngine.proto b/config/proto/LayoutEngine.proto deleted file mode 100644 index 584da56..0000000 --- a/config/proto/LayoutEngine.proto +++ /dev/null @@ -1,7 +0,0 @@ -syntax = "proto3"; - -enum LayoutEngine { - ALGORITHM = 0; - AI = 1; - OUTLINE = 2; -} diff --git a/config/proto/NodeType.proto b/config/proto/NodeType.proto deleted file mode 100644 index 6cf9172..0000000 --- a/config/proto/NodeType.proto +++ /dev/null @@ -1,14 +0,0 @@ -syntax = "proto3"; - -enum NodeType { - DOCUMENT = 0; - SECTION = 1; - SUPER_SECTION = 2; - HEADLINE = 3; - PARAGRAPH = 4; - TABLE = 5; - TABLE_CELL = 6; - IMAGE = 7; - HEADER = 8; - FOOTER = 9; -} diff --git a/pyinfra/proto/DocumentPage_pb2.py b/pyinfra/proto/DocumentPage_pb2.py deleted file mode 100644 index 54cfaee..0000000 --- a/pyinfra/proto/DocumentPage_pb2.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: DocumentPage.proto -# Protobuf Python Version: 4.25.5 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x12\x44ocumentPage.proto"8\n\x10\x41llDocumentPages\x12$\n\rdocumentPages\x18\x01 \x03(\x0b\x32\r.DocumentPage"O\n\x0c\x44ocumentPage\x12\x0e\n\x06number\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x10\n\x08rotation\x18\x04 \x01(\x05\x62\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentPage_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == 
False: - DESCRIPTOR._options = None - _globals["_ALLDOCUMENTPAGES"]._serialized_start = 22 - _globals["_ALLDOCUMENTPAGES"]._serialized_end = 78 - _globals["_DOCUMENTPAGE"]._serialized_start = 80 - _globals["_DOCUMENTPAGE"]._serialized_end = 159 -# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/DocumentPositionData_pb2.py b/pyinfra/proto/DocumentPositionData_pb2.py deleted file mode 100644 index e45a857..0000000 --- a/pyinfra/proto/DocumentPositionData_pb2.py +++ /dev/null @@ -1,31 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: DocumentPositionData.proto -# Protobuf Python Version: 4.25.5 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1a\x44ocumentPositionData.proto"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData"\x90\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a\x19\n\x08Position\x12\r\n\x05value\x18\x01 \x03(\x02\x62\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentPositionData_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_start = 30 - _globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_end = 108 - _globals["_DOCUMENTPOSITIONDATA"]._serialized_start = 111 - 
_globals["_DOCUMENTPOSITIONDATA"]._serialized_end = 255 - _globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_start = 230 - _globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_end = 255 -# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/DocumentStructure_pb2.py b/pyinfra/proto/DocumentStructure_pb2.py deleted file mode 100644 index 2d2245d..0000000 --- a/pyinfra/proto/DocumentStructure_pb2.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: DocumentStructure.proto -# Protobuf Python Version: 4.25.5 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -import pyinfra.proto.EntryData_pb2 as EntryData__pb2 - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x17\x44ocumentStructure.proto\x1a\x0f\x45ntryData.proto"-\n\x11\x44ocumentStructure\x12\x18\n\x04root\x18\x01 \x01(\x0b\x32\n.EntryDatab\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentStructure_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_DOCUMENTSTRUCTURE"]._serialized_start = 44 - _globals["_DOCUMENTSTRUCTURE"]._serialized_end = 89 -# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/DocumentTextData_pb2.py b/pyinfra/proto/DocumentTextData_pb2.py deleted file mode 100644 index 9b685b9..0000000 --- a/pyinfra/proto/DocumentTextData_pb2.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: DocumentTextData.proto -# Protobuf Python Version: 4.25.5 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x16\x44ocumentTextData.proto"B\n\x13\x41llDocumentTextData\x12+\n\x10\x64ocumentTextData\x18\x01 \x03(\x0b\x32\x11.DocumentTextData"\x86\x01\n\x10\x44ocumentTextData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x0c\n\x04page\x18\x02 \x01(\x03\x12\x12\n\nsearchText\x18\x03 \x01(\t\x12\x14\n\x0cnumberOnPage\x18\x04 \x01(\x05\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\x12\x12\n\nlineBreaks\x18\x07 \x03(\x05\x62\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentTextData_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_ALLDOCUMENTTEXTDATA"]._serialized_start = 26 - _globals["_ALLDOCUMENTTEXTDATA"]._serialized_end = 92 - _globals["_DOCUMENTTEXTDATA"]._serialized_start = 95 - _globals["_DOCUMENTTEXTDATA"]._serialized_end = 229 -# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/EntryData_pb2.py b/pyinfra/proto/EntryData_pb2.py deleted file mode 100644 index 2e70d23..0000000 --- a/pyinfra/proto/EntryData_pb2.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: EntryData.proto -# Protobuf Python Version: 4.25.5 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -import pyinfra.proto.LayoutEngine_pb2 as LayoutEngine__pb2 -import pyinfra.proto.NodeType_pb2 as NodeType__pb2 - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x0f\x45ntryData.proto\x1a\x12LayoutEngine.proto\x1a\x0eNodeType.proto"\x82\x02\n\tEntryData\x12\x17\n\x04type\x18\x01 \x01(\x0e\x32\t.NodeType\x12\x0e\n\x06treeId\x18\x02 \x03(\x05\x12\x16\n\x0e\x61tomicBlockIds\x18\x03 \x03(\x03\x12\x13\n\x0bpageNumbers\x18\x04 \x03(\x03\x12.\n\nproperties\x18\x05 \x03(\x0b\x32\x1a.EntryData.PropertiesEntry\x12\x1c\n\x08\x63hildren\x18\x06 \x03(\x0b\x32\n.EntryData\x12\x1e\n\x07\x65ngines\x18\x07 \x03(\x0e\x32\r.LayoutEngine\x1a\x31\n\x0fPropertiesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3' -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "EntryData_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_ENTRYDATA_PROPERTIESENTRY"]._options = None - _globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_options = b"8\001" - _globals["_ENTRYDATA"]._serialized_start = 56 - _globals["_ENTRYDATA"]._serialized_end = 314 - _globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_start = 265 - _globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_end = 314 -# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/LayoutEngine_pb2.py b/pyinfra/proto/LayoutEngine_pb2.py deleted file mode 100644 index b00344d..0000000 --- 
a/pyinfra/proto/LayoutEngine_pb2.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: LayoutEngine.proto -# Protobuf Python Version: 4.25.5 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b"\n\x12LayoutEngine.proto*2\n\x0cLayoutEngine\x12\r\n\tALGORITHM\x10\x00\x12\x06\n\x02\x41I\x10\x01\x12\x0b\n\x07OUTLINE\x10\x02\x62\x06proto3" -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "LayoutEngine_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_LAYOUTENGINE"]._serialized_start = 22 - _globals["_LAYOUTENGINE"]._serialized_end = 72 -# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/NodeType_pb2.py b/pyinfra/proto/NodeType_pb2.py deleted file mode 100644 index fa7d58b..0000000 --- a/pyinfra/proto/NodeType_pb2.py +++ /dev/null @@ -1,27 +0,0 @@ -# -*- coding: utf-8 -*- -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: NodeType.proto -# Protobuf Python Version: 4.25.5 -"""Generated protocol buffer code.""" -from google.protobuf import descriptor as _descriptor -from google.protobuf import descriptor_pool as _descriptor_pool -from google.protobuf import symbol_database as _symbol_database -from google.protobuf.internal import builder as _builder - -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b"\n\x0eNodeType.proto*\x93\x01\n\x08NodeType\x12\x0c\n\x08\x44OCUMENT\x10\x00\x12\x0b\n\x07SECTION\x10\x01\x12\x11\n\rSUPER_SECTION\x10\x02\x12\x0c\n\x08HEADLINE\x10\x03\x12\r\n\tPARAGRAPH\x10\x04\x12\t\n\x05TABLE\x10\x05\x12\x0e\n\nTABLE_CELL\x10\x06\x12\t\n\x05IMAGE\x10\x07\x12\n\n\x06HEADER\x10\x08\x12\n\n\x06\x46OOTER\x10\tb\x06proto3" -) - -_globals = globals() -_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) -_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "NodeType_pb2", _globals) -if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _globals["_NODETYPE"]._serialized_start = 19 - _globals["_NODETYPE"]._serialized_end = 166 -# @@protoc_insertion_point(module_scope) diff --git a/pyinfra/proto/__init__.py b/pyinfra/proto/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/pyinfra/storage/proto_data_loader.py b/pyinfra/storage/proto_data_loader.py deleted file mode 100644 index 2e9cc58..0000000 --- a/pyinfra/storage/proto_data_loader.py +++ /dev/null @@ -1,127 +0,0 @@ -import re -from enum import Enum -from pathlib import Path - -from google.protobuf.json_format import MessageToDict -from kn_utils.logging import logger - -from pyinfra.proto import ( - DocumentPage_pb2, - DocumentPositionData_pb2, - DocumentStructure_pb2, - DocumentTextData_pb2, -) - - -class ProtoDataLoader: - """Loads proto data from a file and returns it as a dictionary or list. - - The loader is a singleton and should be used as a callable. 
The file name and byte data are passed as arguments. - - The document type is determined based on the file name and the data is returned as a dictionary or list, depending - on the document type. - The DocumentType enum contains all supported document types and their corresponding proto schema. - KEYS_TO_UNPACK contains the keys that should be unpacked from the message dictionary. Keys are unpacked if the - message dictionary contains only one key. This behaviour is necessary since lists are wrapped in a dictionary. - """ - - _instance = None - _pattern = None - - class DocumentType(Enum): - STRUCTURE = (DocumentStructure_pb2.DocumentStructure, "DocumentStructure") - TEXT = (DocumentTextData_pb2.AllDocumentTextData, "AllDocumentTextData") - PAGES = (DocumentPage_pb2.AllDocumentPages, "AllDocumentPages") - POSITION = (DocumentPositionData_pb2.AllDocumentPositionData, "AllDocumentPositionData") - - KEYS_TO_UNPACK = ["documentTextData", "documentPages", "documentPositionData"] - - @classmethod - def _build_pattern(cls) -> re.Pattern: - types = "|".join([dt.name for dt in cls.DocumentType]) - return re.compile(rf"\..*({types}).*\.proto.*") - - def __new__(cls, *args, **kwargs): - if cls._instance is None: - cls._instance = super().__new__(cls) - cls._pattern = cls._build_pattern() - return cls._instance - - def __call__(self, file_name: str | Path, data: bytes) -> dict: - return self._load(file_name, data) - - def _load(self, file_name: str | Path, data: bytes) -> dict | list: - file_name = str(file_name) - document_type = self._match(file_name) - - if not document_type: - logger.error(f"Unknown document type: {file_name}, supported types: {self.DocumentType}") - return {} - - logger.debug(f"Loading document type: {document_type}") - schema, _ = self.DocumentType[document_type].value - message = schema() - message.ParseFromString(data) - message_dict = MessageToDict(message, including_default_value_fields=True) - message_dict = convert_int64_fields(message_dict) - if 
document_type == "POSITION": - message_dict = transform_positions_to_list(message_dict) - - return self._unpack(message_dict) - - def _match(self, file_name: str) -> str | None: - match = self._pattern.search(file_name) - return match.group(1) if match else None - - def _unpack(self, message_dict: dict) -> list | dict: - if len(message_dict) > 1: - return message_dict - - for key in self.KEYS_TO_UNPACK: - if key in message_dict: - logger.debug(f"Unpacking key: {key}") - return message_dict[key] - - return message_dict - - -def convert_int64_fields(obj): - # FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python) - - # we skip the following keys because the values are expected to be of type str - skip_keys = ["col", "row", "numberOfCols", "numberOfRows"] - if isinstance(obj, dict): - for key, value in obj.items(): - if key in skip_keys: - continue - obj[key] = convert_int64_fields(value) - elif isinstance(obj, list): - return [convert_int64_fields(item) for item in obj] - elif isinstance(obj, str) and obj.isdigit(): - return int(obj) - return obj - - -def transform_positions_to_list(obj: dict | list) -> dict: - """Transforms the repeated fields 'positions' to a lists of lists of floats - as expected by DocumentReader. - - Args: - obj (dict | list): Proto message dict - - Returns: - dict: Proto message dict - """ - if isinstance(obj, dict): - # Check if 'positions' is in the dictionary and reshape it as list of lists of floats - if "positions" in obj and isinstance(obj["positions"], list): - obj["positions"] = [pos["value"] for pos in obj["positions"] if isinstance(pos, dict) and "value" in pos] - - # Recursively apply to all nested dictionaries - for key, value in obj.items(): - obj[key] = transform_positions_to_list(value) - elif isinstance(obj, list): - # Recursively apply to all items in the list - obj = [transform_positions_to_list(item) for item in obj] - - return obj