Merge branch 'feat/RES-757-protobuffer' into 'master'
feat: add protobuffer See merge request knecon/research/pyinfra!87
This commit is contained in:
commit
789f6a7f7c
2
.dvc/.gitignore
vendored
Normal file
2
.dvc/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
/config.local
|
||||
/cache
|
||||
5
.dvc/config
Normal file
5
.dvc/config
Normal file
@ -0,0 +1,5 @@
|
||||
[core]
|
||||
remote = azure
|
||||
['remote "azure"']
|
||||
url = azure://pyinfra-dvc
|
||||
connection_string =
|
||||
3
.dvcignore
Normal file
3
.dvcignore
Normal file
@ -0,0 +1,3 @@
|
||||
# Add patterns of files dvc should ignore, which could improve
|
||||
# the performance. Learn more at
|
||||
# https://dvc.org/doc/user-guide/dvcignore
|
||||
47
README.md
47
README.md
@ -6,6 +6,7 @@
|
||||
4. [ Module Installation ](#module-installation)
|
||||
5. [ Scripts ](#scripts)
|
||||
6. [ Tests ](#tests)
|
||||
7. [ Protobuf ](#protobuf)
|
||||
|
||||
## About
|
||||
|
||||
@ -200,3 +201,49 @@ $ python scripts/send_request.py
|
||||
|
||||
Tests require a running minio and rabbitmq container, meaning you have to run `docker compose up` in the tests folder
|
||||
before running the tests.
|
||||
|
||||
## Protobuf
|
||||
|
||||
### Opentelemetry Compatibility Issue
|
||||
|
||||
**Note**: Status: 31/07/2024, the currently used `opentelemetry-exporter-otlp-proto-http` version `1.25.0` requires
|
||||
a `protobuf` version < `5.x.x` and is not compatible with the latest protobuf version `5.27.x`. This is an [open issue](https://github.com/open-telemetry/opentelemetry-python/issues/3958) in opentelemetry, because [support for 4.25.x ends in Q2 '25](https://protobuf.dev/support/version-support/#python). Therefore, we should keep this in mind and update the dependency once opentelemetry includes support for `protobuf 5.27.x`.
|
||||
|
||||
|
||||
### Install Protobuf Compiler
|
||||
|
||||
**Linux**
|
||||
|
||||
1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protobuf-25.4.zip`
|
||||
2. Extract the files under `$HOME/.local` or another directory of your choice
|
||||
```bash
|
||||
unzip protoc-<version>-linux-x86_64.zip -d $HOME/.local
|
||||
```
|
||||
3. Ensure that the `bin` directory is in your `PATH` by adding the following line to your `.bashrc` or `.zshrc`:
|
||||
```bash
|
||||
export PATH="$PATH:$HOME/.local/bin"
|
||||
```
|
||||
|
||||
**MacOS**
|
||||
|
||||
1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protoc-25.4-osx-universal_binary.zip`
|
||||
2. Extract the files to a directory of your choice
|
||||
3. Copy the executable bin `protoc` to `/usr/local/bin`
|
||||
```bash
|
||||
sudo cp /Users/you/location-of-unzipped-dir/bin/protoc /usr/local/bin/
|
||||
```
|
||||
4. Open `protoc` in `/usr/local/bin/` via Finder to make it executable, now it should be also on your `PATH`
|
||||
|
||||
### Compile Protobuf Files
|
||||
|
||||
1. Ensure that the protobuf compiler is installed on your system. You can check this by running:
|
||||
```bash
|
||||
protoc --version
|
||||
```
|
||||
2. Compile proto files:
|
||||
```bash
|
||||
protoc --proto_path=./config/proto --python_out=./pyinfra/proto ./config/proto/*.proto
|
||||
```
|
||||
3. Manually adjust import statements in the generated files to match the package structure, e.g.:
|
||||
`import EntryData_pb2 as EntryData__pb2` -> `import pyinfra.proto.EntryData_pb2 as EntryData__pb2`.
|
||||
This does not work automatically because the generated files are not in the same directory as the proto files.
|
||||
|
||||
21
config/proto/DocumentPage.proto
Normal file
21
config/proto/DocumentPage.proto
Normal file
@ -0,0 +1,21 @@
|
||||
syntax = "proto3";
|
||||
|
||||
message AllDocumentPages {
|
||||
|
||||
repeated DocumentPage documentPages = 1;
|
||||
}
|
||||
|
||||
message DocumentPage {
|
||||
// The page number, starting with 1.
|
||||
int32 number = 1;
|
||||
|
||||
// The page height in PDF user units.
|
||||
int32 height = 2;
|
||||
|
||||
// The page width in PDF user units.
|
||||
int32 width = 3;
|
||||
|
||||
// The page rotation as specified by the PDF.
|
||||
int32 rotation = 4;
|
||||
}
|
||||
|
||||
28
config/proto/DocumentPositionData.proto
Normal file
28
config/proto/DocumentPositionData.proto
Normal file
@ -0,0 +1,28 @@
|
||||
syntax = "proto3";
|
||||
|
||||
message AllDocumentPositionData {
|
||||
|
||||
repeated DocumentPositionData documentPositionData = 1;
|
||||
}
|
||||
|
||||
message DocumentPositionData {
|
||||
// Identifier of the text block.
|
||||
int64 id = 1;
|
||||
|
||||
// For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate.
|
||||
// This is required due to the text and position coordinates not being equal.
|
||||
repeated int32 stringIdxToPositionIdx = 2;
|
||||
|
||||
// The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block.
|
||||
// The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner.
|
||||
// In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.
|
||||
repeated Position positions = 3;
|
||||
|
||||
// Definition of a BoundingBox that contains x, y, width, and height.
|
||||
message Position {
|
||||
float x = 1;
|
||||
float y = 2;
|
||||
float width = 3;
|
||||
float height = 4;
|
||||
}
|
||||
}
|
||||
8
config/proto/DocumentStructure.proto
Normal file
8
config/proto/DocumentStructure.proto
Normal file
@ -0,0 +1,8 @@
|
||||
syntax = "proto3";
|
||||
|
||||
import "EntryData.proto";
|
||||
|
||||
message DocumentStructure {
|
||||
// The root EntryData represents the Document.
|
||||
EntryData root = 1;
|
||||
}
|
||||
29
config/proto/DocumentTextData.proto
Normal file
29
config/proto/DocumentTextData.proto
Normal file
@ -0,0 +1,29 @@
|
||||
syntax = "proto3";
|
||||
|
||||
message AllDocumentTextData {
|
||||
|
||||
repeated DocumentTextData documentTextData = 1;
|
||||
}
|
||||
|
||||
message DocumentTextData {
|
||||
// Identifier of the text block.
|
||||
int64 id = 1;
|
||||
|
||||
// The page the text block occurs on.
|
||||
int64 page = 2;
|
||||
|
||||
// The text of the text block.
|
||||
string searchText = 3;
|
||||
|
||||
// Each text block is assigned a number on a page, starting from 0.
|
||||
int32 numberOnPage = 4;
|
||||
|
||||
// The text blocks are ordered, this number represents the start of the text block as a string offset.
|
||||
int32 start = 5;
|
||||
|
||||
// The text blocks are ordered, this number represents the end of the text block as a string offset.
|
||||
int32 end = 6;
|
||||
|
||||
// The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.
|
||||
repeated int32 lineBreaks = 7;
|
||||
}
|
||||
27
config/proto/EntryData.proto
Normal file
27
config/proto/EntryData.proto
Normal file
@ -0,0 +1,27 @@
|
||||
syntax = "proto3";
|
||||
|
||||
import "LayoutEngine.proto";
|
||||
import "NodeType.proto";
|
||||
|
||||
message EntryData {
|
||||
// Type of the semantic node.
|
||||
NodeType type = 1;
|
||||
|
||||
// Specifies the position in the parsed tree structure.
|
||||
repeated int32 treeId = 2;
|
||||
|
||||
// Specifies the text block IDs associated with this semantic node.
|
||||
repeated int64 atomicBlockIds = 3;
|
||||
|
||||
// Specifies the pages this semantic node appears on.
|
||||
repeated int64 pageNumbers = 4;
|
||||
|
||||
// Some semantic nodes have additional information, this information is stored in this Map.
|
||||
map<string, string> properties = 5;
|
||||
|
||||
// All child Entries of this Entry.
|
||||
repeated EntryData children = 6;
|
||||
|
||||
// Describes the origin of the semantic node.
|
||||
repeated LayoutEngine engines = 7;
|
||||
}
|
||||
7
config/proto/LayoutEngine.proto
Normal file
7
config/proto/LayoutEngine.proto
Normal file
@ -0,0 +1,7 @@
|
||||
syntax = "proto3";
|
||||
|
||||
enum LayoutEngine {
|
||||
ALGORITHM = 0;
|
||||
AI = 1;
|
||||
OUTLINE = 2;
|
||||
}
|
||||
14
config/proto/NodeType.proto
Normal file
14
config/proto/NodeType.proto
Normal file
@ -0,0 +1,14 @@
|
||||
syntax = "proto3";
|
||||
|
||||
enum NodeType {
|
||||
DOCUMENT = 0;
|
||||
SECTION = 1;
|
||||
SUPER_SECTION = 2;
|
||||
HEADLINE = 3;
|
||||
PARAGRAPH = 4;
|
||||
TABLE = 5;
|
||||
TABLE_CELL = 6;
|
||||
IMAGE = 7;
|
||||
HEADER = 8;
|
||||
FOOTER = 9;
|
||||
}
|
||||
1944
poetry.lock
generated
1944
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
29
pyinfra/proto/DocumentPage_pb2.py
Normal file
29
pyinfra/proto/DocumentPage_pb2.py
Normal file
@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: DocumentPage.proto
|
||||
# Protobuf Python Version: 4.25.4
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
||||
b'\n\x12\x44ocumentPage.proto"8\n\x10\x41llDocumentPages\x12$\n\rdocumentPages\x18\x01 \x03(\x0b\x32\r.DocumentPage"O\n\x0c\x44ocumentPage\x12\x0e\n\x06number\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x10\n\x08rotation\x18\x04 \x01(\x05\x62\x06proto3'
|
||||
)
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentPage_pb2", _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals["_ALLDOCUMENTPAGES"]._serialized_start = 22
|
||||
_globals["_ALLDOCUMENTPAGES"]._serialized_end = 78
|
||||
_globals["_DOCUMENTPAGE"]._serialized_start = 80
|
||||
_globals["_DOCUMENTPAGE"]._serialized_end = 159
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
31
pyinfra/proto/DocumentPositionData_pb2.py
Normal file
31
pyinfra/proto/DocumentPositionData_pb2.py
Normal file
@ -0,0 +1,31 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: DocumentPositionData.proto
|
||||
# Protobuf Python Version: 4.25.4
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
||||
b'\n\x1a\x44ocumentPositionData.proto"N\n\x17\x41llDocumentPositionData\x12\x33\n\x14\x64ocumentPositionData\x18\x01 \x03(\x0b\x32\x15.DocumentPositionData"\xb6\x01\n\x14\x44ocumentPositionData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x1e\n\x16stringIdxToPositionIdx\x18\x02 \x03(\x05\x12\x31\n\tpositions\x18\x03 \x03(\x0b\x32\x1e.DocumentPositionData.Position\x1a?\n\x08Position\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x12\r\n\x05width\x18\x03 \x01(\x02\x12\x0e\n\x06height\x18\x04 \x01(\x02\x62\x06proto3'
|
||||
)
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentPositionData_pb2", _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_start = 30
|
||||
_globals["_ALLDOCUMENTPOSITIONDATA"]._serialized_end = 108
|
||||
_globals["_DOCUMENTPOSITIONDATA"]._serialized_start = 111
|
||||
_globals["_DOCUMENTPOSITIONDATA"]._serialized_end = 293
|
||||
_globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_start = 230
|
||||
_globals["_DOCUMENTPOSITIONDATA_POSITION"]._serialized_end = 293
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
29
pyinfra/proto/DocumentStructure_pb2.py
Normal file
29
pyinfra/proto/DocumentStructure_pb2.py
Normal file
@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: DocumentStructure.proto
|
||||
# Protobuf Python Version: 4.25.4
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
import pyinfra.proto.EntryData_pb2 as EntryData__pb2
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
||||
b'\n\x17\x44ocumentStructure.proto\x1a\x0f\x45ntryData.proto"-\n\x11\x44ocumentStructure\x12\x18\n\x04root\x18\x01 \x01(\x0b\x32\n.EntryDatab\x06proto3'
|
||||
)
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentStructure_pb2", _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals["_DOCUMENTSTRUCTURE"]._serialized_start = 44
|
||||
_globals["_DOCUMENTSTRUCTURE"]._serialized_end = 89
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
29
pyinfra/proto/DocumentTextData_pb2.py
Normal file
29
pyinfra/proto/DocumentTextData_pb2.py
Normal file
@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: DocumentTextData.proto
|
||||
# Protobuf Python Version: 4.25.4
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
||||
b'\n\x16\x44ocumentTextData.proto"B\n\x13\x41llDocumentTextData\x12+\n\x10\x64ocumentTextData\x18\x01 \x03(\x0b\x32\x11.DocumentTextData"\x86\x01\n\x10\x44ocumentTextData\x12\n\n\x02id\x18\x01 \x01(\x03\x12\x0c\n\x04page\x18\x02 \x01(\x03\x12\x12\n\nsearchText\x18\x03 \x01(\t\x12\x14\n\x0cnumberOnPage\x18\x04 \x01(\x05\x12\r\n\x05start\x18\x05 \x01(\x05\x12\x0b\n\x03\x65nd\x18\x06 \x01(\x05\x12\x12\n\nlineBreaks\x18\x07 \x03(\x05\x62\x06proto3'
|
||||
)
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentTextData_pb2", _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals["_ALLDOCUMENTTEXTDATA"]._serialized_start = 26
|
||||
_globals["_ALLDOCUMENTTEXTDATA"]._serialized_end = 92
|
||||
_globals["_DOCUMENTTEXTDATA"]._serialized_start = 95
|
||||
_globals["_DOCUMENTTEXTDATA"]._serialized_end = 229
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
34
pyinfra/proto/EntryData_pb2.py
Normal file
34
pyinfra/proto/EntryData_pb2.py
Normal file
@ -0,0 +1,34 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: EntryData.proto
|
||||
# Protobuf Python Version: 4.25.4
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
import pyinfra.proto.LayoutEngine_pb2 as LayoutEngine__pb2
|
||||
import pyinfra.proto.NodeType_pb2 as NodeType__pb2
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
||||
b'\n\x0f\x45ntryData.proto\x1a\x12LayoutEngine.proto\x1a\x0eNodeType.proto"\x82\x02\n\tEntryData\x12\x17\n\x04type\x18\x01 \x01(\x0e\x32\t.NodeType\x12\x0e\n\x06treeId\x18\x02 \x03(\x05\x12\x16\n\x0e\x61tomicBlockIds\x18\x03 \x03(\x03\x12\x13\n\x0bpageNumbers\x18\x04 \x03(\x03\x12.\n\nproperties\x18\x05 \x03(\x0b\x32\x1a.EntryData.PropertiesEntry\x12\x1c\n\x08\x63hildren\x18\x06 \x03(\x0b\x32\n.EntryData\x12\x1e\n\x07\x65ngines\x18\x07 \x03(\x0e\x32\r.LayoutEngine\x1a\x31\n\x0fPropertiesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3'
|
||||
)
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "EntryData_pb2", _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals["_ENTRYDATA_PROPERTIESENTRY"]._options = None
|
||||
_globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_options = b"8\001"
|
||||
_globals["_ENTRYDATA"]._serialized_start = 56
|
||||
_globals["_ENTRYDATA"]._serialized_end = 314
|
||||
_globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_start = 265
|
||||
_globals["_ENTRYDATA_PROPERTIESENTRY"]._serialized_end = 314
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
27
pyinfra/proto/LayoutEngine_pb2.py
Normal file
27
pyinfra/proto/LayoutEngine_pb2.py
Normal file
@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: LayoutEngine.proto
|
||||
# Protobuf Python Version: 4.25.4
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
||||
b"\n\x12LayoutEngine.proto*2\n\x0cLayoutEngine\x12\r\n\tALGORITHM\x10\x00\x12\x06\n\x02\x41I\x10\x01\x12\x0b\n\x07OUTLINE\x10\x02\x62\x06proto3"
|
||||
)
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "LayoutEngine_pb2", _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals["_LAYOUTENGINE"]._serialized_start = 22
|
||||
_globals["_LAYOUTENGINE"]._serialized_end = 72
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
27
pyinfra/proto/NodeType_pb2.py
Normal file
27
pyinfra/proto/NodeType_pb2.py
Normal file
@ -0,0 +1,27 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by the protocol buffer compiler. DO NOT EDIT!
|
||||
# source: NodeType.proto
|
||||
# Protobuf Python Version: 4.25.4
|
||||
"""Generated protocol buffer code."""
|
||||
from google.protobuf import descriptor as _descriptor
|
||||
from google.protobuf import descriptor_pool as _descriptor_pool
|
||||
from google.protobuf import symbol_database as _symbol_database
|
||||
from google.protobuf.internal import builder as _builder
|
||||
|
||||
# @@protoc_insertion_point(imports)
|
||||
|
||||
_sym_db = _symbol_database.Default()
|
||||
|
||||
|
||||
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
|
||||
b"\n\x0eNodeType.proto*\x93\x01\n\x08NodeType\x12\x0c\n\x08\x44OCUMENT\x10\x00\x12\x0b\n\x07SECTION\x10\x01\x12\x11\n\rSUPER_SECTION\x10\x02\x12\x0c\n\x08HEADLINE\x10\x03\x12\r\n\tPARAGRAPH\x10\x04\x12\t\n\x05TABLE\x10\x05\x12\x0e\n\nTABLE_CELL\x10\x06\x12\t\n\x05IMAGE\x10\x07\x12\n\n\x06HEADER\x10\x08\x12\n\n\x06\x46OOTER\x10\tb\x06proto3"
|
||||
)
|
||||
|
||||
_globals = globals()
|
||||
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
|
||||
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "NodeType_pb2", _globals)
|
||||
if _descriptor._USE_C_DESCRIPTORS == False:
|
||||
DESCRIPTOR._options = None
|
||||
_globals["_NODETYPE"]._serialized_start = 19
|
||||
_globals["_NODETYPE"]._serialized_end = 166
|
||||
# @@protoc_insertion_point(module_scope)
|
||||
0
pyinfra/proto/__init__.py
Normal file
0
pyinfra/proto/__init__.py
Normal file
95
pyinfra/storage/proto_data_loader.py
Normal file
95
pyinfra/storage/proto_data_loader.py
Normal file
@ -0,0 +1,95 @@
|
||||
import re
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
from google.protobuf.json_format import MessageToDict
|
||||
from kn_utils.logging import logger
|
||||
|
||||
from pyinfra.proto import (
|
||||
DocumentPage_pb2,
|
||||
DocumentPositionData_pb2,
|
||||
DocumentStructure_pb2,
|
||||
DocumentTextData_pb2,
|
||||
)
|
||||
|
||||
|
||||
class ProtoDataLoader:
|
||||
"""Loads proto data from a file and returns it as a dictionary or list.
|
||||
|
||||
The loader is a singleton and should be used as a callable. The file name and byte data are passed as arguments.
|
||||
|
||||
The document type is determined based on the file name and the data is returned as a dictionary or list, depending
|
||||
on the document type.
|
||||
The DocumentType enum contains all supported document types and their corresponding proto schema.
|
||||
KEYS_TO_UNPACK contains the keys that should be unpacked from the message dictionary. Keys are unpacked if the
|
||||
message dictionary contains only one key. This behaviour is necessary since lists are wrapped in a dictionary.
|
||||
"""
|
||||
|
||||
_instance = None
|
||||
_pattern = None
|
||||
|
||||
class DocumentType(Enum):
|
||||
STRUCTURE = (DocumentStructure_pb2.DocumentStructure, "DocumentStructure")
|
||||
TEXT = (DocumentTextData_pb2.AllDocumentTextData, "AllDocumentTextData")
|
||||
PAGES = (DocumentPage_pb2.AllDocumentPages, "AllDocumentPages")
|
||||
POSITION = (DocumentPositionData_pb2.AllDocumentPositionData, "AllDocumentPositionData")
|
||||
|
||||
KEYS_TO_UNPACK = ["documentTextData", "documentPages", "documentPositionData"]
|
||||
|
||||
@classmethod
|
||||
def _build_pattern(cls) -> re.Pattern:
|
||||
types = "|".join([dt.name for dt in cls.DocumentType])
|
||||
return re.compile(rf"\..*({types}).*\.proto.*")
|
||||
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if cls._instance is None:
|
||||
cls._instance = super().__new__(cls)
|
||||
cls._pattern = cls._build_pattern()
|
||||
return cls._instance
|
||||
|
||||
def __call__(self, file_name: str | Path, data: bytes) -> dict:
|
||||
return self._load(file_name, data)
|
||||
|
||||
def _load(self, file_name: str | Path, data: bytes) -> dict | list:
|
||||
file_name = str(file_name)
|
||||
document_type = self._match(file_name)
|
||||
|
||||
if not document_type:
|
||||
logger.error(f"Unknown document type: {file_name}, supported types: {self.DocumentType}")
|
||||
return {}
|
||||
|
||||
logger.debug(f"Loading document type: {document_type}")
|
||||
schema, _ = self.DocumentType[document_type].value
|
||||
message = schema()
|
||||
message.ParseFromString(data)
|
||||
message_dict = MessageToDict(message, including_default_value_fields=True)
|
||||
message_dict = convert_int64_fields(message_dict)
|
||||
|
||||
return self._unpack(message_dict)
|
||||
|
||||
def _match(self, file_name: str) -> str | None:
|
||||
match = self._pattern.search(file_name)
|
||||
return match.group(1) if match else None
|
||||
|
||||
def _unpack(self, message_dict: dict) -> list | dict:
|
||||
if len(message_dict) > 1:
|
||||
return message_dict
|
||||
|
||||
for key in self.KEYS_TO_UNPACK:
|
||||
if key in message_dict:
|
||||
logger.debug(f"Unpacking key: {key}")
|
||||
return message_dict[key]
|
||||
|
||||
return message_dict
|
||||
|
||||
|
||||
def convert_int64_fields(obj):
|
||||
# FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python)
|
||||
if isinstance(obj, dict):
|
||||
for key, value in obj.items():
|
||||
obj[key] = convert_int64_fields(value)
|
||||
elif isinstance(obj, list):
|
||||
return [convert_int64_fields(item) for item in obj]
|
||||
elif isinstance(obj, str) and obj.isdigit():
|
||||
return int(obj)
|
||||
return obj
|
||||
@ -6,6 +6,7 @@ from typing import Union
|
||||
from kn_utils.logging import logger
|
||||
from pydantic import BaseModel, ValidationError
|
||||
|
||||
from pyinfra.storage.proto_data_loader import ProtoDataLoader
|
||||
from pyinfra.storage.storages.storage import Storage
|
||||
|
||||
|
||||
@ -104,7 +105,14 @@ def _download_single_file(file_path: str, storage: Storage) -> bytes:
|
||||
data = storage.get_object(file_path)
|
||||
|
||||
data = gzip.decompress(data) if ".gz" in file_path else data
|
||||
data = json.loads(data.decode("utf-8")) if ".json" in file_path else data
|
||||
|
||||
if ".json" in file_path:
|
||||
data = json.loads(data.decode("utf-8"))
|
||||
elif ".proto" in file_path:
|
||||
data = ProtoDataLoader()(file_path, data)
|
||||
else:
|
||||
pass # identity for other file types
|
||||
|
||||
logger.info(f"Downloaded {file_path} from storage.")
|
||||
|
||||
return data
|
||||
|
||||
@ -34,6 +34,7 @@ opentelemetry-instrumentation-requests = "^0.46b0"
|
||||
opentelemetry-instrumentation-fastapi = "^0.46b0"
|
||||
wcwidth = "<=0.2.12"
|
||||
azure-monitor-opentelemetry = "^1.6.0"
|
||||
protobuf = "^4.25.3" # FIXME: update to ^5.27.2 after opentelemetry is updated (see README.md/Protobuf)
|
||||
aio-pika = "^9.4.2"
|
||||
aiohttp = "^3.9.5"
|
||||
tenacity = "^8.5.0"
|
||||
@ -47,6 +48,9 @@ coverage = "^7.3"
|
||||
requests = "^2.31"
|
||||
pre-commit = "^3.6.0"
|
||||
cyclonedx-bom = "^4.1.1"
|
||||
dvc = "^3.51.2"
|
||||
dvc-azure = "^3.1.0"
|
||||
deepdiff = "^7.0.1"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
minversion = "6.0"
|
||||
|
||||
6
tests/data.dvc
Normal file
6
tests/data.dvc
Normal file
@ -0,0 +1,6 @@
|
||||
outs:
|
||||
- md5: 7d36b38a27b5b959beec9e0e772c14c4.dir
|
||||
size: 23067894
|
||||
nfiles: 8
|
||||
hash: md5
|
||||
path: data
|
||||
@ -11,10 +11,18 @@ def exporter(settings):
|
||||
return get_exporter(settings)
|
||||
|
||||
|
||||
class TestOpenTelemetry:
|
||||
def test_queue_messages_are_traced(self, queue_manager, input_message, stop_message, settings, exporter):
|
||||
setup_trace(settings, exporter=exporter)
|
||||
@pytest.fixture(autouse=True)
|
||||
def setup_test_trace(settings, exporter, tracing_type):
|
||||
settings.tracing.type = tracing_type
|
||||
setup_trace(settings, exporter=exporter)
|
||||
|
||||
|
||||
class TestOpenTelemetry:
|
||||
@pytest.mark.xfail(
|
||||
reason="Azure Monitor requires a connection string. Therefore the test is allowed to fail in this case."
|
||||
)
|
||||
@pytest.mark.parametrize("tracing_type", ["opentelemetry", "azure_monitor"])
|
||||
def test_queue_messages_are_traced(self, queue_manager, input_message, stop_message, settings, exporter):
|
||||
instrument_pika()
|
||||
|
||||
queue_manager.purge_queues()
|
||||
@ -31,21 +39,3 @@ class TestOpenTelemetry:
|
||||
assert (
|
||||
exported_trace["resource"]["attributes"]["service.name"] == settings.tracing.opentelemetry.service_name
|
||||
)
|
||||
|
||||
# def test_webserver_requests_are_traced(self, settings):
|
||||
# settings.tracing.opentelemetry.exporter = "console"
|
||||
# settings.tracing.enabled = True
|
||||
#
|
||||
# app = FastAPI()
|
||||
#
|
||||
# @app.get("/test")
|
||||
# def test():
|
||||
# return {"test": "test"}
|
||||
#
|
||||
# thread = create_webserver_thread_from_settings(app, settings)
|
||||
# thread.start()
|
||||
# sleep(1)
|
||||
#
|
||||
# requests.get(f"http://{settings.webserver.host}:{settings.webserver.port}/test")
|
||||
#
|
||||
# thread.join(timeout=1)
|
||||
|
||||
80
tests/unit_test/proto_data_loader_test.py
Normal file
80
tests/unit_test/proto_data_loader_test.py
Normal file
@ -0,0 +1,80 @@
|
||||
import gzip
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from deepdiff import DeepDiff
|
||||
|
||||
from pyinfra.storage.proto_data_loader import ProtoDataLoader
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def test_data_dir():
|
||||
return Path(__file__).parents[1] / "data"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def document_data(request, test_data_dir) -> (str, bytes, dict | list):
|
||||
doc_type = request.param
|
||||
input_file_path = test_data_dir / f"72ea04dfdbeb277f37b9eb127efb0896.{doc_type}.proto.gz"
|
||||
target_file_path = test_data_dir / f"3f9d3d9f255007de8eff13648321e197.{doc_type}.json.gz"
|
||||
|
||||
input_data = input_file_path.read_bytes()
|
||||
target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))
|
||||
|
||||
return input_file_path, input_data, target_data
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def proto_data_loader():
|
||||
return ProtoDataLoader()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def should_match():
|
||||
return [
|
||||
"a.DOCUMENT_STRUCTURE.proto.gz",
|
||||
"a.DOCUMENT_TEXT.proto.gz",
|
||||
"a.DOCUMENT_PAGES.proto.gz",
|
||||
"a.DOCUMENT_POSITION.proto.gz",
|
||||
"b.DOCUMENT_STRUCTURE.proto",
|
||||
"b.DOCUMENT_TEXT.proto",
|
||||
"b.DOCUMENT_PAGES.proto",
|
||||
"b.DOCUMENT_POSITION.proto",
|
||||
"c.STRUCTURE.proto.gz",
|
||||
"c.TEXT.proto.gz",
|
||||
"c.PAGES.proto.gz",
|
||||
"c.POSITION.proto.gz",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason="FIXME: The test is not stable, but hast to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
|
||||
)
|
||||
# As DOCUMENT_POSITION is a very large file, the test takes forever. If you want to test it, add "DOCUMENT_POSITION" to the list below.
|
||||
@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES"], indirect=True)
|
||||
def test_proto_data_loader_end2end(document_data, proto_data_loader):
|
||||
file_path, data, target = document_data
|
||||
data = gzip.decompress(data)
|
||||
loaded_data = proto_data_loader(file_path, data)
|
||||
|
||||
loaded_data_str = json.dumps(loaded_data, sort_keys=True)
|
||||
target_str = json.dumps(target, sort_keys=True)
|
||||
|
||||
diff = DeepDiff(sorted(loaded_data_str), sorted(target_str), ignore_order=True)
|
||||
|
||||
# FIXME: remove this block when the test is stable
|
||||
# if diff:
|
||||
# with open("/tmp/diff.json", "w") as f:
|
||||
# f.write(diff.to_json(indent=2))
|
||||
|
||||
assert not diff
|
||||
|
||||
|
||||
def test_proto_data_loader_unknown_document_type(proto_data_loader):
|
||||
assert not proto_data_loader("unknown_document_type.proto", b"")
|
||||
|
||||
|
||||
def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
|
||||
for file_name in should_match:
|
||||
assert proto_data_loader._match(file_name) is not None
|
||||
Loading…
x
Reference in New Issue
Block a user