Compare commits

28 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 3ef4246d1e |  |
|  | 841c492639 |  |
|  | ead069d3a7 |  |
|  | 044ea6cf0a |  |
|  | ff7547e2c6 |  |
|  | fbf79ef758 |  |
|  | f382887d40 |  |
|  | 5c4400aa8b |  |
|  | 5ce66f18a0 |  |
|  | ea0c55930a |  |
|  | 87f57e2244 |  |
|  | 3fb8c4e641 |  |
|  | e23f63acf0 |  |
|  | d3fecc518e |  |
|  | 341500d463 |  |
|  | e002f77fd5 |  |
|  | 3c6d8f2dcc |  |
|  | f6d6ba40bb |  |
|  | 6a0bbad108 |  |
|  | 527a671a75 |  |
|  | cf91189728 |  |
|  | 61a6d0eeed |  |
|  | bc0b355ff9 |  |
|  | 235e27b74c |  |
|  | 1540c2894e |  |
|  | 9b60594ce1 |  |
|  | 3d3c76b466 |  |
|  | 9d4ec84b49 |  |
GitLab CI configuration:

@@ -1,49 +1,23 @@
+# CI for services, check gitlab repo for python package CI
 include:
   - project: "Gitlab/gitlab"
-    ref: 0.3.0
+    ref: main
-    file: "/ci-templates/research/python_pkg_venv_test_build_release_gitlab-ci.yml"
+    file: "/ci-templates/research/python_pkg-test-build-release.gitlab-ci.yml"

-default:
-  image: python:3.10
-
+# set project variables here
 variables:
-  PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
-  GITLAB_PYPI_URL: https://gitlab.knecon.com/api/v4/projects/${CI_PROJECT_ID}/packages/pypi
-  PYPI_REGISTRY_RESEARCH: https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
-  POETRY_SOURCE_REF_RESEARCH: gitlab-research
-  PYPI_REGISTRY_RED: https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
-  POETRY_SOURCE_REF_RED: gitlab-red
-  PYPI_REGISTRY_FFORESIGHT: https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
-  POETRY_SOURCE_REF_FFORESIGHT: gitlab-fforesight
-  # POETRY_HOME: /opt/poetry
+  NEXUS_PROJECT_DIR: research # subfolder in Nexus docker-gin where your container will be stored
+  IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
+  REPORTS_DIR: reports
+  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
+  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
+  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
+  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.

-setup-poetry-venv:
-  stage: setup
-  script:
-    - env # check env vars
-    # install poetry & return versions
-    - pip install --upgrade pip
-    - pip -V
-    - python -V
-    - pip install poetry
-    - poetry -V
-    # configure poetry
-    - poetry config installer.max-workers 10
-    - poetry config virtualenvs.in-project true
-    - poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH}
-    - poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${CI_REGISTRY_USER} ${CI_JOB_TOKEN}
-    - poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED}
-    - poetry config http-basic.${POETRY_SOURCE_REF_RED} ${CI_REGISTRY_USER} ${CI_JOB_TOKEN}
-    - poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT}
-    - poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${CI_REGISTRY_USER} ${CI_JOB_TOKEN}
-    # create and activate venv
-    - poetry env use $(which python)
-    - source .venv/bin/activate
-    - python -m ensurepip
-    - env # check env vars again
-    # install from poetry.lock file
-    - poetry install --all-extras -vvv
-
-run-tests:
-  script:
-    - echo "Disabled until we have an automated way to run docker compose before tests."
+############
+# UNIT TESTS
+unit-tests:
+  variables:
+    ###### UPDATE/EDIT ######
+    UNIT_TEST_DIR: "tests/unit_test"
pre-commit configuration:

@@ -5,7 +5,7 @@ default_language_version:
   python: python3.10
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
@@ -39,7 +39,7 @@ repos:
        - --profile black

  - repo: https://github.com/psf/black
-    rev: 24.4.2
+    rev: 24.10.0
    hooks:
      - id: black
        # exclude: ^(docs/|notebooks/|data/|src/secrets/)
@@ -47,7 +47,7 @@ repos:
        - --line-length=120

  - repo: https://github.com/compilerla/conventional-pre-commit
-    rev: v3.2.0
+    rev: v3.6.0
    hooks:
      - id: conventional-pre-commit
        pass_filenames: false
README.md (61 lines changed):

@@ -6,7 +6,7 @@
 4. [ Module Installation ](#module-installation)
 5. [ Scripts ](#scripts)
 6. [ Tests ](#tests)
-7. [ Protobuf ](#protobuf)
+7. [ Opentelemetry protobuf dependency hell ](#opentelemetry-protobuf-dependency-hell)

 ## About

@@ -73,6 +73,17 @@ the [complete example](pyinfra/examples.py).
 | TRACING\_\_OPENTELEMETRY\_\_EXPORTER | tracing.opentelemetry.exporter | Name of exporter |
 | KUBERNETES\_\_POD_NAME | kubernetes.pod_name | Service pod name |

+## Setup
+
+**IMPORTANT** you need to set the following environment variables before running the setup script:
+
+- ``$NEXUS_USER`` your Nexus user (usually equal to firstname.lastname@knecon.com)
+- ``$NEXUS_PASSWORD`` your Nexus password (usually equal to your Azure Login)
+
+```shell
+# create venv and activate it
+source ./scripts/setup/devenvsetup.sh {{ cookiecutter.python_version }} $NEXUS_USER $NEXUS_PASSWORD
+source .venv/bin/activate
+```
+
 ### OpenTelemetry

 Open telemetry (vis its Python SDK) is set up to be as unobtrusive as possible; for typical use cases it can be
@@ -202,48 +213,8 @@ $ python scripts/send_request.py
 Tests require a running minio and rabbitmq container, meaning you have to run `docker compose up` in the tests folder
 before running the tests.

-## Protobuf
-
-### Opentelemetry Compatibility Issue
-
-**Note**: Status: 31/07/2024, the currently used `opentelemetry-exporter-otlp-proto-http` version `1.25.0` requires
-a `protobuf` version < `5.x.x` and is not compatible with the latest protobuf version `5.27.x`. This is an [open issue](https://github.com/open-telemetry/opentelemetry-python/issues/3958) in opentelemetry, because [support for 4.25.x ends in Q2 '25](https://protobuf.dev/support/version-support/#python). Therefore, we should keep this in mind and update the dependency once opentelemetry includes support for `protobuf 5.27.x`.
-
-### Install Protobuf Compiler
-
-**Linux**
-
-1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protobuf-25.4.zip`
-2. Extract the files under `$HOME/.local` or another directory of your choice
-   ```bash
-   unzip protoc-<version>-linux-x86_64.zip -d $HOME/.local
-   ```
-3. Ensure that the `bin` directory is in your `PATH` by adding the following line to your `.bashrc` or `.zshrc`:
-   ```bash
-   export PATH="$PATH:$HOME/.local/bin"
-   ```
-
-**MacOS**
-
-1. Download the version of the protobuf compiler matching the protobuf package, currently v4.25.4 so protoc v25.4, from [GitHub](https://github.com/protocolbuffers/protobuf/releases) -> `protoc-25.4-osx-universal_binary.zip`
-2. Extract the files to a directory of your choice
-3. Copy the executable bin `protoc` to `/usr/local/bin`
-   ```bash
-   sudo cp /Users/you/location-of-unzipped-dir/bin/protoc /usr/local/bin/
-   ```
-4. Open `protoc` in `/usr/local/bin/` via Finder to make it executable, now it should be also on your `PATH`
-
-### Compile Protobuf Files
-
-1. Ensure that the protobuf compiler is installed on your system. You can check this by running:
-   ```bash
-   protoc --version
-   ```
-2. Compile proto files:
-   ```bash
-   protoc --proto_path=./config/proto --python_out=./pyinfra/proto ./config/proto/*.proto
-   ```
-3. Manually adjust import statements in the generated files to match the package structure, e.g.:
-   `import EntryData_pb2 as EntryData__pb2` -> `import pyinfra.proto.EntryData_pb2 as EntryData__pb2`.
-   This does not work automatically because the generated files are not in the same directory as the proto files.
+## OpenTelemetry Protobuf Dependency Hell
+
+**Note**: Status 2025/01/09: the currently used `opentelemetry-exporter-otlp-proto-http` version `1.25.0` requires
+a `protobuf` version < `5.x.x` and is not compatible with the latest protobuf version `5.27.x`. This is an [open issue](https://github.com/open-telemetry/opentelemetry-python/issues/3958) in opentelemetry, because [support for 4.25.x ends in Q2 '25](https://protobuf.dev/support/version-support/#python).
+Therefore, we should keep this in mind and update the dependency once opentelemetry includes support for `protobuf 5.27.x`.
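The removed README section noted that the post-`protoc` import rewrite "does not work automatically". If that step is ever needed again, it can be scripted; the sketch below is only an illustration — the `pyinfra/proto` directory and the `pyinfra.proto` prefix are taken from the example in the removed text, not from the current repository state.

```python
import re
from pathlib import Path

# Assumed locations, taken from the removed README example.
GENERATED_DIR = Path("pyinfra/proto")
PACKAGE_PREFIX = "pyinfra.proto"


def fix_generated_imports() -> None:
    """Rewrite bare '<Name>_pb2' imports in protoc output to package-qualified imports."""
    pattern = re.compile(r"^import (\w+_pb2) as", flags=re.MULTILINE)
    for module in GENERATED_DIR.glob("*_pb2.py"):
        text = module.read_text()
        # e.g. "import EntryData_pb2 as EntryData__pb2"
        #   -> "import pyinfra.proto.EntryData_pb2 as EntryData__pb2"
        module.write_text(pattern.sub(rf"import {PACKAGE_PREFIX}.\1 as", text))


if __name__ == "__main__":
    fix_generated_imports()
```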
Deleted protobuf schema `DocumentPage.proto`:

@@ -1,21 +0,0 @@

syntax = "proto3";

message AllDocumentPages {

    repeated DocumentPage documentPages = 1;
}

message DocumentPage {
    // The page number, starting with 1.
    int32 number = 1;

    // The page height in PDF user units.
    int32 height = 2;

    // The page width in PDF user units.
    int32 width = 3;

    // The page rotation as specified by the PDF.
    int32 rotation = 4;
}
Deleted protobuf schema `DocumentPositionData.proto`:

@@ -1,25 +0,0 @@

syntax = "proto3";

message AllDocumentPositionData {

    repeated DocumentPositionData documentPositionData = 1;
}

message DocumentPositionData {
    // Identifier of the text block.
    int64 id = 1;

    // For each string coordinate in the search text of the text block, the array contains an entry relating the string coordinate to the position coordinate.
    // This is required due to the text and position coordinates not being equal.
    repeated int32 stringIdxToPositionIdx = 2;

    // The bounding box for each glyph as a rectangle. This matrix is of size (n,4), where n is the number of glyphs in the text block.
    // The second dimension specifies the rectangle with the value x, y, width, height, with x, y specifying the lower left corner.
    // In order to access this information, the stringIdxToPositionIdx array must be used to transform the coordinates.
    repeated Position positions = 3;

    // Definition of a BoundingBox that contains x, y, width, and height.
    message Position {
        repeated float value = 1;
    }
}
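To make the `stringIdxToPositionIdx` mapping concrete, here is a small illustrative sketch; the values are invented purely to show how a string offset is translated to a glyph bounding box, and are not taken from real data.

```python
# Invented example: text offsets -> glyph index -> bounding box (x, y, width, height).
string_idx_to_position_idx = [0, 0, 1]  # text chars 0 and 1 share glyph 0, char 2 uses glyph 1
positions = [
    [10.0, 20.0, 5.0, 12.0],  # glyph 0, lower-left corner at (10, 20)
    [15.0, 20.0, 4.0, 12.0],  # glyph 1
]

char_index = 2
box = positions[string_idx_to_position_idx[char_index]]
print(box)  # [15.0, 20.0, 4.0, 12.0]
```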
Deleted protobuf schema `DocumentStructure.proto`:

@@ -1,8 +0,0 @@

syntax = "proto3";

import "EntryData.proto";

message DocumentStructure {
    // The root EntryData represents the Document.
    EntryData root = 1;
}
Deleted protobuf schema `DocumentTextData.proto`:

@@ -1,29 +0,0 @@

syntax = "proto3";

message AllDocumentTextData {

    repeated DocumentTextData documentTextData = 1;
}

message DocumentTextData {
    // Identifier of the text block.
    int64 id = 1;

    // The page the text block occurs on.
    int64 page = 2;

    // The text of the text block.
    string searchText = 3;

    // Each text block is assigned a number on a page, starting from 0.
    int32 numberOnPage = 4;

    // The text blocks are ordered, this number represents the start of the text block as a string offset.
    int32 start = 5;

    // The text blocks are ordered, this number represents the end of the text block as a string offset.
    int32 end = 6;

    // The line breaks in the text of this semantic node in string offsets. They are exclusive end. At the end of each semantic node there is an implicit linebreak.
    repeated int32 lineBreaks = 7;
}
Deleted protobuf schema `EntryData.proto`:

@@ -1,27 +0,0 @@

syntax = "proto3";

import "LayoutEngine.proto";
import "NodeType.proto";

message EntryData {
    // Type of the semantic node.
    NodeType type = 1;

    // Specifies the position in the parsed tree structure.
    repeated int32 treeId = 2;

    // Specifies the text block IDs associated with this semantic node.
    repeated int64 atomicBlockIds = 3;

    // Specifies the pages this semantic node appears on.
    repeated int64 pageNumbers = 4;

    // Some semantic nodes have additional information, this information is stored in this Map.
    map<string, string> properties = 5;

    // All child Entries of this Entry.
    repeated EntryData children = 6;

    // Describes the origin of the semantic node.
    repeated LayoutEngine engines = 7;
}
Deleted protobuf schema `LayoutEngine.proto`:

@@ -1,7 +0,0 @@

syntax = "proto3";

enum LayoutEngine {
    ALGORITHM = 0;
    AI = 1;
    OUTLINE = 2;
}
Deleted protobuf schema `NodeType.proto`:

@@ -1,14 +0,0 @@

syntax = "proto3";

enum NodeType {
    DOCUMENT = 0;
    SECTION = 1;
    SUPER_SECTION = 2;
    HEADLINE = 3;
    PARAGRAPH = 4;
    TABLE = 5;
    TABLE_CELL = 6;
    IMAGE = 7;
    HEADER = 8;
    FOOTER = 9;
}
poetry.lock (generated, 3978 lines): file diff suppressed because it is too large.
Service entry point (queue consumer / webserver runner):

@@ -1,4 +1,5 @@
 import asyncio
+import signal
 import sys

 import aiohttp
@@ -22,12 +23,44 @@ from pyinfra.webserver.utils import (
     run_async_webserver,
 )

+shutdown_flag = False
+
+
+async def graceful_shutdown(manager: AsyncQueueManager, queue_task, webserver_task):
+    global shutdown_flag
+    shutdown_flag = True
+    logger.info("SIGTERM received, shutting down gracefully...")
+
+    if queue_task and not queue_task.done():
+        queue_task.cancel()
+
+    # await queue manager shutdown
+    await asyncio.gather(queue_task, manager.shutdown(), return_exceptions=True)
+
+    if webserver_task and not webserver_task.done():
+        webserver_task.cancel()
+
+    # await webserver shutdown
+    await asyncio.gather(webserver_task, return_exceptions=True)
+
+    logger.info("Shutdown complete.")
+
+
 async def run_async_queues(manager: AsyncQueueManager, app, port, host):
     """Run the async webserver and the async queue manager concurrently."""
     queue_task = None
     webserver_task = None
     tenant_api_available = True
+
+    # add signal handler for SIGTERM and SIGINT
+    loop = asyncio.get_running_loop()
+    loop.add_signal_handler(
+        signal.SIGTERM, lambda: asyncio.create_task(graceful_shutdown(manager, queue_task, webserver_task))
+    )
+    loop.add_signal_handler(
+        signal.SIGINT, lambda: asyncio.create_task(graceful_shutdown(manager, queue_task, webserver_task))
+    )
+
     try:
         active_tenants = await manager.fetch_active_tenants()

@@ -45,17 +78,20 @@ async def run_async_queues(manager: AsyncQueueManager, app, port, host):
         logger.error(f"An error occurred while running async queues: {e}", exc_info=True)
         sys.exit(1)
     finally:
-        logger.info("Signal received, shutting down...")
-        if not tenant_api_available:
-            sys.exit(0)
-        if queue_task and not queue_task.done():
-            queue_task.cancel()
-        if webserver_task and not webserver_task.done():
-            webserver_task.cancel()
-
-        await manager.shutdown()
-
-        await asyncio.gather(queue_task, webserver_task, return_exceptions=True)
+        if shutdown_flag:
+            logger.debug("Graceful shutdown already in progress.")
+        else:
+            logger.warning("Initiating shutdown due to error or manual interruption.")
+            if not tenant_api_available:
+                sys.exit(0)
+            if queue_task and not queue_task.done():
+                queue_task.cancel()
+
+            if webserver_task and not webserver_task.done():
+                webserver_task.cancel()
+
+            await asyncio.gather(queue_task, manager.shutdown(), webserver_task, return_exceptions=True)
+            logger.info("Shutdown complete.")


 def start_standard_queue_consumer(
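The change above registers SIGTERM/SIGINT handlers on the running event loop so that a Kubernetes pod termination drains the queue consumer and webserver instead of killing them mid-message. A minimal, self-contained sketch of the same pattern — the worker and names below are illustrative, not the pyinfra API:

```python
import asyncio
import signal


async def worker() -> None:
    # stand-in for the queue consumer; cancelled on shutdown
    while True:
        await asyncio.sleep(1)


async def main() -> None:
    task = asyncio.create_task(worker())
    stop = asyncio.Event()

    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, stop.set)

    await stop.wait()                                    # block until a signal arrives
    task.cancel()                                        # cancel in-flight work ...
    await asyncio.gather(task, return_exceptions=True)   # ... and wait for it to unwind


if __name__ == "__main__":
    asyncio.run(main())
```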
Deleted generated protobuf modules (`pyinfra/proto/*_pb2.py`):

Seven protoc-generated Python modules were removed together with their schemas: `DocumentPage_pb2.py`, `DocumentPositionData_pb2.py`, `DocumentStructure_pb2.py`, `DocumentTextData_pb2.py`, `EntryData_pb2.py`, `LayoutEngine_pb2.py` and `NodeType_pb2.py`. Each is marked "Generated by the protocol buffer compiler. DO NOT EDIT!" (Protobuf Python Version 4.25.5) and consists of the serialized file descriptor passed to `_descriptor_pool.Default().AddSerializedFile(...)`, the `_builder.BuildMessageAndEnumDescriptors` / `_builder.BuildTopDescriptorsAndMessages` calls, and the `_serialized_start` / `_serialized_end` offsets for the generated messages and enums. `DocumentStructure_pb2.py` additionally imported `pyinfra.proto.EntryData_pb2 as EntryData__pb2`, and `EntryData_pb2.py` imported `pyinfra.proto.LayoutEngine_pb2 as LayoutEngine__pb2` and `pyinfra.proto.NodeType_pb2 as NodeType__pb2`. The deleted `DocumentPage_pb2.py` is representative of all seven:

@@ -1,29 +0,0 @@

# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DocumentPage.proto
# Protobuf Python Version: 4.25.5
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder

# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()


DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
    b'\n\x12\x44ocumentPage.proto"8\n\x10\x41llDocumentPages\x12$\n\rdocumentPages\x18\x01 \x03(\x0b\x32\r.DocumentPage"O\n\x0c\x44ocumentPage\x12\x0e\n\x06number\x18\x01 \x01(\x05\x12\x0e\n\x06height\x18\x02 \x01(\x05\x12\r\n\x05width\x18\x03 \x01(\x05\x12\x10\n\x08rotation\x18\x04 \x01(\x05\x62\x06proto3'
)

_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "DocumentPage_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
    DESCRIPTOR._options = None
    _globals["_ALLDOCUMENTPAGES"]._serialized_start = 22
    _globals["_ALLDOCUMENTPAGES"]._serialized_end = 78
    _globals["_DOCUMENTPAGE"]._serialized_start = 80
    _globals["_DOCUMENTPAGE"]._serialized_end = 159
# @@protoc_insertion_point(module_scope)
`AsyncQueueManager` (queue manager):

@@ -122,6 +122,11 @@ class AsyncQueueManager:
             self.config.service_response_exchange_name, ExchangeType.DIRECT, durable=True
         )

+        # we must declare DLQ to handle error messages
+        self.dead_letter_queue = await self.channel.declare_queue(
+            self.config.service_dead_letter_queue_name, durable=True
+        )
+
     @retry(tries=5, exceptions=(AMQPConnectionError, ChannelInvalidStateError), reraise=True, logger=logger)
     async def setup_tenant_queue(self) -> None:
         self.tenant_exchange_queue = await self.channel.declare_queue(
@@ -160,6 +165,10 @@ class AsyncQueueManager:
         input_queue = await self.channel.declare_queue(
             queue_name,
             durable=True,
+            arguments={
+                "x-dead-letter-exchange": "",
+                "x-dead-letter-routing-key": self.config.service_dead_letter_queue_name,
+            },
         )
         await input_queue.bind(self.input_exchange, routing_key=tenant_id)
         self.consumer_tags[tenant_id] = await input_queue.consume(self.process_input_message)
@@ -290,7 +299,7 @@ class AsyncQueueManager:
             while self.message_count != 0:
                 logger.debug(f"Messages are still being processed: {self.message_count=} ")
                 await asyncio.sleep(2)
-            await self.channel.close()
+            await self.channel.close(exc=asyncio.CancelledError)
             logger.debug("Channel closed.")
         else:
             logger.debug("No channel to close.")
@@ -304,7 +313,7 @@ class AsyncQueueManager:
     async def close_connection(self) -> None:
         try:
             if self.connection and not self.connection.is_closed:
-                await self.connection.close()
+                await self.connection.close(exc=asyncio.CancelledError)
                 logger.debug("Connection closed.")
             else:
                 logger.debug("No connection to close.")
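The added `arguments` mean that messages rejected or expired on a tenant input queue are re-routed through the default exchange (`""`) to the service dead-letter queue declared above. A minimal aio-pika sketch of the same wiring — the connection URL and queue names are placeholders, not values from this repository:

```python
import asyncio

import aio_pika


async def main() -> None:
    connection = await aio_pika.connect_robust("amqp://guest:guest@localhost/")
    channel = await connection.channel()

    # Queue that collects dead-lettered messages.
    await channel.declare_queue("service.dlq", durable=True)

    # Work queue: rejected/expired messages are re-routed to "service.dlq"
    # via the default ("") exchange, mirroring the declare_queue arguments above.
    await channel.declare_queue(
        "tenant-a.input",
        durable=True,
        arguments={
            "x-dead-letter-exchange": "",
            "x-dead-letter-routing-key": "service.dlq",
        },
    )

    await connection.close()


asyncio.run(main())
```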
Queue callback factory module:

@@ -1,15 +1,16 @@
-from typing import Callable, Union
+from typing import Callable

 from dynaconf import Dynaconf
 from kn_utils.logging import logger

 from pyinfra.storage.connection import get_storage
 from pyinfra.storage.utils import (
-    download_data_as_specified_in_message,
+    download_data_bytes_as_specified_in_message,
     upload_data_as_specified_in_message,
+    DownloadedData,
 )

-DataProcessor = Callable[[Union[dict, bytes], dict], Union[dict, list, str]]
+DataProcessor = Callable[[dict[str, DownloadedData] | DownloadedData, dict], dict | list | str]
 Callback = Callable[[dict], dict]

@@ -28,7 +29,9 @@ def make_download_process_upload_callback(data_processor: DataProcessor, setting

     storage = get_storage(settings, queue_message_payload.get("X-TENANT-ID"))

-    data = download_data_as_specified_in_message(storage, queue_message_payload)
+    data: dict[str, DownloadedData] | DownloadedData = download_data_bytes_as_specified_in_message(
+        storage, queue_message_payload
+    )

     result = data_processor(data, queue_message_payload)
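With this change the callback hands the processor raw bytes plus the source path instead of pre-decoded content, so decompression and parsing move into the service. A hedged sketch of what a processor might now look like — the key handling and file-name conventions are assumptions for illustration only:

```python
import gzip
import json

from pyinfra.storage.utils import DownloadedData  # TypedDict: {"data": bytes, "file_path": str}


def my_data_processor(data: dict[str, DownloadedData] | DownloadedData, payload: dict) -> dict:
    # Normalise the single-file case to the multi-file shape (assumes no top-level key is literally "data").
    files = data if "data" not in data else {"only": data}

    decoded = {}
    for key, item in files.items():
        raw = item["data"]
        if item["file_path"].endswith(".gz"):
            raw = gzip.decompress(raw)  # decompression is now the service's responsibility
        decoded[key] = json.loads(raw) if ".json" in item["file_path"] else raw

    return {"num_inputs": len(decoded)}
```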
Deleted module `pyinfra.storage.proto_data_loader`:

@@ -1,127 +0,0 @@

import re
from enum import Enum
from pathlib import Path

from google.protobuf.json_format import MessageToDict
from kn_utils.logging import logger

from pyinfra.proto import (
    DocumentPage_pb2,
    DocumentPositionData_pb2,
    DocumentStructure_pb2,
    DocumentTextData_pb2,
)


class ProtoDataLoader:
    """Loads proto data from a file and returns it as a dictionary or list.

    The loader is a singleton and should be used as a callable. The file name and byte data are passed as arguments.

    The document type is determined based on the file name and the data is returned as a dictionary or list, depending
    on the document type.
    The DocumentType enum contains all supported document types and their corresponding proto schema.
    KEYS_TO_UNPACK contains the keys that should be unpacked from the message dictionary. Keys are unpacked if the
    message dictionary contains only one key. This behaviour is necessary since lists are wrapped in a dictionary.
    """

    _instance = None
    _pattern = None

    class DocumentType(Enum):
        STRUCTURE = (DocumentStructure_pb2.DocumentStructure, "DocumentStructure")
        TEXT = (DocumentTextData_pb2.AllDocumentTextData, "AllDocumentTextData")
        PAGES = (DocumentPage_pb2.AllDocumentPages, "AllDocumentPages")
        POSITION = (DocumentPositionData_pb2.AllDocumentPositionData, "AllDocumentPositionData")

    KEYS_TO_UNPACK = ["documentTextData", "documentPages", "documentPositionData"]

    @classmethod
    def _build_pattern(cls) -> re.Pattern:
        types = "|".join([dt.name for dt in cls.DocumentType])
        return re.compile(rf"\..*({types}).*\.proto.*")

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._pattern = cls._build_pattern()
        return cls._instance

    def __call__(self, file_name: str | Path, data: bytes) -> dict:
        return self._load(file_name, data)

    def _load(self, file_name: str | Path, data: bytes) -> dict | list:
        file_name = str(file_name)
        document_type = self._match(file_name)

        if not document_type:
            logger.error(f"Unknown document type: {file_name}, supported types: {self.DocumentType}")
            return {}

        logger.debug(f"Loading document type: {document_type}")
        schema, _ = self.DocumentType[document_type].value
        message = schema()
        message.ParseFromString(data)
        message_dict = MessageToDict(message, including_default_value_fields=True)
        message_dict = convert_int64_fields(message_dict)
        if document_type == "POSITION":
            message_dict = transform_positions_to_list(message_dict)

        return self._unpack(message_dict)

    def _match(self, file_name: str) -> str | None:
        match = self._pattern.search(file_name)
        return match.group(1) if match else None

    def _unpack(self, message_dict: dict) -> list | dict:
        if len(message_dict) > 1:
            return message_dict

        for key in self.KEYS_TO_UNPACK:
            if key in message_dict:
                logger.debug(f"Unpacking key: {key}")
                return message_dict[key]

        return message_dict


def convert_int64_fields(obj):
    # FIXME: find a more sophisticated way to convert int64 fields (defaults to str in python)

    # we skip the following keys because the values are expected to be of type str
    skip_keys = ["col", "row", "numberOfCols", "numberOfRows"]
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key in skip_keys:
                continue
            obj[key] = convert_int64_fields(value)
    elif isinstance(obj, list):
        return [convert_int64_fields(item) for item in obj]
    elif isinstance(obj, str) and obj.isdigit():
        return int(obj)
    return obj


def transform_positions_to_list(obj: dict | list) -> dict:
    """Transforms the repeated fields 'positions' to a lists of lists of floats
    as expected by DocumentReader.

    Args:
        obj (dict | list): Proto message dict

    Returns:
        dict: Proto message dict
    """
    if isinstance(obj, dict):
        # Check if 'positions' is in the dictionary and reshape it as list of lists of floats
        if "positions" in obj and isinstance(obj["positions"], list):
            obj["positions"] = [pos["value"] for pos in obj["positions"] if isinstance(pos, dict) and "value" in pos]

        # Recursively apply to all nested dictionaries
        for key, value in obj.items():
            obj[key] = transform_positions_to_list(value)
    elif isinstance(obj, list):
        # Recursively apply to all items in the list
        obj = [transform_positions_to_list(item) for item in obj]

    return obj
`pyinfra.storage.utils`:

@@ -1,12 +1,11 @@
 import gzip
 import json
 from functools import singledispatch
-from typing import Union
+from typing import TypedDict

 from kn_utils.logging import logger
 from pydantic import BaseModel, ValidationError

-from pyinfra.storage.proto_data_loader import ProtoDataLoader
 from pyinfra.storage.storages.storage import Storage

@@ -53,28 +52,27 @@ class TenantIdDossierIdFileIdUploadPayload(BaseModel):

 class TargetResponseFilePathDownloadPayload(BaseModel):
-    targetFilePath: Union[str, dict]
+    targetFilePath: str | dict[str, str]


 class TargetResponseFilePathUploadPayload(BaseModel):
     responseFilePath: str


-def download_data_as_specified_in_message(storage: Storage, raw_payload: dict) -> Union[dict, bytes]:
+class DownloadedData(TypedDict):
+    data: bytes
+    file_path: str
+
+
+def download_data_bytes_as_specified_in_message(
+    storage: Storage, raw_payload: dict
+) -> dict[str, DownloadedData] | DownloadedData:
     """Convenience function to download a file specified in a message payload.
     Supports both legacy and new payload formats. Also supports downloading multiple files at once, which should
     be specified in a dictionary under the 'targetFilePath' key with the file path as value.
-    If the content is compressed with gzip (.gz), it will be decompressed (-> bytes).
-    If the content is a json file, it will be decoded (-> dict).
-    If no file is specified in the payload or the file does not exist in storage, an exception will be raised.
-    In all other cases, the content will be returned as is (-> bytes).
-
-    This function can be extended in the future as needed (e.g. handling of more file types), but since further
-    requirements are not specified at this point in time, and it is unclear what these would entail, the code is kept
-    simple for now to improve readability, maintainability and avoid refactoring efforts of generic solutions that
-    weren't as generic as they seemed.
+    The data is downloaded as bytes and returned as a dictionary with the file path as key and the data as value.
+    In case of several download targets, a nested dictionary is returned with the same keys and dictionaries with
+    the file path and data as values.
     """

     try:
@@ -93,33 +91,25 @@ def download_data_as_specified_in_message(storage: Storage, raw_payload: dict) -

 @singledispatch
-def _download(file_path_or_file_path_dict: Union[str, dict], storage: Storage) -> Union[dict, bytes]:
+def _download(
+    file_path_or_file_path_dict: str | dict[str, str], storage: Storage
+) -> dict[str, DownloadedData] | DownloadedData:
     pass


 @_download.register(str)
-def _download_single_file(file_path: str, storage: Storage) -> bytes:
+def _download_single_file(file_path: str, storage: Storage) -> DownloadedData:
     if not storage.exists(file_path):
         raise FileNotFoundError(f"File '{file_path}' does not exist in storage.")

     data = storage.get_object(file_path)

-    data = gzip.decompress(data) if ".gz" in file_path else data
-
-    if ".json" in file_path:
-        data = json.loads(data.decode("utf-8"))
-    elif ".proto" in file_path:
-        data = ProtoDataLoader()(file_path, data)
-    else:
-        pass  # identity for other file types
-
     logger.info(f"Downloaded {file_path} from storage.")

-    return data
+    return DownloadedData(data=data, file_path=file_path)


 @_download.register(dict)
-def _download_multiple_files(file_path_dict: dict, storage: Storage) -> dict:
+def _download_multiple_files(file_path_dict: dict, storage: Storage) -> dict[str, DownloadedData]:
     return {key: _download(value, storage) for key, value in file_path_dict.items()}
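The helper keeps using `functools.singledispatch` to branch on the payload shape: a plain path downloads one file, while a dict of paths recurses and returns a dict with the same keys. A tiny standalone sketch of that dispatch pattern — the toy `describe` function and the dict-backed "storage" are illustrative only, not the pyinfra API:

```python
from functools import singledispatch


@singledispatch
def describe(target, storage):
    raise TypeError(f"Unsupported target type: {type(target)!r}")


@describe.register(str)
def _describe_single(target: str, storage) -> dict:
    # single path -> one record
    return {"data": storage[target], "file_path": target}


@describe.register(dict)
def _describe_many(target: dict, storage) -> dict:
    # dict of paths -> same keys, one record each
    return {key: describe(path, storage) for key, path in target.items()}


fake_storage = {"a.json.gz": b"...", "b.json.gz": b"..."}
print(describe("a.json.gz", fake_storage))
print(describe({"file_1": "a.json.gz", "file_2": "b.json.gz"}, fake_storage))
```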
`pyinfra.webserver.utils`:

@@ -16,6 +16,13 @@ from pyinfra.config.loader import validate_settings
 from pyinfra.config.validators import webserver_validators


+class PyInfraUvicornServer(uvicorn.Server):
+    # this is a workaround to enable custom signal handlers
+    # https://github.com/encode/uvicorn/issues/1579
+    def install_signal_handlers(self):
+        pass
+
+
 @retry(
     tries=5,
     exceptions=Exception,
@@ -53,12 +60,12 @@ def create_webserver_thread(app: FastAPI, port: int, host: str) -> threading.Thr
 async def run_async_webserver(app: FastAPI, port: int, host: str):
     """Run the FastAPI web server async."""
     config = uvicorn.Config(app, host=host, port=port, log_level=logging.WARNING)
-    server = uvicorn.Server(config)
+    server = PyInfraUvicornServer(config)

     try:
         await server.serve()
     except asyncio.CancelledError:
-        logger.info("Webserver was cancelled.")
+        logger.debug("Webserver was cancelled.")
         server.should_exit = True
         await server.shutdown()
     except Exception as e:
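By default `uvicorn.Server.serve()` installs its own SIGINT/SIGTERM handlers, which would override the handlers registered in `run_async_queues`; subclassing and turning `install_signal_handlers` into a no-op leaves signal ownership with the application's event loop. A compact sketch of the idea, with illustrative names and a placeholder host/port:

```python
import asyncio

import uvicorn
from fastapi import FastAPI


class QuietServer(uvicorn.Server):
    def install_signal_handlers(self) -> None:
        # no-op: the surrounding asyncio application keeps control of SIGTERM/SIGINT
        pass


app = FastAPI()


async def serve_until_cancelled() -> None:
    server = QuietServer(uvicorn.Config(app, host="127.0.0.1", port=8000))
    try:
        await server.serve()
    except asyncio.CancelledError:
        server.should_exit = True
        await server.shutdown()
```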
pyproject.toml:

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyinfra"
-version = "3.3.2"
+version = "4.1.0"
 description = ""
 authors = ["Team Research <research@knecon.com>"]
 license = "All rights reseverd"
@@ -18,29 +18,30 @@ azure-storage-blob = "^12.13"
 # misc utils
 funcy = "^2"
 pycryptodome = "^3.19"
-# research shared packages
-kn-utils = { version = "^0.3.2", source = "gitlab-research" }
 fastapi = "^0.109.0"
 uvicorn = "^0.26.0"
-# [tool.poetry.group.telemetry.dependencies]
-opentelemetry-instrumentation-pika = "^0.46b0"
-opentelemetry-exporter-otlp = "^1.25.0"
-opentelemetry-instrumentation = "^0.46b0"
-opentelemetry-api = "^1.25.0"
-opentelemetry-sdk = "^1.25.0"
-opentelemetry-exporter-otlp-proto-http = "^1.25.0"
-opentelemetry-instrumentation-flask = "^0.46b0"
-opentelemetry-instrumentation-requests = "^0.46b0"
-opentelemetry-instrumentation-fastapi = "^0.46b0"
+# DONT USE GROUPS BECAUSE THEY ARE NOT INSTALLED FOR PACKAGES
+# [tool.poetry.group.internal.dependencies] <<< THIS IS NOT WORKING
+kn-utils = { version = ">=0.4.0", source = "nexus" }
+# We set all opentelemetry dependencies to lower bound because the image classification service depends on a protobuf version <4, but does not use proto files.
+# Therefore, we allow latest possible protobuf version in the services which use proto files. As soon as the dependency issue is fixed set this to the latest possible opentelemetry version
+opentelemetry-instrumentation-pika = ">=0.46b0,<0.50"
+opentelemetry-exporter-otlp = ">=1.25.0,<1.29"
+opentelemetry-instrumentation = ">=0.46b0,<0.50"
+opentelemetry-api = ">=1.25.0,<1.29"
+opentelemetry-sdk = ">=1.25.0,<1.29"
+opentelemetry-exporter-otlp-proto-http = ">=1.25.0,<1.29"
+opentelemetry-instrumentation-flask = ">=0.46b0,<0.50"
+opentelemetry-instrumentation-requests = ">=0.46b0,<0.50"
+opentelemetry-instrumentation-fastapi = ">=0.46b0,<0.50"
+opentelemetry-instrumentation-aio-pika = ">=0.46b0,<0.50"
 wcwidth = "<=0.2.12"
 azure-monitor-opentelemetry = "^1.6.0"
-# We set protobuf to this range because the image classification service depends on a protobuf version <4, but does not use proto files.
-# Therefore, we allow latest possible protobuf version in the services which use proto files. As soon as the dependency issue is fixed set this to the latest possible protobuf version
-protobuf = ">=3.20 <5.0.0"
 aio-pika = "^9.4.2"
 aiohttp = "^3.9.5"
-opentelemetry-instrumentation-aio-pika = "0.46b0"

+# THIS IS NOT AVAILABLE FOR SERVICES THAT IMPLEMENT PYINFRA
 [tool.poetry.group.dev.dependencies]
 pytest = "^7"
 ipykernel = "^6.26.0"
@@ -53,6 +54,7 @@ cyclonedx-bom = "^4.1.1"
 dvc = "^3.51.2"
 dvc-azure = "^3.1.0"
 deepdiff = "^7.0.1"
+pytest-cov = "^5.0.0"

 [tool.pytest.ini_options]
 minversion = "6.0"
@@ -87,12 +89,13 @@ disable = [
 docstring-min-length = 3

 [[tool.poetry.source]]
-name = "PyPI"
+name = "pypi-proxy"
+url = "https://nexus.knecon.com/repository/pypi-proxy/simple"
 priority = "primary"

 [[tool.poetry.source]]
-name = "gitlab-research"
-url = "https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi/simple"
+name = "nexus"
+url = "https://nexus.knecon.com/repository/python/simple"
 priority = "explicit"

 [build-system]
New file `scripts/send_sigterm.py` (17 lines):

@@ -0,0 +1,17 @@

import os
import signal
import time

# BE CAREFUL WITH THIS SCRIPT - THIS SIMULATES A SIGTERM FROM KUBERNETES
target_pid = int(input("Enter the PID of the target script: "))

print(f"Sending SIGTERM to PID {target_pid}...")
time.sleep(1)

try:
    os.kill(target_pid, signal.SIGTERM)
    print("SIGTERM sent.")
except ProcessLookupError:
    print("Process not found.")
except PermissionError:
    print("Permission denied. Are you trying to signal a process you don't own?")
New file `scripts/setup/devenvsetup.sh` (39 lines):

@@ -0,0 +1,39 @@

#!/bin/bash
python_version=$1
nexus_user=$2
nexus_password=$3

# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created

# cd $latest_dir

pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version

# install poetry globally (PREFERRED), only need to install it once
# curl -sSL https://install.python-poetry.org | python3 -

# remember to update poetry once in a while
poetry self update

# install poetry in current python environment, can lead to multiple instances of poetry being installed on one system (DISPREFERRED)
# pip install --upgrade pip
# pip install poetry

poetry config virtualenvs.in-project true
poetry config installer.max-workers 10
poetry config repositories.pypi-proxy "https://nexus.knecon.com/repository/pypi-proxy/simple"
poetry config http-basic.pypi-proxy ${nexus_user} ${nexus_password}
poetry config repositories.nexus https://nexus.knecon.com/repository/python/simple
poetry config http-basic.nexus ${nexus_user} ${nexus_password}

poetry env use $(pyenv which python)
poetry install --with=dev
poetry update

source .venv/bin/activate

pre-commit install
pre-commit autoupdate
@ -27,6 +27,8 @@ def storage(storage_backend, settings):
|
|||||||
def queue_manager(settings):
|
def queue_manager(settings):
|
||||||
settings.rabbitmq_heartbeat = 10
|
settings.rabbitmq_heartbeat = 10
|
||||||
settings.connection_sleep = 5
|
settings.connection_sleep = 5
|
||||||
|
settings.rabbitmq.max_retries = 3
|
||||||
|
settings.rabbitmq.max_delay = 10
|
||||||
queue_manager = QueueManager(settings)
|
queue_manager = QueueManager(settings)
|
||||||
yield queue_manager
|
yield queue_manager
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,6 @@ from sys import stdout
|
|||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
import pika
|
import pika
|
||||||
import pytest
|
|
||||||
from kn_utils.logging import logger
|
from kn_utils.logging import logger
|
||||||
|
|
||||||
logger.remove()
|
logger.remove()
|
||||||
@ -7,7 +7,7 @@ from fastapi import FastAPI

 from pyinfra.storage.connection import get_storage_for_tenant
 from pyinfra.storage.utils import (
-    download_data_as_specified_in_message,
+    download_data_bytes_as_specified_in_message,
     upload_data_as_specified_in_message,
 )
 from pyinfra.utils.cipher import encrypt
@ -139,16 +139,6 @@ def payload(payload_type):
     }


-@pytest.fixture
-def expected_data(payload_type):
-    if payload_type == "target_response_file_path":
-        return {"data": "success"}
-    elif payload_type == "dossier_id_file_id":
-        return {"dossierId": "test", "fileId": "file", "data": "success"}
-    elif payload_type == "target_file_dict":
-        return {"file_1": {"data": "success"}, "file_2": {"data": "success"}}
-
-
 @pytest.mark.parametrize(
     "payload_type",
     [
@ -160,17 +150,17 @@ def expected_data(payload_type):
 )
 @pytest.mark.parametrize("storage_backend", ["azure", "s3"], scope="class")
 class TestDownloadAndUploadFromMessage:
-    def test_download_and_upload_from_message(self, storage, payload, expected_data, payload_type):
+    def test_download_and_upload_from_message(self, storage, payload, payload_type):
         storage.clear_bucket()

-        upload_data = expected_data if payload_type != "target_file_dict" else expected_data["file_1"]
-        storage.put_object("test/file.target.json.gz", gzip.compress(json.dumps(upload_data).encode()))
+        result = {"process_result": "success"}
+        storage_data = {**payload, "data": result}
+        packed_data = gzip.compress(json.dumps(storage_data).encode())

-        data = download_data_as_specified_in_message(storage, payload)
+        storage.put_object("test/file.target.json.gz", packed_data)

-        assert data == expected_data
-
-        upload_data_as_specified_in_message(storage, payload, expected_data)
+        _ = download_data_bytes_as_specified_in_message(storage, payload)
+        upload_data_as_specified_in_message(storage, payload, result)

         data = json.loads(gzip.decompress(storage.get_object("test/file.response.json.gz")).decode())

-        assert data == {**payload, "data": expected_data}
+        assert data == storage_data
@ -1,197 +0,0 @@
import gzip
import json
from pathlib import Path


import pytest
from deepdiff import DeepDiff

from pyinfra.storage.proto_data_loader import ProtoDataLoader

enum = 1


@pytest.fixture
def test_data_dir():
    return Path(__file__).parents[1] / "data"


@pytest.fixture
def document_data(request, test_data_dir) -> (str, bytes, dict | list):
    doc_type = request.param

    # Search for relevant doc_type file pairs - there should be one proto and one json file per document type
    input_file_path = next(test_data_dir.glob(f"*.{doc_type}.proto.gz"), None)
    target_file_path = next(test_data_dir.glob(f"*.{doc_type}.json.gz"), None)

    input_data = input_file_path.read_bytes()
    target_data = json.loads(gzip.decompress(target_file_path.read_bytes()))

    return input_file_path, input_data, target_data


@pytest.fixture
def proto_data_loader():
    return ProtoDataLoader()


@pytest.fixture
def should_match():
    return [
        "a.DOCUMENT_STRUCTURE.proto.gz",
        "a.DOCUMENT_TEXT.proto.gz",
        "a.DOCUMENT_PAGES.proto.gz",
        "a.DOCUMENT_POSITION.proto.gz",
        "b.DOCUMENT_STRUCTURE.proto",
        "b.DOCUMENT_TEXT.proto",
        "b.DOCUMENT_PAGES.proto",
        "b.DOCUMENT_POSITION.proto",
        "c.STRUCTURE.proto.gz",
        "c.TEXT.proto.gz",
        "c.PAGES.proto.gz",
        "c.POSITION.proto.gz",
    ]


@pytest.mark.xfail(
    reason="FIXME: The test is not stable, but has to work before we can deploy the code! Right now, we don't have parity between the proto and the json data."
)
# As DOCUMENT_POSITION is a very large file, the test takes forever. If you want to test it, add "DOCUMENT_POSITION" to the list below. - Added per default
@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE", "DOCUMENT_TEXT", "DOCUMENT_PAGES", "DOCUMENT_POSITION"], indirect=True)
def test_proto_data_loader_end2end(document_data, proto_data_loader):
    file_path, data, target = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    loaded_data_str = json.dumps(loaded_data, sort_keys=True)
    target_str = json.dumps(target, sort_keys=True)

    # If you want to look at the files in more detail uncomment code below
    # global enum
    # with open(f"input-{enum}.json", "w") as f:
    #     json.dump(target, f, sort_keys=True, indent=4)

    # with open(f"output-{enum}.json", "w") as f:
    #     json.dump(loaded_data, f, sort_keys=True, indent=4)
    # enum += 1

    diff = DeepDiff(loaded_data_str, target_str, ignore_order=True)

    # FIXME: remove this block when the test is stable
    # if diff:
    #     with open(f"diff_test.json", "w") as f:
    #         f.write(diff.to_json(indent=4))

    assert not diff


def test_proto_data_loader_unknown_document_type(proto_data_loader):
    assert not proto_data_loader("unknown_document_type.proto", b"")


def test_proto_data_loader_file_name_matching(proto_data_loader, should_match):
    for file_name in should_match:
        assert proto_data_loader._match(file_name) is not None


@pytest.mark.parametrize("document_data", ["DOCUMENT_PAGES"], indirect=True)
def test_document_page_types(document_data, proto_data_loader):
    # types from document reader
    # number: int
    # height: int
    # width: int
    # rotation: int

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    # since all values need to be int anyway we can summarize it
    assert all(all(isinstance(value, int) for value in entry.values()) for entry in loaded_data)


@pytest.mark.parametrize("document_data", ["DOCUMENT_POSITION"], indirect=True)
def test_document_position_data_types(document_data, proto_data_loader):
    # types from document reader
    # id: int
    # stringIdxToPositionIdx: list[int]
    # positions: list[list[float]]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["stringIdxToPositionIdx"], list)
        assert isinstance(entry["positions"], list)
        assert all(isinstance(position, list) for position in entry["positions"])
        assert all(all(isinstance(coordinate, float) for coordinate in position) for position in entry["positions"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_STRUCTURE"], indirect=True)
def test_document_structure_types(document_data, proto_data_loader):
    # types from document reader for DocumentStructure
    # root: dict

    # types from document reader for EntryData
    # type: str
    # tree_id: list[int]
    # atomic_block_ids: list[int]
    # page_numbers: list[int]
    # properties: dict[str, str]
    # children: list[dict]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, dict)
    assert isinstance(loaded_data["root"], dict)
    assert isinstance(loaded_data["root"]["type"], str)
    assert isinstance(loaded_data["root"]["treeId"], list)
    assert isinstance(loaded_data["root"]["atomicBlockIds"], list)
    assert isinstance(loaded_data["root"]["pageNumbers"], list)
    assert isinstance(loaded_data["root"]["children"], list)

    assert all(isinstance(value, int) for value in loaded_data["root"]["treeId"])
    assert all(isinstance(value, int) for value in loaded_data["root"]["atomicBlockIds"])
    assert all(isinstance(value, int) for value in loaded_data["root"]["pageNumbers"])
    assert all(isinstance(value, dict) for value in loaded_data["root"]["properties"].values())
    assert all(
        all(isinstance(value, dict) for value in entry.values()) for entry in loaded_data["root"]["properties"].values()
    )
    assert all(isinstance(value, dict) for value in loaded_data["root"]["children"])


@pytest.mark.parametrize("document_data", ["DOCUMENT_TEXT"], indirect=True)
def test_document_text_data_types(document_data, proto_data_loader):
    # types from document reader
    # id: int
    # page: int
    # search_text: str
    # number_on_page: int
    # start: int
    # end: int
    # lineBreaks: list[int]

    file_path, data, _ = document_data
    data = gzip.decompress(data)
    loaded_data = proto_data_loader(file_path, data)

    assert isinstance(loaded_data, list)
    assert all(isinstance(entry, dict) for entry in loaded_data)

    for entry in loaded_data:
        assert isinstance(entry["id"], int)
        assert isinstance(entry["page"], int)
        assert isinstance(entry["searchText"], str)
        assert isinstance(entry["numberOnPage"], int)
        assert isinstance(entry["start"], int)
        assert isinstance(entry["end"], int)
        assert all(isinstance(value, int) for value in entry["lineBreaks"])
83 tests/unit_test/utils_download_test.py Normal file
@ -0,0 +1,83 @@
import json

import pytest
from unittest.mock import patch
from pyinfra.storage.utils import (
    download_data_bytes_as_specified_in_message,
    upload_data_as_specified_in_message,
    DownloadedData,
)
from pyinfra.storage.storages.storage import Storage


@pytest.fixture
def mock_storage():
    with patch("pyinfra.storage.utils.Storage") as MockStorage:
        yield MockStorage()


@pytest.fixture(
    params=[
        {
            "raw_payload": {
                "tenantId": "tenant1",
                "dossierId": "dossier1",
                "fileId": "file1",
                "targetFileExtension": "txt",
                "responseFileExtension": "json",
            },
            "expected_result": {
                "data": b'{"key": "value"}',
                "file_path": "tenant1/dossier1/file1.txt"
            }
        },
        {
            "raw_payload": {
                "targetFilePath": "some/path/to/file.txt.gz",
                "responseFilePath": "some/path/to/file.json"
            },
            "expected_result": {
                "data": b'{"key": "value"}',
                "file_path": "some/path/to/file.txt.gz"
            }
        },
        {
            "raw_payload": {
                "targetFilePath": {
                    "file1": "some/path/to/file1.txt.gz",
                    "file2": "some/path/to/file2.txt.gz"
                },
                "responseFilePath": "some/path/to/file.json"
            },
            "expected_result": {
                "file1": {
                    "data": b'{"key": "value"}',
                    "file_path": "some/path/to/file1.txt.gz"
                },
                "file2": {
                    "data": b'{"key": "value"}',
                    "file_path": "some/path/to/file2.txt.gz"
                }
            }
        },
    ]
)
def payload_and_expected_result(request):
    return request.param


def test_download_data_bytes_as_specified_in_message(mock_storage, payload_and_expected_result):
    raw_payload = payload_and_expected_result["raw_payload"]
    expected_result = payload_and_expected_result["expected_result"]
    mock_storage.get_object.return_value = b'{"key": "value"}'

    result = download_data_bytes_as_specified_in_message(mock_storage, raw_payload)

    assert isinstance(result, dict)
    assert result == expected_result
    mock_storage.get_object.assert_called()


def test_upload_data_as_specified_in_message(mock_storage, payload_and_expected_result):
    raw_payload = payload_and_expected_result["raw_payload"]
    data = {"key": "value"}
    upload_data_as_specified_in_message(mock_storage, raw_payload, data)
    mock_storage.put_object.assert_called_once()
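The parametrised fixture above encodes the contract these tests exercise: a payload either carries tenantId/dossierId/fileId plus file extensions, from which the object key is derived, or explicit targetFilePath entries (a single path or a mapping of names to paths), and each downloaded entry pairs the raw bytes with the resolved path. The sketch below restates that path-resolution rule; it is inferred from the fixture expectations only and is not the implementation in pyinfra.storage.utils.

def resolve_target_paths(raw_payload: dict) -> dict[str, str]:
    # Inferred from the fixtures: prefer an explicit targetFilePath (string or
    # name -> path mapping); otherwise build the key from the ids and extension.
    if "targetFilePath" in raw_payload:
        target = raw_payload["targetFilePath"]
        return target if isinstance(target, dict) else {"target": target}
    key = f'{raw_payload["tenantId"]}/{raw_payload["dossierId"]}/{raw_payload["fileId"]}.{raw_payload["targetFileExtension"]}'
    return {"target": key}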