Merge branch 'RES-534-update-pyinfra' into 'master'

feat(opentel,dynaconf): adapt new pyinfra

Closes RES-534

See merge request redactmanager/image-classification-service!8
Julius Unverfehrt 2024-02-09 09:59:11 +01:00
commit a024ddfcf7
95 changed files with 1912 additions and 1652 deletions

.gitlab-ci.yml

@@ -7,3 +7,25 @@ variables:
   NEXUS_PROJECT_DIR: red
   IMAGENAME: "${CI_PROJECT_NAME}"
   INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
+
+#################################
+# temp. disable integration tests, b/c they don't cover the CV analysis case yet
+trigger integration tests:
+  rules:
+    - when: never
+
+release build:
+  stage: release
+  needs:
+    - job: set custom version
+      artifacts: true
+      optional: true
+    - job: calculate patch version
+      artifacts: true
+      optional: true
+    - job: calculate minor version
+      artifacts: true
+      optional: true
+    - job: build docker nexus
+      artifacts: true
+#################################

Dockerfile

@@ -20,9 +20,9 @@ ENV PATH="$POETRY_HOME/bin:$PATH"
 RUN curl -sSL https://install.python-poetry.org | python3 -
 COPY ./data ./data
-COPY ./scripts ./scripts
-COPY ./image_prediction ./image_prediction
-COPY pyproject.toml poetry.lock banner.txt config.yaml ./src ./
+COPY ./config ./config
+COPY ./src ./src
+COPY pyproject.toml poetry.lock banner.txt ./
 RUN poetry config virtualenvs.create false && \
     poetry config installer.max-workers 10 && \
@@ -30,9 +30,10 @@ RUN poetry config virtualenvs.create false && \
     poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
     poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
     poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
-    poetry install --without=dev -vv --no-interaction --no-root
+    poetry install --without=dev -vv --no-interaction
 EXPOSE 5000
 EXPOSE 8080
-CMD [ "python", "serve.py"]
+CMD [ "python", "src/serve.py"]

Dockerfile (test image)

@@ -20,9 +20,10 @@ ENV PATH="$POETRY_HOME/bin:$PATH"
 RUN curl -sSL https://install.python-poetry.org | python3 -
 COPY ./data ./data
-COPY ./image_prediction ./image_prediction
 COPY ./test ./test
-COPY pyproject.toml poetry.lock banner.txt config.yaml ./src ./
+COPY ./config ./config
+COPY ./src ./src
+COPY pyproject.toml poetry.lock banner.txt config.yaml ./
 RUN poetry config virtualenvs.create false && \
     poetry config installer.max-workers 10 && \

config.yaml (deleted)

@@ -1,24 +0,0 @@
webserver:
  host: $SERVER_HOST|"127.0.0.1" # webserver address
  port: $SERVER_PORT|5000 # webserver port

service:
  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for the service logger
  verbose: $VERBOSE|False # Whether the service prints document processing progress to stdout
  batch_size: $BATCH_SIZE|16 # Number of images held in memory simultaneously
  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from

# These variables control filters that are applied to images, image metadata, or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience, the response to a request
# contains a "filters.allPassed" field, which is set to false if any value returned by the filters fails to
# meet its required value.
filters:
  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
  image_width_to_height_quotient: # Image width to height ratio
    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
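
For context, this deleted file used EnvYAML's `$VAR|default` substitution: every value can be overridden by an environment variable and otherwise falls back to the default after the pipe. A minimal sketch of that mechanism (standard envyaml API; the file path assumes the deleted config.yaml above):

import os
from envyaml import EnvYAML

# With SERVER_PORT set, "$SERVER_PORT|5000" resolves to the env value;
# unset variables fall back to the default after the "|".
os.environ["SERVER_PORT"] = "8080"
config = EnvYAML("config.yaml")
print(config["webserver.port"])      # 8080 (EnvYAML supports dot-path lookup)
print(config["service.batch_size"])  # 16, from the $BATCH_SIZE|16 default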

config/pyinfra.toml (new file)

@@ -0,0 +1,44 @@
[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"
[tracing.opentelemetry]
enabled = true
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"
[webserver]
host = "0.0.0.0"
port = 8080
[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Must be a divisor of heartbeat and should not be too large, since queue interactions
# (such as receiving new messages) only happen at this interval.
# This is also the minimum time the service needs to process a message.
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
[storage]
backend = "s3"
[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"
[storage.azure]
container = "redaction"
connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

config/settings.toml (new file)

@@ -0,0 +1,28 @@
[logging]
level = "INFO"
[service]
# Print document processing progress to stdout
verbose = false
batch_size = 16
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
# These variables control filters that are applied to images, image metadata, or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience, the response to a request
# contains a "filters.allPassed" field, which is set to false if any value returned by the filters fails to
# meet its required value.
[filters]
# Minimum permissible prediction confidence
min_confidence = 0.5
# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75
# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10
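
To make the filter semantics concrete, a small illustrative sketch (names are hypothetical, not the service's code) of how the configured bounds and the "filters.allPassed" aggregate described above combine:

import math

# Illustrative re-statement of the configured bounds; the real service
# reports each filter result plus an aggregate "filters.allPassed" field.
FILTERS = {
    "image_to_page_quotient": (0.05, 0.75),         # ratio of geometric means of areas
    "image_width_to_height_quotient": (0.1, 10.0),  # aspect ratio
}

def image_to_page_quotient(img_w, img_h, page_w, page_h):
    # Geometric mean of the image area over that of the page area.
    return math.sqrt(img_w * img_h) / math.sqrt(page_w * page_h)

def all_passed(values: dict) -> bool:
    return all(lo <= values[name] <= hi for name, (lo, hi) in FILTERS.items())

values = {
    "image_to_page_quotient": image_to_page_quotient(200, 100, 595, 842),
    "image_width_to_height_quotient": 200 / 100,
}
print(all_passed(values))  # True: both quotients fall inside their bounds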

image_prediction/config.py (deleted)

@@ -1,46 +0,0 @@
"""Implements a config object with dot-indexing syntax."""
from envyaml import EnvYAML

from image_prediction.locations import CONFIG_FILE


def _get_item_and_maybe_make_dotindexable(container, item):
    ret = container[item]
    return DotIndexable(ret) if isinstance(ret, dict) else ret


class DotIndexable:
    def __init__(self, x):
        self.x = x

    def get(self, item, default=None):
        try:
            return _get_item_and_maybe_make_dotindexable(self.x, item)
        except KeyError:
            return default

    def __getattr__(self, item):
        return _get_item_and_maybe_make_dotindexable(self.x, item)

    def __repr__(self):
        return self.x.__repr__()

    def __getitem__(self, item):
        return self.__getattr__(item)


class Config:
    def __init__(self, config_path):
        self.__config = EnvYAML(config_path)

    def __getattr__(self, item):
        if item in self.__config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)

    def __getitem__(self, item):
        return self.__getattr__(item)


CONFIG = Config(CONFIG_FILE)

image_prediction/locations.py (deleted)

@@ -1,16 +0,0 @@
"""Defines constant paths relative to the module root path."""
from pathlib import Path

MODULE_DIR = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]

CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"

DATA_DIR = PACKAGE_ROOT_DIR / "data"
MLRUNS_DIR = str(DATA_DIR / "mlruns")

TEST_DIR = PACKAGE_ROOT_DIR / "test"
TEST_DATA_DIR = TEST_DIR / "data"
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"

image_prediction logging module (deleted)

@@ -1,27 +0,0 @@
import logging

from image_prediction.config import CONFIG


def make_logger_getter():
    logger = logging.getLogger("imclf")
    logger.propagate = False

    handler = logging.StreamHandler()
    handler.setLevel(CONFIG.service.logging_level)

    log_format = "%(asctime)s %(levelname)-8s %(message)s"
    formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
    handler.setFormatter(formatter)

    logger.addHandler(handler)
    logger.setLevel(CONFIG.service.logging_level)

    def get_logger():
        return logger

    return get_logger


get_logger = make_logger_getter()

poetry.lock (generated; diff suppressed because it is too large)

pyproject.toml

@@ -1,14 +1,14 @@
 [tool.poetry]
 name = "image-classification-service"
-version = "1.34.0"
+version = "2.0.0"
 description = ""
 authors = ["Team Research <research@knecon.com>"]
 readme = "README.md"
-packages = [{ include = "image_prediction" }]
+packages = [{ include = "image_prediction", from = "src" }]

 [tool.poetry.dependencies]
 python = ">=3.10,<3.11"
-pyinfra = { version = "1.10.0", source = "gitlab-research" }
+pyinfra = { version = "2.0.0", source = "gitlab-research" }
 kn-utils = { version = "0.2.7", source = "gitlab-research" }
 dvc = "^2.34.0"
 dvc-ssh = "^2.20.0"
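
Together with the Dockerfile hunks above, the `from = "src"` packaging change implies a move to a src layout, roughly (reconstructed for orientation, not shown verbatim in the diff):

.
├── banner.txt
├── config/
│   ├── pyinfra.toml
│   └── settings.toml
├── data/
├── pyproject.toml
├── poetry.lock
├── src/
│   ├── image_prediction/
│   └── serve.py
└── test/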

src/image_prediction/config.py (new file)

@@ -0,0 +1,7 @@
from pathlib import Path

from pyinfra.config.loader import load_settings

from image_prediction.locations import PROJECT_ROOT_DIR

CONFIG = load_settings(root_path=PROJECT_ROOT_DIR, settings_path="config")
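
A usage sketch (the attribute paths mirror the ones serve.py uses below; dot access into the TOML tables is assumed from that usage):

from image_prediction.config import CONFIG

print(CONFIG.logging.level)           # "INFO", from config/settings.toml
print(CONFIG.service.batch_size)      # 16
print(CONFIG.filters.min_confidence)  # 0.5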

image metadata extraction module

@@ -92,12 +92,12 @@ def get_images_on_page(doc, metadata):
 def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
-    return compose(
-        list,
-        partial(add_alpha_channel_info, doc),
-        filter_valid_metadata,
-        get_metadata_for_images_on_page,
-    )(page)
+    metadata = get_metadata_for_images_on_page(page)
+    metadata = filter_valid_metadata(metadata)
+    metadata = add_alpha_channel_info(doc, metadata)
+
+    return list(metadata)

 def get_metadata_for_images_on_page(page: fitz.Page):

@@ -207,7 +207,11 @@ def add_alpha_channel_info(doc, metadata):
 @lru_cache(maxsize=None)
 def load_image_handle_from_xref(doc, xref):
-    return doc.extract_image(xref)
+    try:
+        return doc.extract_image(xref)
+    except ValueError:
+        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
+        return

 rounder = rcompose(round, int)
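
Since `load_image_handle_from_xref` now returns `None` for invalid xrefs, callers must be prepared to skip missing handles. A hypothetical caller-side sketch (the helper name is illustrative; only `Document.extract_image` is PyMuPDF API):

import fitz  # PyMuPDF

def iter_image_handles(doc: fitz.Document, xrefs):
    """Yield image handles, silently skipping xrefs PyMuPDF rejects,
    i.e. the same ValueError condition the patched loader logs and swallows."""
    for xref in xrefs:
        try:
            handle = doc.extract_image(xref)
        except ValueError:
            continue  # invalid xref: nothing to extract
        if handle:
            yield handle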

src/image_prediction/locations.py (new file)

@@ -0,0 +1,18 @@
"""Defines constant paths relative to the module root path."""
from pathlib import Path

# FIXME: move these paths to config, only depending on 'ROOT_PATH' environment variable.
MODULE_DIR = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
PROJECT_ROOT_DIR = PACKAGE_ROOT_DIR.parents[0]

CONFIG_FILE = PROJECT_ROOT_DIR / "config" / "settings.toml"
BANNER_FILE = PROJECT_ROOT_DIR / "banner.txt"

DATA_DIR = PROJECT_ROOT_DIR / "data"
MLRUNS_DIR = str(DATA_DIR / "mlruns")

TEST_DIR = PROJECT_ROOT_DIR / "test"
TEST_DATA_DIR = TEST_DIR / "data"
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"

image_prediction logging shim (new file)

@@ -0,0 +1,4 @@
import kn_utils

# TODO: remove this module and use the `get_logger` function from the `kn_utils` package.
get_logger = kn_utils.get_logger

src/serve.py

@@ -1,17 +1,16 @@
-from image_prediction import logger
-from image_prediction.config import Config
-from image_prediction.locations import CONFIG_FILE
+from sys import stdout
+
+from kn_utils.logging import logger
+from pyinfra.examples import start_standard_queue_consumer
+from pyinfra.queue.callback import make_download_process_upload_callback
+
+from image_prediction.config import CONFIG
 from image_prediction.pipeline import load_pipeline
 from image_prediction.utils.banner import load_banner
 from image_prediction.utils.process_wrapping import wrap_in_process
-from pyinfra import config
-from pyinfra.payload_processing.processor import make_payload_processor
-from pyinfra.queue.queue_manager import QueueManager
-
-PYINFRA_CONFIG = config.get_config()
-IMAGE_CONFIG = Config(CONFIG_FILE)
-logger.setLevel(PYINFRA_CONFIG.logging_level_root)
+
+logger.remove()
+logger.add(sink=stdout, level=CONFIG.logging.level)

 # A component of the processing pipeline (probably tensorflow) does not release allocated memory (see RED-4206).

@@ -19,18 +18,16 @@ logger.setLevel(PYINFRA_CONFIG.logging_level_root)
 # Workaround: Manage Memory with the operating system, by wrapping the processing in a sub-process.
 # FIXME: Find more fine-grained solution or if the problem occurs persistently for python services,

 @wrap_in_process
-def process_data(data: bytes) -> list:
-    pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size)
+def process_data(data: bytes, _message: dict) -> list:
+    pipeline = load_pipeline(verbose=CONFIG.service.verbose, batch_size=CONFIG.service.batch_size)
     return list(pipeline(data))

 def main():
     logger.info(load_banner())
-    process_payload = make_payload_processor(process_data, config=PYINFRA_CONFIG)
-    queue_manager = QueueManager(PYINFRA_CONFIG)
-    queue_manager.start_consuming(process_payload)
+    callback = make_download_process_upload_callback(process_data, CONFIG)
+    start_standard_queue_consumer(callback, CONFIG)

 if __name__ == "__main__":
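
`wrap_in_process` itself is not part of this diff; a minimal sketch of the subprocess-isolation workaround the RED-4206 comment describes (assuming the fork start method on Linux and a picklable return value):

import multiprocessing as mp
from functools import wraps

def wrap_in_process(fn):
    """Run fn in a fresh child process so that all memory it allocates
    (e.g. by TensorFlow) is returned to the OS when the process exits."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        queue = mp.Queue()
        proc = mp.Process(target=lambda: queue.put(fn(*args, **kwargs)))
        proc.start()
        result = queue.get()  # read before join: large payloads can deadlock otherwise
        proc.join()
        return result
    return wrapper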

conftest.py

@@ -1,10 +1,3 @@
-import logging
-
-import pytest
-
-from image_prediction.utils import get_logger
-
 pytest_plugins = [
     "test.fixtures.extractor",
     "test.fixtures.image",

@@ -17,14 +10,5 @@ pytest_plugins = [
     "test.fixtures.parameters",
     "test.fixtures.pdf",
     "test.fixtures.target",
-    "test.unit_tests.image_stitching_test"
+    "test.unit_tests.image_stitching_test",
 ]
-
-
-@pytest.fixture(autouse=True)
-def mute_logger():
-    logger = get_logger()
-    level = logger.level
-    logger.setLevel(logging.CRITICAL + 1)
-    yield
-    logger.setLevel(level)

test for image_prediction.config (deleted)

@@ -1,48 +0,0 @@
import tempfile

import pytest
import yaml

from image_prediction.config import Config


@pytest.fixture
def config_file_content():
    return {"A": [{"B": [1, 2]}, {"C": 3}, 4], "D": {"E": {"F": True}}}


@pytest.fixture
def config(config_file_content):
    with tempfile.NamedTemporaryFile(suffix=".yaml", mode="w") as f:
        yaml.dump(config_file_content, f, default_flow_style=False)
        yield Config(f.name)


def test_dot_access_key_exists(config):
    assert config.A == [{"B": [1, 2]}, {"C": 3}, 4]
    assert config.D.E["F"]


def test_access_key_exists(config):
    assert config["A"] == [{"B": [1, 2]}, {"C": 3}, 4]
    assert config["A"][0] == {"B": [1, 2]}
    assert config["A"][0]["B"] == [1, 2]
    assert config["A"][0]["B"][0] == 1


def test_dot_access_key_does_not_exists(config):
    assert config.B is None


def test_access_key_does_not_exists(config):
    assert config["B"] is None


def test_get_method_returns_key_if_key_does_exist(config):
    dot_indexable = config.D.E
    assert dot_indexable.get("F", "default_value") is True


def test_get_method_returns_default_if_key_does_not_exist(config):
    dot_indexable = config.D.E
    assert dot_indexable.get("X", "default_value") == "default_value"

test for the Flask prediction server (deleted)

@@ -1,48 +0,0 @@
import json

import pytest

from image_prediction.exceptions import IntentionalTestException
from image_prediction.flask import make_prediction_server


def predict_fn(x: bytes):
    x = int(x.decode())
    if x == 42:
        return True
    else:
        raise IntentionalTestException("This is intended.")


@pytest.fixture
def server():
    server = make_prediction_server(predict_fn)
    server.config.update({"TESTING": True})
    return server


@pytest.fixture
def client(server):
    return server.test_client()


def test_server_predict_success(client, mute_logger):
    response = client.post("/predict", data="42")
    assert json.loads(response.data)


def test_server_predict_failure(client, mute_logger):
    response = client.post("/predict", data="13")
    assert response.status_code == 500


def test_server_health_check(client):
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json == "OK"


def test_server_ready_check(client):
    response = client.get("/ready")
    assert response.status_code == 200
    assert response.json == "OK"