feat: RED-10765: ignore perceptual hash for image deduplication and prefer to keep the ones with allPassed set to True

Merge branch 'feat/RED-10765/filter-duplicate-images' into 'master'
feat: RED-10765: filter out classifications for 'duplicate' images present in the document Closes RED-10765 See merge request redactmanager/image-classification-service!23
2025-01-31 12:59:59 +01:00 · 2025-01-30 13:20:19 +01:00 · 2025-01-30 12:42:41 +01:00 · 2025-01-16 09:29:11 +01:00 · 2025-01-15 13:39:16 +01:00 · 2024-12-18 12:39:44 +01:00
107 changed files with 38549 additions and 2778 deletions
--- a/.dvc/config
+++ b/.dvc/config
@ -5,4 +5,4 @@
    url = ssh://vector.iqser.com/research/image-prediction/
    port = 22
 ['remote "azure_remote"']
-    url = azure://ic-sa-dvc/
+    url = azure://image-classification-dvc/
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -1,4 +1,51 @@
 include:
  - project: "Gitlab/gitlab"
-    ref: 0.2.3
-    file: "/ci-templates/research/red-dvc_versioning_build_gitlab-ci.yml"
+    ref: main
+    file: "/ci-templates/research/dvc.gitlab-ci.yml"
+  - project: "Gitlab/gitlab"
+    ref: main
+    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
+
+variables:
+  NEXUS_PROJECT_DIR: red
+  IMAGENAME: "${CI_PROJECT_NAME}"
+  INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
+  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
+  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
+  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
+  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.
+
+stages:
+  - data
+  - setup
+  - tests
+  - sonarqube
+  - versioning
+  - build
+  - integration-tests
+  - release
+
+docker-build:
+  extends: .docker-build
+  needs:
+    - job: dvc-pull
+      artifacts: true
+    - !reference [.needs-versioning, needs] # leave this line as is
+  
+###################
+# INTEGRATION TESTS
+trigger-integration-tests:
+  extends: .integration-tests
+  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
+  # needs:
+  #   - job: docker-build::model_name
+  #     artifacts: true
+  rules:
+    - when: never
+
+#########
+# RELEASE
+release:
+  extends: .release
+  needs:
+    - !reference [.needs-versioning, needs] # leave this line as is
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
+3.10
--- a/59
+++ b/59
@ -1,11 +1,17 @@
-FROM python:3.8
+FROM python:3.10-slim AS builder
+
+ARG GITLAB_USER
+ARG GITLAB_ACCESS_TOKEN

-ARG USERNAME
-ARG TOKEN
 ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
 ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
+
 ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
 ARG POETRY_SOURCE_REF_RED=gitlab-red
+
+ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
+ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
+
 ARG VERSION=dev

 LABEL maintainer="Research <research@knecon.com>"
@ -13,26 +19,55 @@ LABEL version="${VERSION}"

 WORKDIR /app

+###########
+# ENV SETUP
+ENV PYTHONDONTWRITEBYTECODE=true
 ENV PYTHONUNBUFFERED=true
 ENV POETRY_HOME=/opt/poetry
 ENV PATH="$POETRY_HOME/bin:$PATH"

+RUN apt-get update && \
+    apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 RUN curl -sSL https://install.python-poetry.org | python3 -
+RUN poetry --version

-COPY ./data ./data
-COPY ./scripts ./scripts
-COPY ./image_prediction ./image_prediction
-COPY pyproject.toml poetry.lock banner.txt config.yaml ./src ./
+COPY pyproject.toml poetry.lock ./

-RUN poetry config virtualenvs.create false && \
+RUN poetry config virtualenvs.create true && \
+    poetry config virtualenvs.in-project true && \
    poetry config installer.max-workers 10 && \
    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
-    poetry install --without=test -vv --no-interaction --no-root
+    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry install --without=dev -vv --no-interaction --no-root
+
+###############
+# WORKING IMAGE
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# COPY SOURCE CODE FROM BUILDER IMAGE
+COPY --from=builder /app /app
+# COPY BILL OF MATERIALS (BOM)
+COPY bom.json /bom.json
+
+ENV PATH="/app/.venv/bin:$PATH"
+
+###################
+# COPY SOURCE CODE
+COPY ./src ./src
+COPY ./config ./config
+COPY ./data ./data
+COPY banner.txt ./

 EXPOSE 5000
 EXPOSE 8080

-CMD [ "python", "serve.py"]
+CMD [ "python", "src/serve.py"]
--- a/9
+++ b/9
@ -1,4 +1,4 @@
-FROM python:3.8
+FROM python:3.10

 ARG USERNAME
 ARG TOKEN
@ -20,9 +20,10 @@ ENV PATH="$POETRY_HOME/bin:$PATH"
 RUN curl -sSL https://install.python-poetry.org | python3 -

 COPY ./data ./data
-COPY ./image_prediction ./image_prediction
 COPY ./test ./test
-COPY pyproject.toml poetry.lock banner.txt config.yaml ./src ./
+COPY ./config ./config
+COPY ./src ./src
+COPY pyproject.toml poetry.lock banner.txt config.yaml ./

 RUN poetry config virtualenvs.create false && \
    poetry config installer.max-workers 10 && \
@ -30,7 +31,7 @@ RUN poetry config virtualenvs.create false && \
    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
-    poetry install --without=test -vv --no-interaction --no-root
+    poetry install --without=dev -vv --no-interaction --no-root

 EXPOSE 5000
 EXPOSE 8080
--- a/bom.json
+++ b/bom.json
--- a/config.yaml
+++ b/config.yaml
@ -1,24 +0,0 @@
-webserver:
-  host: $SERVER_HOST|"127.0.0.1" # webserver address
-  port: $SERVER_PORT|5000 # webserver port
-
-service:
-  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
-  verbose: $VERBOSE|False # Service DOES NOT prints document processing progress to stdout
-  batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
-  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
-
-# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
-# The filter result values are reported in the service responses. For convenience the response to a request contains a
-# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
-# specified required value.
-filters:
-  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
-    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
-    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
-
-  image_width_to_height_quotient: # Image width to height ratio
-    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
-    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
-
-  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
--- a/config/pyinfra.toml
+++ b/config/pyinfra.toml
@ -0,0 +1,68 @@
+
+[asyncio]
+max_concurrent_tasks = 10
+
+[dynamic_tenant_queues]
+enabled = true
+
+[metrics.prometheus]
+enabled = true
+prefix = "redactmanager_image_service"
+
+[tracing]
+enabled = true
+# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
+type = "azure_monitor" 
+
+[tracing.opentelemetry]
+endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
+service_name = "redactmanager_image_service"
+exporter = "otlp"
+
+[webserver]
+host = "0.0.0.0"
+port = 8080
+
+[rabbitmq]
+host = "localhost"
+port = 5672
+username = ""
+password = ""
+heartbeat = 60
+# Has to be a divisor of heartbeat, and shouldn't be too big, since queue interactions (like receiving new messages) only happen at these intervals
+# This is also the minimum time the service needs to process a message
+connection_sleep = 5
+input_queue = "request_queue"
+output_queue = "response_queue"
+dead_letter_queue = "dead_letter_queue"
+
+tenant_event_queue_suffix = "_tenant_event_queue"
+tenant_event_dlq_suffix = "_tenant_events_dlq"
+tenant_exchange_name = "tenants-exchange"
+queue_expiration_time = 300000  # 5 minutes in milliseconds
+
+service_request_queue_prefix = "image_request_queue"
+service_request_exchange_name = "image_request_exchange"
+service_response_exchange_name = "image_response_exchange"
+service_dlq_name = "image_dlq"
+
+[storage]
+backend = "s3"
+
+[storage.s3]
+bucket = "redaction"
+endpoint = "http://127.0.0.1:9000"
+key = ""
+secret = ""
+region = "eu-central-1"
+
+[storage.azure]
+container = "redaction"
+connection_string = ""
+
+[storage.tenant_server]
+public_key = ""
+endpoint =  "http://tenant-user-management:8081/internal-api/tenants"
+
+[kubernetes]
+pod_name = "test_pod"
--- a/config/settings.toml
+++ b/config/settings.toml
@ -0,0 +1,42 @@
+[logging]
+level = "INFO"
+
+[service]
+# Print document processing progress to stdout
+verbose = false
+batch_size = 6
+image_stiching_tolerance = 1  # in pixels
+mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
+
+# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
+# The filter result values are reported in the service responses. For convenience the response to a request contains a
+# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
+# specified required value.
+[filters.confidence]
+# Minimum permissible prediction confidence
+min = 0.5
+
+# Image size to page size ratio (ratio of geometric means of areas)
+[filters.image_to_page_quotient]
+min = 0.05
+max = 0.75
+
+[filters.is_scanned_page]
+# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
+# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
+# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
+tolerance = 0
+
+# Image width to height ratio
+[filters.image_width_to_height_quotient]
+min = 0.1
+max = 10
+
+# put class specific filters here ['signature', 'formula', 'logo']
+[filters.overrides.signature.image_to_page_quotient]
+max = 0.4
+
+[filters.overrides.logo.image_to_page_quotient]
+min = 0.06
+
+
--- a/image_prediction/config.py
+++ b/image_prediction/config.py
@ -1,46 +0,0 @@
-"""Implements a config object with dot-indexing syntax."""
-
-
-from envyaml import EnvYAML
-
-from image_prediction.locations import CONFIG_FILE
-
-
-def _get_item_and_maybe_make_dotindexable(container, item):
-    ret = container[item]
-    return DotIndexable(ret) if isinstance(ret, dict) else ret
-
-
-class DotIndexable:
-    def __init__(self, x):
-        self.x = x
-
-    def get(self, item, default=None):
-        try:
-            return _get_item_and_maybe_make_dotindexable(self.x, item)
-        except KeyError:
-            return default
-
-    def __getattr__(self, item):
-        return _get_item_and_maybe_make_dotindexable(self.x, item)
-
-    def __repr__(self):
-        return self.x.__repr__()
-
-    def __getitem__(self, item):
-        return self.__getattr__(item)
-
-
-class Config:
-    def __init__(self, config_path):
-        self.__config = EnvYAML(config_path)
-
-    def __getattr__(self, item):
-        if item in self.__config:
-            return _get_item_and_maybe_make_dotindexable(self.__config, item)
-
-    def __getitem__(self, item):
-        return self.__getattr__(item)
-
-
-CONFIG = Config(CONFIG_FILE)
--- a/image_prediction/locations.py
+++ b/image_prediction/locations.py
@ -1,16 +0,0 @@
-"""Defines constant paths relative to the module root path."""
-
-from pathlib import Path
-
-MODULE_DIR = Path(__file__).resolve().parents[0]
-PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
-
-CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
-BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
-
-DATA_DIR = PACKAGE_ROOT_DIR / "data"
-MLRUNS_DIR = str(DATA_DIR / "mlruns")
-
-TEST_DIR = PACKAGE_ROOT_DIR / "test"
-TEST_DATA_DIR = TEST_DIR / "data"
-TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
--- a/image_prediction/utils/logger.py
+++ b/image_prediction/utils/logger.py
@ -1,27 +0,0 @@
-import logging
-
-from image_prediction.config import CONFIG
-
-
-def make_logger_getter():
-    logger = logging.getLogger("imclf")
-    logger.propagate = False
-
-    handler = logging.StreamHandler()
-    handler.setLevel(CONFIG.service.logging_level)
-
-    log_format = "%(asctime)s %(levelname)-8s %(message)s"
-    formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
-
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-
-    logger.setLevel(CONFIG.service.logging_level)
-
-    def get_logger():
-        return logger
-
-    return get_logger
-
-
-get_logger = make_logger_getter()
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,17 +1,20 @@
 [tool.poetry]
 name = "image-classification-service"
-version = "1.30.2"
+version = "2.17.0"
 description = ""
 authors = ["Team Research <research@knecon.com>"]
 readme = "README.md"
-packages = [{ include = "image_prediction" }]
+packages = [{ include = "image_prediction", from = "src" }]

 [tool.poetry.dependencies]
-python = "~3.8"
+python = ">=3.10,<3.11"
+# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
+#  see RED-9948.
+pyinfra = { version = "3.4.2", source = "gitlab-research" }
+kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
 dvc = "^2.34.0"
 dvc-ssh = "^2.20.0"
 dvc-azure = "^2.21.2"
-pyinfra = { version = "1.5.9", source = "gitlab-research" }
 Flask = "^2.1.1"
 requests = "^2.27.1"
 iteration-utilities = "^0.11.0"
@ -22,22 +25,25 @@ mlflow = "^1.24.0"
 numpy = "^1.22.3"
 tqdm = "^4.64.0"
 pandas = "^1.4.2"
-tensorflow = "^2.8.0"
+# FIXME: Our current model significantly changes the prediction behaviour when using newer tensorflow (/ protobuf)
+#  versions which is introduced by pyinfra updates using newer protobuf versions, see RED-9948.
+tensorflow = "2.9.0"
+protobuf = "^3.20"
 pytest = "^7.1.0"
-funcy = "^1.17"
+funcy = "^2"
 PyMuPDF = "^1.19.6"
 fpdf = "^1.7.2"
 coverage = "^6.3.2"
 Pillow = "^9.1.0"
 pdf2image = "^1.16.0"
 frozendict = "^2.3.0"
-protobuf = "^3.20.0"
 fsspec = "^2022.11.0"
 PyMonad = "^2.4.0"
 pdfnetpython3 = "9.4.2"
 loguru = "^0.7.0"
+cyclonedx-bom = "^4.5.0"

-[tool.poetry.group.test.dependencies]
+[tool.poetry.group.dev.dependencies]
 pytest = "^7.0.1"
 pymonad = "^2.4.0"
 pylint = "^2.17.4"
--- a/scripts/debug/debug.py
+++ b/scripts/debug/debug.py
@ -0,0 +1,46 @@
+"""Script to debug RED-9948. The predictions unexpectedly changed for some images, and we need to understand why."""
+
+import json
+import random
+from pathlib import Path
+
+import numpy as np
+import tensorflow as tf
+from kn_utils.logging import logger
+
+from image_prediction.config import CONFIG
+from image_prediction.pipeline import load_pipeline
+
+
+def process_pdf(pipeline, pdf_path, page_range=None):
+    with open(pdf_path, "rb") as f:
+        logger.info(f"Processing {pdf_path}")
+        predictions = list(pipeline(f.read(), page_range=page_range))
+
+    return predictions
+
+
+def ensure_seeds():
+    seed = 42
+    np.random.seed(seed)
+    random.seed(seed)
+    tf.random.set_seed(seed)
+
+
+def debug_info():
+    devices = tf.config.list_physical_devices()
+    print("Available devices:", devices)
+
+
+if __name__ == "__main__":
+    # For in container debugging, copy the file and adjust the path.
+    debug_file_path = Path(__file__).parents[2] / "test" / "data" / "RED-9948" / "SYNGENTA_EFSA_sanitisation_GFL_v2"
+    ensure_seeds()
+    debug_info()
+
+    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
+    predictions = process_pdf(pipeline, debug_file_path)
+    # This is the image that has the wrong prediction mentioned in RED-9948. The predictions should be inconclusive, and
+    # the flag all passed should be false.
+    predictions = [x for x in predictions if x["representation"] == "FA30F080F0C031CE17E8CF237"]
+    print(json.dumps(predictions, indent=2))
--- a/scripts/docker_build_run.sh
+++ b/scripts/docker_build_run.sh
@ -1,6 +1,6 @@
-docker build -t image-clsasification-service:$(poetry version -s)-dev \                                                               
+docker build --platform linux/amd64 -t image-clsasification-service:$(poetry version -s)-dev \
    -f Dockerfile \
-    --build-arg USERNAME=$GITLAB_USER \
-    --build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
+    --build-arg GITLAB_USER=$GITLAB_USER \
+    --build-arg GITLAB_ACCESS_TOKEN=$GITLAB_ACCESS_TOKEN \
    . && \
 docker run -it --rm image-clsasification-service:$(poetry version -s)-dev
--- a/scripts/run_pipeline.py
+++ b/scripts/run_pipeline.py
@ -3,12 +3,15 @@ import json
 import os
 from glob import glob

+from image_prediction.config import CONFIG
 from image_prediction.pipeline import load_pipeline
 from image_prediction.utils import get_logger
 from image_prediction.utils.pdf_annotation import annotate_pdf

 logger = get_logger()

+logger.setLevel("DEBUG")
+

 def parse_args():
    parser = argparse.ArgumentParser()
@ -35,7 +38,7 @@ def process_pdf(pipeline, pdf_path, page_range=None):


 def main(args):
-    pipeline = load_pipeline(verbose=True, tolerance=3)
+    pipeline = load_pipeline(verbose=CONFIG.service.verbose, batch_size=CONFIG.service.batch_size, tolerance=CONFIG.service.image_stiching_tolerance)

    if os.path.isfile(args.input):
        pdf_paths = [args.input]
--- a/src/image_prediction/init.py
+++ b/src/image_prediction/init.py
--- a/src/image_prediction/classifier/init.py
+++ b/src/image_prediction/classifier/init.py
--- a/src/image_prediction/classifier/classifier.py
+++ b/src/image_prediction/classifier/classifier.py
--- a/src/image_prediction/classifier/image_classifier.py
+++ b/src/image_prediction/classifier/image_classifier.py
--- a/src/image_prediction/compositor/init.py
+++ b/src/image_prediction/compositor/init.py
--- a/src/image_prediction/compositor/compositor.py
+++ b/src/image_prediction/compositor/compositor.py
--- a/src/image_prediction/config.py
+++ b/src/image_prediction/config.py
@ -0,0 +1,7 @@
+from pathlib import Path
+
+from pyinfra.config.loader import load_settings
+
+from image_prediction.locations import PROJECT_ROOT_DIR
+
+CONFIG = load_settings(root_path=PROJECT_ROOT_DIR, settings_path="config")
--- a/src/image_prediction/default_objects.py
+++ b/src/image_prediction/default_objects.py
--- a/src/image_prediction/encoder/init.py
+++ b/src/image_prediction/encoder/init.py
--- a/src/image_prediction/encoder/encoder.py
+++ b/src/image_prediction/encoder/encoder.py
--- a/src/image_prediction/encoder/encoders/init.py
+++ b/src/image_prediction/encoder/encoders/init.py
--- a/src/image_prediction/encoder/encoders/hash_encoder.py
+++ b/src/image_prediction/encoder/encoders/hash_encoder.py
@ -13,7 +13,7 @@ class HashEncoder(Encoder):
        yield from self.encode(images)


-def hash_image(image: Image.Image):
+def hash_image(image: Image.Image) -> str:
    """See: https://stackoverflow.com/a/49692185/3578468"""
    image = image.resize((10, 10), Image.ANTIALIAS)
    image = image.convert("L")
@ -21,4 +21,6 @@ def hash_image(image: Image.Image):
    avg_pixel = sum(pixel_data) / len(pixel_data)
    bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
    hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
-    return hex_representation
+    # Note: For each 4 leading zeros, the hex representation will be shorter by one character.
+    # To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
+    return hex_representation.zfill(25)
--- a/src/image_prediction/estimator/init.py
+++ b/src/image_prediction/estimator/init.py
--- a/src/image_prediction/estimator/adapter/init.py
+++ b/src/image_prediction/estimator/adapter/init.py
--- a/src/image_prediction/estimator/adapter/adapter.py
+++ b/src/image_prediction/estimator/adapter/adapter.py
--- a/src/image_prediction/estimator/adapter/adapters/init.py
+++ b/src/image_prediction/estimator/adapter/adapters/init.py
--- a/src/image_prediction/estimator/preprocessor/init.py
+++ b/src/image_prediction/estimator/preprocessor/init.py
--- a/src/image_prediction/estimator/preprocessor/preprocessor.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessor.py
--- a/src/image_prediction/estimator/preprocessor/preprocessors/init.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessors/init.py
--- a/src/image_prediction/estimator/preprocessor/preprocessors/basic.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessors/basic.py
--- a/src/image_prediction/estimator/preprocessor/preprocessors/identity.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessors/identity.py
--- a/src/image_prediction/estimator/preprocessor/utils.py
+++ b/src/image_prediction/estimator/preprocessor/utils.py
--- a/src/image_prediction/exceptions.py
+++ b/src/image_prediction/exceptions.py
--- a/src/image_prediction/extraction.py
+++ b/src/image_prediction/extraction.py
--- a/src/image_prediction/flask.py
+++ b/src/image_prediction/flask.py
--- a/src/image_prediction/formatter/init.py
+++ b/src/image_prediction/formatter/init.py
--- a/src/image_prediction/formatter/formatter.py
+++ b/src/image_prediction/formatter/formatter.py
--- a/src/image_prediction/formatter/formatters/init.py
+++ b/src/image_prediction/formatter/formatters/init.py
--- a/src/image_prediction/formatter/formatters/camel_case.py
+++ b/src/image_prediction/formatter/formatters/camel_case.py
--- a/src/image_prediction/formatter/formatters/enum.py
+++ b/src/image_prediction/formatter/formatters/enum.py
--- a/src/image_prediction/formatter/formatters/identity.py
+++ b/src/image_prediction/formatter/formatters/identity.py
--- a/src/image_prediction/formatter/formatters/key_formatter.py
+++ b/src/image_prediction/formatter/formatters/key_formatter.py
--- a/src/image_prediction/image_extractor/init.py
+++ b/src/image_prediction/image_extractor/init.py
--- a/src/image_prediction/image_extractor/extractor.py
+++ b/src/image_prediction/image_extractor/extractor.py
--- a/src/image_prediction/image_extractor/extractors/init.py
+++ b/src/image_prediction/image_extractor/extractors/init.py
--- a/src/image_prediction/image_extractor/extractors/mock.py
+++ b/src/image_prediction/image_extractor/extractors/mock.py
--- a/src/image_prediction/image_extractor/extractors/parsable.py
+++ b/src/image_prediction/image_extractor/extractors/parsable.py
@ -3,7 +3,7 @@ import json
 import traceback
 from _operator import itemgetter
 from functools import partial, lru_cache
-from itertools import chain, starmap, filterfalse
+from itertools import chain, starmap, filterfalse, tee
 from operator import itemgetter, truth
 from typing import Iterable, Iterator, List, Union

@ -11,9 +11,10 @@ import fitz
 import numpy as np
 from PIL import Image
 from funcy import merge, pluck, compose, rcompose, remove, keep
+from scipy.stats import gmean

 from image_prediction.config import CONFIG
-from image_prediction.exceptions import InvalidBox, BadXref
+from image_prediction.exceptions import InvalidBox
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
 from image_prediction.info import Info
@ -34,7 +35,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
            tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
                together
        """
-        self.doc: fitz.fitz.Document = None
+        self.doc: fitz.Document = None
        self.verbose = verbose
        self.tolerance = tolerance

@ -47,7 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor):

        yield from image_metadata_pairs

-    def __process_images_on_page(self, page: fitz.fitz.Page):
+    def __process_images_on_page(self, page: fitz.Page):
        metadata = extract_valid_metadata(self.doc, page)
        images = get_images_on_page(self.doc, metadata)

@ -64,9 +65,13 @@ class ParsablePDFImageExtractor(ImageExtractor):

    @staticmethod
    def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
-        def validate(image: Image.Image, metadata: dict):
+        def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
+            """See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
+            filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
+            corrupt and is dropped.
+            TODO: find cleaner solution
+            """
            try:
-                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
                image.resize((100, 100)).convert("RGB")
                return ImageMetadataPair(image, metadata)
            except (OSError, Exception) as err:
@ -74,7 +79,41 @@ class ParsablePDFImageExtractor(ImageExtractor):
                logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
                return None

-        return filter(truth, starmap(validate, image_metadata_pairs))
+        def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
+            """See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
+            heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
+
+            The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
+            of the width and height of the page. If the ratio is below the threshold, the image is dropped.
+            """
+
+            def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
+                tolerance = CONFIG.filters.is_scanned_page.tolerance
+                width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
+                height_ratio = (
+                    image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
+                )
+                return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
+
+            def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
+                min_ratio = CONFIG.filters.image_to_page_quotient.min
+                metadatum = image_metadata_pair.metadata
+                image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
+                page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
+                ratio = image_gmean / page_gmean
+                return ratio >= min_ratio
+
+            pairs, pairs_copy = tee(image_metadata_pairs)
+
+            if any(map(image_is_a_scanned_page, pairs_copy)):
+                logger.debug("Scanned page detected, filtering out small images ...")
+                return filter(image_fits_geometric_mean_ratio, pairs)
+            else:
+                return pairs
+
+        image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
+
+        return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))


 def extract_pages(doc, page_range):
@ -91,13 +130,12 @@ def get_images_on_page(doc, metadata):
    yield from images


-def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
-    return compose(
-        list,
-        partial(add_alpha_channel_info, doc),
-        filter_valid_metadata,
-        get_metadata_for_images_on_page,
-    )(page)
+def extract_valid_metadata(doc: fitz.Document, page: fitz.Page):
+    metadata = get_metadata_for_images_on_page(page)
+    metadata = filter_valid_metadata(metadata)
+    metadata = add_alpha_channel_info(doc, metadata)
+
+    return list(metadata)


 def get_metadata_for_images_on_page(page: fitz.Page):
@ -153,7 +191,7 @@ def xref_to_image(doc, xref) -> Union[Image.Image, None]:
        return


-def convert_pixmap_to_array(pixmap: fitz.fitz.Pixmap):
+def convert_pixmap_to_array(pixmap: fitz.Pixmap):
    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
    array = _normalize_channels(array)
    return array
@ -172,7 +210,6 @@ def _normalize_channels(array: np.ndarray):


 def get_image_metadata(image_info):
-
    xref, coords = itemgetter("xref", "bbox")(image_info)
    x1, y1, x2, y2 = map(rounder, coords)

@ -207,7 +244,11 @@ def add_alpha_channel_info(doc, metadata):

@lru_cache(maxsize=None)
 def load_image_handle_from_xref(doc, xref):
-    return doc.extract_image(xref)
+    try:
+        return doc.extract_image(xref)
+    except ValueError:
+        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
+        return


 rounder = rcompose(round, int)
@ -224,7 +265,6 @@ def get_page_metadata(page):


 def has_alpha_channel(doc, xref):
-
    maybe_image = load_image_handle_from_xref(doc, xref)
    maybe_smask = maybe_image["smask"] if maybe_image else None

--- a/src/image_prediction/info.py
+++ b/src/image_prediction/info.py
--- a/src/image_prediction/label_mapper/init.py
+++ b/src/image_prediction/label_mapper/init.py
--- a/src/image_prediction/label_mapper/mapper.py
+++ b/src/image_prediction/label_mapper/mapper.py
--- a/src/image_prediction/label_mapper/mappers/init.py
+++ b/src/image_prediction/label_mapper/mappers/init.py
--- a/src/image_prediction/label_mapper/mappers/numeric.py
+++ b/src/image_prediction/label_mapper/mappers/numeric.py
--- a/src/image_prediction/label_mapper/mappers/probability.py
+++ b/src/image_prediction/label_mapper/mappers/probability.py
--- a/src/image_prediction/locations.py
+++ b/src/image_prediction/locations.py
@ -0,0 +1,18 @@
+"""Defines constant paths relative to the module root path."""
+
+from pathlib import Path
+
+# FIXME: move these paths to config, only depending on 'ROOT_PATH' environment variable.
+MODULE_DIR = Path(__file__).resolve().parents[0]
+PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
+PROJECT_ROOT_DIR = PACKAGE_ROOT_DIR.parents[0]
+
+CONFIG_FILE = PROJECT_ROOT_DIR / "config" / "settings.toml"
+BANNER_FILE = PROJECT_ROOT_DIR / "banner.txt"
+
+DATA_DIR = PROJECT_ROOT_DIR / "data"
+MLRUNS_DIR = str(DATA_DIR / "mlruns")
+
+TEST_DIR = PROJECT_ROOT_DIR / "test"
+TEST_DATA_DIR = TEST_DIR / "data"
+TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
--- a/src/image_prediction/model_loader/init.py
+++ b/src/image_prediction/model_loader/init.py
--- a/src/image_prediction/model_loader/database/init.py
+++ b/src/image_prediction/model_loader/database/init.py
--- a/src/image_prediction/model_loader/database/connector.py
+++ b/src/image_prediction/model_loader/database/connector.py
--- a/src/image_prediction/model_loader/database/connectors/init.py
+++ b/src/image_prediction/model_loader/database/connectors/init.py
--- a/src/image_prediction/model_loader/database/connectors/mock.py
+++ b/src/image_prediction/model_loader/database/connectors/mock.py
--- a/src/image_prediction/model_loader/loader.py
+++ b/src/image_prediction/model_loader/loader.py
--- a/src/image_prediction/model_loader/loaders/init.py
+++ b/src/image_prediction/model_loader/loaders/init.py
--- a/src/image_prediction/model_loader/loaders/mlflow.py
+++ b/src/image_prediction/model_loader/loaders/mlflow.py
--- a/src/image_prediction/pipeline.py
+++ b/src/image_prediction/pipeline.py
@ -1,8 +1,10 @@
 import os
 from functools import lru_cache, partial
 from itertools import chain, tee
+from typing import Iterable, Any

 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
+from kn_utils.logging import logger
 from tqdm import tqdm

 from image_prediction.config import CONFIG
@ -21,6 +23,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

@lru_cache(maxsize=None)
 def load_pipeline(**kwargs):
+    logger.info(f"Loading pipeline with kwargs: {kwargs}")
    model_loader = get_mlflow_model_loader(MLRUNS_DIR)
    model_identifier = CONFIG.service.mlflow_run_id

@ -52,7 +55,7 @@ class Pipeline:
        join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))

        #                       />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat
+        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
        #                       \>--identity--/

        self.pipe = rcompose(
@ -61,6 +64,7 @@ class Pipeline:
            pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
            join,  # ... the streams by zipping
            reformat,  # ... the items
+            filter_duplicates,  # ... filter out duplicate images
        )

    def __call__(self, pdf: bytes, page_range: range = None):
@ -70,3 +74,32 @@ class Pipeline:
            unit=" images",
            disable=not self.verbose,
        )
+
+
+def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
+    """Drop duplicate images, i.e. entries whose `position` has the same x1, x2, y1, y2 and pageNumber. A later
+    duplicate replaces the kept entry only if its `filters.allPassed` is truthy; otherwise the earlier one stays.
+    See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
+    """
+    keep = dict()  # position key -> metadata kept for it; all input is consumed before anything is yielded
+    for image_meta in metadata:
+        key: tuple[int, int, int, int, int] = (
+            image_meta["position"]["x1"],
+            image_meta["position"]["x2"],
+            image_meta["position"]["y1"],
+            image_meta["position"]["y2"],
+            image_meta["position"]["pageNumber"],
+        )
+        if key in keep:
+            logger.warning(
+                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
+            )
+            if image_meta["filters"]["allPassed"]:  # replaces the kept entry even if that one also passed
+                logger.warning("Setting the image with allPassed flag set to True")
+                keep[key] = image_meta
+            else:
+                logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
+        else:
+            keep[key] = image_meta
+
+    yield from keep.values()
--- a/src/image_prediction/redai_adapter/init.py
+++ b/src/image_prediction/redai_adapter/init.py
--- a/src/image_prediction/redai_adapter/efficient_net_wrapper.py
+++ b/src/image_prediction/redai_adapter/efficient_net_wrapper.py
--- a/src/image_prediction/redai_adapter/mlflow.py
+++ b/src/image_prediction/redai_adapter/mlflow.py
--- a/src/image_prediction/redai_adapter/model.py
+++ b/src/image_prediction/redai_adapter/model.py
--- a/src/image_prediction/redai_adapter/model_wrapper.py
+++ b/src/image_prediction/redai_adapter/model_wrapper.py
--- a/src/image_prediction/stitching/init.py
+++ b/src/image_prediction/stitching/init.py
--- a/src/image_prediction/stitching/grouping.py
+++ b/src/image_prediction/stitching/grouping.py
--- a/src/image_prediction/stitching/merging.py
+++ b/src/image_prediction/stitching/merging.py
--- a/src/image_prediction/stitching/split_mapper.py
+++ b/src/image_prediction/stitching/split_mapper.py
--- a/src/image_prediction/stitching/stitching.py
+++ b/src/image_prediction/stitching/stitching.py
--- a/src/image_prediction/stitching/utils.py
+++ b/src/image_prediction/stitching/utils.py
--- a/src/image_prediction/transformer/init.py
+++ b/src/image_prediction/transformer/init.py
--- a/src/image_prediction/transformer/transformer.py
+++ b/src/image_prediction/transformer/transformer.py
--- a/src/image_prediction/transformer/transformers/init.py
+++ b/src/image_prediction/transformer/transformers/init.py
--- a/src/image_prediction/transformer/transformers/coordinate/init.py
+++ b/src/image_prediction/transformer/transformers/coordinate/init.py
--- a/src/image_prediction/transformer/transformers/coordinate/coordinate_transformer.py
+++ b/src/image_prediction/transformer/transformers/coordinate/coordinate_transformer.py
--- a/src/image_prediction/transformer/transformers/coordinate/fitz.py
+++ b/src/image_prediction/transformer/transformers/coordinate/fitz.py
--- a/src/image_prediction/transformer/transformers/coordinate/fpdf.py
+++ b/src/image_prediction/transformer/transformers/coordinate/fpdf.py
--- a/src/image_prediction/transformer/transformers/coordinate/pdfnet.py
+++ b/src/image_prediction/transformer/transformers/coordinate/pdfnet.py
--- a/src/image_prediction/transformer/transformers/response.py
+++ b/src/image_prediction/transformer/transformers/response.py
@ -1,13 +1,8 @@
-import json
 import math
-import os
-from functools import lru_cache
+from dynaconf import Dynaconf
 from operator import itemgetter

-from funcy import first
-
 from image_prediction.config import CONFIG
-from image_prediction.exceptions import ParsingError
 from image_prediction.transformer.transformer import Transformer
 from image_prediction.utils import get_logger

@ -32,21 +27,22 @@ def build_image_info(data: dict) -> dict:
    geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)

    min_image_to_page_quotient_breached = bool(
-        geometric_quotient < get_class_specific_min_image_to_page_quotient(label)
+        geometric_quotient < get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "min")
    )
    max_image_to_page_quotient_breached = bool(
-        geometric_quotient > get_class_specific_max_image_to_page_quotient(label)
+        geometric_quotient > get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "max")
    )

    min_image_width_to_height_quotient_breached = bool(
-        width / height < get_class_specific_min_image_width_to_height_quotient(label)
+        width / height < get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "min")
    )
    max_image_width_to_height_quotient_breached = bool(
-        width / height > get_class_specific_max_image_width_to_height_quotient(label)
+        width / height > get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "max")
    )

    min_confidence_breached = bool(
-        max(classification["probabilities"].values()) < get_class_specific_min_classification_confidence(label)
+        max(classification["probabilities"].values())
+        < get_class_specific_filter_value(label, CONFIG, "confidence", "min")
    )

    image_info = {
@ -90,65 +86,15 @@ def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1):
    return image_area_sqrt / page_area_sqrt


-def get_class_specific_min_image_to_page_quotient(label, table=None):
-    return get_class_specific_value(
-        "REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table
-    )
-
-
-def get_class_specific_max_image_to_page_quotient(label, table=None):
-    return get_class_specific_value(
-        "REL_IMAGE_SIZE", label, "max", CONFIG.filters.image_to_page_quotient.max, table=table
-    )
-
-
-def get_class_specific_min_image_width_to_height_quotient(label, table=None):
-    return get_class_specific_value(
-        "IMAGE_FORMAT", label, "min", CONFIG.filters.image_width_to_height_quotient.min, table=table
-    )
-
-
-def get_class_specific_max_image_width_to_height_quotient(label, table=None):
-    return get_class_specific_value(
-        "IMAGE_FORMAT", label, "max", CONFIG.filters.image_width_to_height_quotient.max, table=table
-    )
-
-
-def get_class_specific_min_classification_confidence(label, table=None):
-    return get_class_specific_value("CONFIDENCE", label, "min", CONFIG.filters.min_confidence, table=table)
-
-
-def get_class_specific_value(prefix, label, bound, fallback_value, table=None):
-    def fallback():
-        return fallback_value
-
-    def success():
-        threshold_map = parse_env_var(prefix, table=table) or {}
-        value = threshold_map.get(label, {}).get(bound)
-        if value:
-            logger.debug(f"Using class '{label}' specific {bound} {prefix.lower().replace('_', '-')} value.")
-        return value
-
-    assert bound in ["min", "max"]
-
-    return success() or fallback()
-
-
-@lru_cache(maxsize=None)
-def parse_env_var(prefix, table=None):
-    table = table or os.environ
-    head = first(filter(lambda s: s == prefix, table))
-    if head:
-        try:
-            return parse_env_var_value(table[head])
-        except ParsingError as err:
-            logger.warning(err)
-    else:
-        return None
-
-
-def parse_env_var_value(env_var_value):
+def get_class_specific_filter_value(label: str, settings: Dynaconf, filter_type: str, bound: str = None):
    try:
-        return json.loads(env_var_value)
-    except Exception as err:
-        raise ParsingError(f"Failed to parse {env_var_value}") from err
+        value = (
+            settings.filters.overrides[label][filter_type][bound]
+            if bound
+            else settings.filters.overrides[label][filter_type]
+        )
+        logger.warning(f"Using {label=} specific {bound=} {filter_type=} {value=}.")
+    except KeyError:  # override lookup raised KeyError -> fall back to the non-class-specific filter value
+        value = settings.filters[filter_type][bound]  # NOTE(review): bound is used as-is here, even when None
+
+    return value
--- a/src/image_prediction/utils.py
+++ b/src/image_prediction/utils.py
--- a/src/image_prediction/utils/init.py
+++ b/src/image_prediction/utils/init.py
--- a/src/image_prediction/utils/banner.py
+++ b/src/image_prediction/utils/banner.py
--- a/src/image_prediction/utils/generic.py
+++ b/src/image_prediction/utils/generic.py
--- a/src/image_prediction/utils/logger.py
+++ b/src/image_prediction/utils/logger.py
@ -0,0 +1,4 @@
+import kn_utils
+
+# TODO: remove this module and use the `get_logger` function from the `kn_utils` package.
+get_logger = kn_utils.get_logger  # plain alias of the kn_utils function, no wrapping
--- a/src/image_prediction/utils/pdf_annotation.py
+++ b/src/image_prediction/utils/pdf_annotation.py
@ -56,7 +56,8 @@ def annotate_image(doc, image_info):

 def init():
    PDFNet.Initialize(  # NOTE(review): license key is hard-coded in source — consider loading it from config/env
-        "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
+        # "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
+        "Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
    )


--- a/src/image_prediction/utils/process_wrapping.py
+++ b/src/image_prediction/utils/process_wrapping.py
@ -17,9 +17,6 @@ def wrap_in_process(fn):
        process = multiprocessing.Process(target=process_fn, args=args, kwargs=kwargs)
        process.start()
        process.join()
-        try:
-            return return_queue.pop(0)
-        except IndexError:
-            logger.warning("No results returned by subprocess.")
+        return return_queue.pop(0)

    return wrapped_fn
--- a/src/serve.py
+++ b/src/serve.py
@ -1,17 +1,15 @@
-from image_prediction import logger
-from image_prediction.config import Config
-from image_prediction.locations import CONFIG_FILE
+from sys import stdout
+
+from kn_utils.logging import logger
+from pyinfra.examples import start_standard_queue_consumer
+from pyinfra.queue.callback import make_download_process_upload_callback
+
+from image_prediction.config import CONFIG
 from image_prediction.pipeline import load_pipeline
 from image_prediction.utils.banner import load_banner
 from image_prediction.utils.process_wrapping import wrap_in_process
-from pyinfra import config
-from pyinfra.payload_processing import make_payload_processor
-from pyinfra.queue.queue_manager import QueueManager

-PYINFRA_CONFIG = config.get_config()
-IMAGE_CONFIG = Config(CONFIG_FILE)
-
-logger.setLevel(PYINFRA_CONFIG.logging_level_root)
+logger.reconfigure(sink=stdout, level=CONFIG.logging.level)


 # A component of the processing pipeline (probably tensorflow) does not release allocated memory (see RED-4206).
@ -19,18 +17,16 @@ logger.setLevel(PYINFRA_CONFIG.logging_level_root)
 # Workaround: Manage Memory with the operating system, by wrapping the processing in a sub-process.
 # FIXME: Find more fine-grained solution or if the problem occurs persistently for python services,
@wrap_in_process
-def process_data(data: bytes) -> list:
-    pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size)
+def process_data(data: bytes, _message: dict) -> list:  # `_message` is accepted but never read in the body
+    pipeline = load_pipeline(verbose=CONFIG.service.verbose, batch_size=CONFIG.service.batch_size, tolerance=CONFIG.service.image_stiching_tolerance)
    return list(pipeline(data))  # materialise the pipeline's output into a list before returning


 def main():  # entry point: log the banner, wrap process_data in a callback, start the queue consumer
    logger.info(load_banner())

-    process_payload = make_payload_processor(process_data, config=PYINFRA_CONFIG)
-
-    queue_manager = QueueManager(PYINFRA_CONFIG)
-    queue_manager.start_consuming(process_payload)
+    callback = make_download_process_upload_callback(process_data, CONFIG)
+    start_standard_queue_consumer(callback, CONFIG)


 if __name__ == "__main__":
--- a/test/conftest.py
+++ b/test/conftest.py
@ -1,10 +1,3 @@
-import logging
-
-import pytest
-
-from image_prediction.utils import get_logger
-
-
 pytest_plugins = [
    "test.fixtures.extractor",
    "test.fixtures.image",
@ -17,14 +10,5 @@ pytest_plugins = [
    "test.fixtures.parameters",
    "test.fixtures.pdf",
    "test.fixtures.target",
-    "test.unit_tests.image_stitching_test"
+    "test.unit_tests.image_stitching_test",
 ]
-
-
-@pytest.fixture(autouse=True)
-def mute_logger():
-    logger = get_logger()
-    level = logger.level
-    logger.setLevel(logging.CRITICAL + 1)
-    yield
-    logger.setLevel(level)
--- a/test/data.dvc
+++ b/test/data.dvc
@ -1,5 +1,5 @@
 outs:
- md5: 4b0fec291ce0661b3efbbd8b80f4f514.dir
-  size: 107332
-  nfiles: 4
+- md5: 08bf8a63f04b3f19f859008556699708.dir
+  size: 7979836
+  nfiles: 7
  path: data
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Julius Unverfehrt	0027421628	feat: RED-10765: ignore perceptual hash for image deduplication and prefer to keep the ones with `allPassed` set to `True`	2025-01-31 12:59:59 +01:00
Julius Unverfehrt	00740c91b8	Merge branch 'feat/RED-10765/filter-duplicate-images' into 'master' feat: RED-10765: filter out classifications for 'duplicate' images present in the document Closes RED-10765 See merge request redactmanager/image-classification-service!23	2025-01-30 13:20:19 +01:00
Julius Unverfehrt	a3d79eb9af	feat: RED-10765: filter out classifications for 'duplicate' images present in the document	2025-01-30 12:42:41 +01:00
Jonathan Kössler	373f9f2d01	Merge branch 'bugfix/RED-10722' into 'master' RED-10722: fix dead letter queue Closes RED-10722 See merge request redactmanager/image-classification-service!22	2025-01-16 09:29:11 +01:00
Jonathan Kössler	2429d90dd5	chore: update pyinfra to v3.4.2	2025-01-15 13:39:16 +01:00
Julius Unverfehrt	2b85999258	Merge branch 'fix/RM-227' into 'master' fix: RM-227: set minimum permissable value for logos Closes RM-227 and RED-10686 See merge request redactmanager/image-classification-service!21	2024-12-18 12:39:44 +01:00
Julius Unverfehrt	4b15d2c2ca	fix: RED-10686: set minimum permissable value for logos Reference the jira ticket for more information. This change can introduce unwanted behavior.	2024-12-18 11:47:54 +01:00
Jonathan Kössler	bf1ca8d6f9	Merge branch 'feature/RED-10441' into 'master' RED-10441: fix abandoned queues Closes RED-10441 See merge request redactmanager/image-classification-service!20	2024-11-13 17:32:27 +01:00
Jonathan Kössler	9a4b8cad2b	chore: update pyinfra to v3.3.5	2024-11-13 17:21:58 +01:00
Jonathan Kössler	28adb50330	chore: update pyinfra to v3.3.4	2024-11-13 16:39:49 +01:00
Jonathan Kössler	7a3fdf8fa4	chore: update pyinfra to v3.3.3	2024-11-13 14:54:29 +01:00
Jonathan Kössler	3fbcd65e9b	chore: update pyinfra to v3.3.2	2024-11-13 09:56:55 +01:00
Jonathan Kössler	90a60b4b7c	Merge branch 'chore/update_pyinfra' into 'master' RES-858: fix graceful shutdown See merge request redactmanager/image-classification-service!19	2024-09-30 11:01:24 +02:00
Jonathan Kössler	526de8984c	chore: update pyinfra to v3.2.11	2024-09-30 10:12:40 +02:00
Jonathan Kössler	99cbf3c9bf	Merge branch 'feature/RED-10017-fix-config' into 'master' RED-10017: fix pyinfra config Closes RED-10017 See merge request redactmanager/image-classification-service!18	2024-09-27 08:22:00 +02:00
Jonathan Kössler	986137e729	chore: update pyinfra to v3.2.10	2024-09-26 13:40:49 +02:00
Jonathan Kössler	f950b96cfb	fix: pyinfra config	2024-09-24 14:31:10 +02:00
Francisco Schulz	2385d19bc2	Merge branch 'RED-10017-investigate-crashing-py-services-when-upload-large-number-of-files' into 'master' RED-10017 "Investigate crashing py services when upload large number of files" See merge request redactmanager/image-classification-service!17	2024-09-23 18:55:01 +02:00
Francisco Schulz	16f2f0d557	RED-10017 "Investigate crashing py services when upload large number of files"	2024-09-23 18:55:01 +02:00
Julius Unverfehrt	afa6fc34cb	Merge branch 'improvement/RED-10018' into 'master' feat: parameterize image stiching tolerance Closes RED-10018 See merge request redactmanager/image-classification-service!16	2024-09-06 16:27:36 +02:00
Julius Unverfehrt	a192e05be2	feat: parameterize image stiching tolerance Also sets image stitching tolerance default to one (pixel) and adds informative log of which settings are loaded when initializing the image classification pipeline.	2024-09-06 15:51:17 +02:00
Francisco Schulz	d23034e38a	Merge branch 'fix/RED-9948' into 'master' fix: regression of predictions Closes RED-9948 See merge request redactmanager/image-classification-service!15	2024-08-30 16:06:50 +02:00
Julius Unverfehrt	4bc53cf88b	chore: update pyinfra (for current features)	2024-08-30 15:54:52 +02:00
Julius Unverfehrt	e737f64ed2	fix: pin dependencies to working versions BREAKING CHANGE Recent pyinfra changes update tensorflow implicitely (see RED-9948). This can be fixed by pinning tensorflow and protobuf. However this makes the service incompatible with the current pyinfra versions.	2024-08-30 15:54:52 +02:00
Julius Unverfehrt	4b099f0106	chore: bump poetry version	2024-08-30 15:53:35 +02:00
Julius Unverfehrt	b3a58d6777	chore: add tests to ensure no regression happens ever again	2024-08-30 15:53:07 +02:00
Julius Unverfehrt	c888453cc6	fix: pin dependencies to working versions BREAKING CHANGE Recent pyinfra changes update tensorflow implicitely (see RED-9948). This can be fixed by pinning tensorflow and protobuf. However this makes the service incompatible with the current pyinfra versions.	2024-08-30 15:52:55 +02:00
Julius Unverfehrt	bf9ab4b1a2	chore: update run pipline script to use all parameters that are used in production	2024-08-30 15:51:10 +02:00
Julius Unverfehrt	9ff88a1e5d	chore: update test data	2024-08-30 15:51:10 +02:00
Julius Unverfehrt	c852434b75	chore: add script for local and container debug	2024-08-30 15:51:10 +02:00
Jonathan Kössler	8655e25ec0	Merge branch 'feature/RES-840-add-client-connector-error' into 'master' fix: add exception handling for ClientConnectorError Closes RES-840 See merge request redactmanager/image-classification-service!13	2024-08-28 15:46:55 +02:00
Jonathan Kössler	103c19d4cd	chore: update pyinfra version	2024-08-28 14:50:39 +02:00
Jonathan Kössler	530001a0af	Merge branch 'feature/RES-826-pyinfra-update' into 'master' chore: bump pyinfra version Closes RES-826 See merge request redactmanager/image-classification-service!12	2024-08-26 16:15:25 +02:00
Jonathan Kössler	a6c11a9db5	chore: bump pyinfra version	2024-08-26 15:14:34 +02:00
Julius Unverfehrt	1796c1bcbb	fix: RED-3813: ensure image hashes are always 25 chars long The hashing algorithm omits leading bits without information. Since this proves problematic for later processing, we restore this information and ensure the hashes are always 25 characters long.	2024-08-22 11:15:41 +02:00
Jonathan Kössler	f4b9ff54aa	chore: bump pyinfra version	2024-08-22 09:34:40 +02:00
Jonathan Kössler	278b42e368	Merge branch 'bugfix/set-image-tags' into 'master' fix: version reference See merge request redactmanager/image-classification-service!11	2024-08-20 09:46:55 +02:00
Jonathan Kössler	9600e4ca23	chore: bump version	2024-08-20 09:34:54 +02:00
Jonathan Kössler	8485345dd1	fix: version reference	2024-08-19 16:32:44 +02:00
Jonathan Kössler	d1a523c7d6	Merge branch 'feature/RES-731-add-queues-per-tenant' into 'master' RES-731: add queues per tenant Closes RES-731 See merge request redactmanager/image-classification-service!9	2024-08-19 15:12:03 +02:00
Jonathan Kössler	278f54eaa7	RES-731: add queues per tenant	2024-08-19 15:12:03 +02:00
Julius Unverfehrt	443c2614f9	Merge branch 'RED-9746' into 'master' fix: add small image filter logic Closes RED-9746 See merge request redactmanager/image-classification-service!10	2024-08-07 13:50:28 +02:00
Julius Unverfehrt	4102a564a3	fix: add small image filter logic Introduces a preprocessing that scans each page for page sized images. If one is encountered, all images that are below a configured ratio in respect to the page size are dropped. This step has to occur before the image stiching logic, but MIGHT introduce the problem of dropping image parts that might constitue an image. This hoever is not solveable since we want to drop the small images before further processing since the faulty character images are also stiched to a valid image, that in reality isn't an image.	2024-08-06 16:52:05 +02:00
Julius Unverfehrt	7f49642ba0	fix: RED-8978: update pyinfra	2024-04-16 16:42:10 +02:00
Julius Unverfehrt	ba8d1dfdfe	chore(logger): support spring log levels	2024-02-28 16:34:23 +01:00
Julius Unverfehrt	150d0d64e5	chore(prediction filters): adapt class specific filter logic	2024-02-09 11:36:51 +01:00
Julius Unverfehrt	a024ddfcf7	Merge branch 'RES-534-update-pyinfra' into 'master' feat(opentel,dynaconf): adapt new pyinfra Closes RES-534 See merge request redactmanager/image-classification-service!8	2024-02-09 09:59:11 +01:00
Julius Unverfehrt	13cbfa4ddf	chore(tests): disable integration test	2024-02-09 09:50:59 +01:00
Julius Unverfehrt	75af55dbda	chore(project structure): use src/ structure	2024-02-09 09:47:42 +01:00
Julius Unverfehrt	499c501acf	feat(opentel,dynaconf): adapt new pyinfra Also changes logging to knutils logging.	2024-02-09 09:47:31 +01:00
Julius Unverfehrt	6163e29d6b	fix(pdf conversion): repair broken bad x-ref handling	2024-02-08 17:16:41 +01:00
Francisco Schulz	dadc0a4163	Merge branch 'RED-7958-logging-issues-of-python-services' into 'master' RED-7958 "Logging issues of python services" See merge request redactmanager/image-classification-service!6	2023-12-12 11:29:46 +01:00
Francisco Schulz	729ce17de0	use .pdf as integration test file	2023-12-11 11:32:14 +01:00
francisco.schulz	88fbe077e6	fix: poetry install --without=dev	2023-12-11 10:40:06 +01:00
francisco.schulz	f8ecef1054	update dependencies	2023-12-11 10:39:27 +01:00
Francisco Schulz	5f44cc6560	use integration test default branch	2023-12-07 13:23:53 +01:00
francisco.schulz	b60f4d0383	use python 3.10	2023-11-28 15:57:53 +01:00
francisco.schulz	87873cc3a3	update dependencies	2023-11-28 15:57:45 +01:00
francisco.schulz	523ca1db7d	use latest CI template	2023-11-28 15:57:36 +01:00
Julius Unverfehrt	c25f6902e0	Merge branch 'feature/RED-6685-support-absolute-paths' into 'master' Upgrade pyinfra (absolute FP support) Closes RED-6685 See merge request redactmanager/image-classification-service!5	2023-08-23 15:04:59 +02:00
Julius Unverfehrt	9e336ecc01	Upgrade pyinfra (absolute FP support) - Update pyinfra with absolute file path support (still supports dossierID fileID format) - Update CI, use new template	2023-08-23 14:53:40 +02:00
Julius Unverfehrt	0efa2127d7	Merge branch 'fix/RED-7388-nack-message-if-processing-failure' into 'master' Adjust error handling of processing sub-process Closes RED-7388 See merge request redactmanager/image-classification-service!4	2023-08-17 13:40:11 +02:00
Julius Unverfehrt	501fd48d69	Adjust error handling of processing sub-process Removes exception catching when collecting subprocess result which led to the service silently go over failing file processing. Now, the sub-process doesn't return any results if it failed. It is made sure that an empty result is still returned if no images were present on the file to process.	2023-08-17 13:26:27 +02:00