Compare commits

...

166 Commits

Author SHA1 Message Date
Julius Unverfehrt
0027421628 feat: RED-10765: ignore perceptual hash for image deduplication and prefer to keep the ones with allPassed set to True 2025-01-31 12:59:59 +01:00
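The deduplication strategy this commit describes can be sketched as follows. This is a hypothetical illustration, not the service's code: the dict shape and the `image_hash`/`all_passed` keys are invented; only the rule — group by exact hash, ignore the perceptual hash, and prefer entries whose checks all passed — comes from the commit message.

```python
def dedup_classifications(classifications):
    """Keep one classification per exact image hash, preferring entries
    whose quality checks all passed (allPassed=True). The perceptual
    hash is deliberately ignored for this grouping."""
    best = {}
    for c in classifications:
        key = c["image_hash"]  # exact hash only, no perceptual hash
        current = best.get(key)
        # Keep the first entry seen, but replace it if a later duplicate
        # has all_passed=True while the kept one does not.
        if current is None or (c["all_passed"] and not current["all_passed"]):
            best[key] = c
    return list(best.values())

example = [
    {"image_hash": "h1", "all_passed": False, "label": "logo"},
    {"image_hash": "h1", "all_passed": True, "label": "logo"},
    {"image_hash": "h2", "all_passed": False, "label": "signature"},
]
deduped = dedup_classifications(example)
```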
Julius Unverfehrt
00740c91b8 Merge branch 'feat/RED-10765/filter-duplicate-images' into 'master'
feat: RED-10765: filter out classifications for 'duplicate' images present in the document

Closes RED-10765

See merge request redactmanager/image-classification-service!23
2025-01-30 13:20:19 +01:00
Julius Unverfehrt
a3d79eb9af feat: RED-10765: filter out classifications for 'duplicate' images present in the document 2025-01-30 12:42:41 +01:00
Jonathan Kössler
373f9f2d01 Merge branch 'bugfix/RED-10722' into 'master'
RED-10722: fix dead letter queue

Closes RED-10722

See merge request redactmanager/image-classification-service!22
2025-01-16 09:29:11 +01:00
Jonathan Kössler
2429d90dd5 chore: update pyinfra to v3.4.2 2025-01-15 13:39:16 +01:00
Julius Unverfehrt
2b85999258 Merge branch 'fix/RM-227' into 'master'
fix: RM-227: set minimum permissible value for logos

Closes RM-227 and RED-10686

See merge request redactmanager/image-classification-service!21
2024-12-18 12:39:44 +01:00
Julius Unverfehrt
4b15d2c2ca fix: RED-10686: set minimum permissible value for logos
Reference the Jira ticket for more information. This change can
introduce unwanted behavior.
2024-12-18 11:47:54 +01:00
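The fix above amounts to flooring a prediction value. A minimal sketch, assuming a score-clamping approach; the function name and the constant are invented here, and the actually configured minimum is not stated in the log:

```python
MIN_LOGO_VALUE = 0.2  # illustrative floor, not the real configured value

def clamp_logo_score(score, minimum=MIN_LOGO_VALUE):
    """Enforce a minimum permissible value for logo predictions,
    as described in the commit above (names are assumptions)."""
    return max(score, minimum)
```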
Jonathan Kössler
bf1ca8d6f9 Merge branch 'feature/RED-10441' into 'master'
RED-10441: fix abandoned queues

Closes RED-10441

See merge request redactmanager/image-classification-service!20
2024-11-13 17:32:27 +01:00
Jonathan Kössler
9a4b8cad2b chore: update pyinfra to v3.3.5 2024-11-13 17:21:58 +01:00
Jonathan Kössler
28adb50330 chore: update pyinfra to v3.3.4 2024-11-13 16:39:49 +01:00
Jonathan Kössler
7a3fdf8fa4 chore: update pyinfra to v3.3.3 2024-11-13 14:54:29 +01:00
Jonathan Kössler
3fbcd65e9b chore: update pyinfra to v3.3.2 2024-11-13 09:56:55 +01:00
Jonathan Kössler
90a60b4b7c Merge branch 'chore/update_pyinfra' into 'master'
RES-858: fix graceful shutdown

See merge request redactmanager/image-classification-service!19
2024-09-30 11:01:24 +02:00
Jonathan Kössler
526de8984c chore: update pyinfra to v3.2.11 2024-09-30 10:12:40 +02:00
Jonathan Kössler
99cbf3c9bf Merge branch 'feature/RED-10017-fix-config' into 'master'
RED-10017: fix pyinfra config

Closes RED-10017

See merge request redactmanager/image-classification-service!18
2024-09-27 08:22:00 +02:00
Jonathan Kössler
986137e729 chore: update pyinfra to v3.2.10 2024-09-26 13:40:49 +02:00
Jonathan Kössler
f950b96cfb fix: pyinfra config 2024-09-24 14:31:10 +02:00
Francisco Schulz
2385d19bc2 Merge branch 'RED-10017-investigate-crashing-py-services-when-upload-large-number-of-files' into 'master'
RED-10017 "Investigate crashing py services when upload large number of files"

See merge request redactmanager/image-classification-service!17
2024-09-23 18:55:01 +02:00
Francisco Schulz
16f2f0d557 RED-10017 "Investigate crashing py services when upload large number of files" 2024-09-23 18:55:01 +02:00
Julius Unverfehrt
afa6fc34cb Merge branch 'improvement/RED-10018' into 'master'
feat: parameterize image stitching tolerance

Closes RED-10018

See merge request redactmanager/image-classification-service!16
2024-09-06 16:27:36 +02:00
Julius Unverfehrt
a192e05be2 feat: parameterize image stitching tolerance
Also sets the image stitching tolerance default to one pixel and adds
an informative log of which settings are loaded when initializing the
image classification pipeline.
2024-09-06 15:51:17 +02:00
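A tolerance like the one this commit parameterizes is typically applied when deciding whether two extracted image fragments are adjacent enough to stitch. A sketch under assumed conventions — the `(x0, y0, x1, y1)` bounding-box format and the function name are not taken from the service; only the one-pixel default comes from the commit:

```python
DEFAULT_STITCH_TOLERANCE_PX = 1  # default of one pixel, per the commit above

def are_stitchable(box_a, box_b, tolerance=DEFAULT_STITCH_TOLERANCE_PX):
    """Decide whether two image bounding boxes (x0, y0, x1, y1) are close
    enough to be stitched into one image: they must overlap or lie within
    `tolerance` pixels of each other on both axes."""
    ax0, ay0, ax1, ay1 = box_a
    bx0, by0, bx1, by1 = box_b
    horizontally_close = ax0 - tolerance <= bx1 and bx0 - tolerance <= ax1
    vertically_close = ay0 - tolerance <= by1 and by0 - tolerance <= ay1
    return horizontally_close and vertically_close
```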
Francisco Schulz
d23034e38a Merge branch 'fix/RED-9948' into 'master'
fix: regression of predictions

Closes RED-9948

See merge request redactmanager/image-classification-service!15
2024-08-30 16:06:50 +02:00
Julius Unverfehrt
4bc53cf88b chore: update pyinfra (for current features) 2024-08-30 15:54:52 +02:00
Julius Unverfehrt
e737f64ed2 fix: pin dependencies to working versions
BREAKING CHANGE

Recent pyinfra changes update tensorflow implicitly (see RED-9948).
This can be fixed by pinning tensorflow and protobuf.
However this makes the service incompatible with the current pyinfra
versions.
2024-08-30 15:54:52 +02:00
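In a Poetry project, the pin described above would look roughly like this in `pyproject.toml`. The version numbers below are illustrative placeholders, not the ones actually used by the service:

```toml
[tool.poetry.dependencies]
python = "^3.10"
# Pin to known-good versions so a pyinfra update cannot implicitly
# upgrade tensorflow/protobuf again (versions here are illustrative).
tensorflow = "2.11.1"
protobuf = "3.19.6"
```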
Julius Unverfehrt
4b099f0106 chore: bump poetry version 2024-08-30 15:53:35 +02:00
Julius Unverfehrt
b3a58d6777 chore: add tests to ensure no regression happens ever again 2024-08-30 15:53:07 +02:00
Julius Unverfehrt
c888453cc6 fix: pin dependencies to working versions
BREAKING CHANGE

Recent pyinfra changes update tensorflow implicitly (see RED-9948).
This can be fixed by pinning tensorflow and protobuf.
However this makes the service incompatible with the current pyinfra
versions.
2024-08-30 15:52:55 +02:00
Julius Unverfehrt
bf9ab4b1a2 chore: update run pipeline script to use all parameters that are used in production 2024-08-30 15:51:10 +02:00
Julius Unverfehrt
9ff88a1e5d chore: update test data 2024-08-30 15:51:10 +02:00
Julius Unverfehrt
c852434b75 chore: add script for local and container debug 2024-08-30 15:51:10 +02:00
Jonathan Kössler
8655e25ec0 Merge branch 'feature/RES-840-add-client-connector-error' into 'master'
fix: add exception handling for ClientConnectorError

Closes RES-840

See merge request redactmanager/image-classification-service!13
2024-08-28 15:46:55 +02:00
Jonathan Kössler
103c19d4cd chore: update pyinfra version 2024-08-28 14:50:39 +02:00
Jonathan Kössler
530001a0af Merge branch 'feature/RES-826-pyinfra-update' into 'master'
chore: bump pyinfra version

Closes RES-826

See merge request redactmanager/image-classification-service!12
2024-08-26 16:15:25 +02:00
Jonathan Kössler
a6c11a9db5 chore: bump pyinfra version 2024-08-26 15:14:34 +02:00
Julius Unverfehrt
1796c1bcbb fix: RED-3813: ensure image hashes are always 25 chars long
The hashing algorithm omits leading bits that carry no information.
Since this proves problematic for later processing, we restore this
information and ensure the hashes are always 25 characters long.
2024-08-22 11:15:41 +02:00
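Restoring the omitted leading characters amounts to left-padding to a fixed width. A minimal sketch — the function name and the zero-padding character are assumptions; only the 25-character target length comes from the commit:

```python
HASH_LENGTH = 25  # target length from the commit above

def normalize_hash(hash_str, length=HASH_LENGTH):
    """Restore leading characters the hashing algorithm omitted so that
    every hash is exactly `length` characters long."""
    if len(hash_str) > length:
        raise ValueError(f"hash longer than {length} chars: {hash_str!r}")
    return hash_str.rjust(length, "0")
```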
Jonathan Kössler
f4b9ff54aa chore: bump pyinfra version 2024-08-22 09:34:40 +02:00
Jonathan Kössler
278b42e368 Merge branch 'bugfix/set-image-tags' into 'master'
fix: version reference

See merge request redactmanager/image-classification-service!11
2024-08-20 09:46:55 +02:00
Jonathan Kössler
9600e4ca23 chore: bump version 2024-08-20 09:34:54 +02:00
Jonathan Kössler
8485345dd1 fix: version reference 2024-08-19 16:32:44 +02:00
Jonathan Kössler
d1a523c7d6 Merge branch 'feature/RES-731-add-queues-per-tenant' into 'master'
RES-731: add queues per tenant

Closes RES-731

See merge request redactmanager/image-classification-service!9
2024-08-19 15:12:03 +02:00
Jonathan Kössler
278f54eaa7 RES-731: add queues per tenant 2024-08-19 15:12:03 +02:00
Julius Unverfehrt
443c2614f9 Merge branch 'RED-9746' into 'master'
fix: add small image filter logic

Closes RED-9746

See merge request redactmanager/image-classification-service!10
2024-08-07 13:50:28 +02:00
Julius Unverfehrt
4102a564a3 fix: add small image filter logic
Introduces a preprocessing step that scans each page for page-sized
images. If one is encountered, all images that fall below a configured
size ratio with respect to the page are dropped.

This step has to occur before the image stitching logic, but MIGHT
introduce the problem of dropping image parts that constitute a real
image. This, however, is not solvable, since we want to drop the small
images before further processing: the faulty character images would
otherwise be stitched into a valid image that in reality isn't one.
2024-08-06 16:52:05 +02:00
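The small-image filter described above can be sketched as follows. The threshold values, the dict shape, and the function name are assumptions for illustration; only the logic — drop sub-ratio images, but only on pages that contain a page-sized image — comes from the commit:

```python
def filter_small_images(images, page_area, min_ratio=0.01, page_ratio=0.9):
    """If the page contains at least one page-sized image (a likely scan),
    drop every image whose area falls below `min_ratio` of the page area;
    otherwise keep everything. Thresholds here are illustrative."""
    def area(img):
        return img["width"] * img["height"]

    has_page_sized = any(area(img) >= page_ratio * page_area for img in images)
    if not has_page_sized:
        return images
    return [img for img in images if area(img) >= min_ratio * page_area]

imgs = [
    {"width": 950, "height": 1000},  # page-sized scan
    {"width": 5, "height": 5},       # stray character fragment
    {"width": 200, "height": 200},   # real embedded image
]
kept = filter_small_images(imgs, page_area=1000 * 1000)
```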
Julius Unverfehrt
7f49642ba0 fix: RED-8978: update pyinfra 2024-04-16 16:42:10 +02:00
Julius Unverfehrt
ba8d1dfdfe chore(logger): support spring log levels 2024-02-28 16:34:23 +01:00
Julius Unverfehrt
150d0d64e5 chore(prediction filters): adapt class specific filter logic 2024-02-09 11:36:51 +01:00
Julius Unverfehrt
a024ddfcf7 Merge branch 'RES-534-update-pyinfra' into 'master'
feat(opentel,dynaconf): adapt new pyinfra

Closes RES-534

See merge request redactmanager/image-classification-service!8
2024-02-09 09:59:11 +01:00
Julius Unverfehrt
13cbfa4ddf chore(tests): disable integration test 2024-02-09 09:50:59 +01:00
Julius Unverfehrt
75af55dbda chore(project structure): use src/ structure 2024-02-09 09:47:42 +01:00
Julius Unverfehrt
499c501acf feat(opentel,dynaconf): adapt new pyinfra
Also changes logging to knutils logging.
2024-02-09 09:47:31 +01:00
Julius Unverfehrt
6163e29d6b fix(pdf conversion): repair broken bad x-ref handling 2024-02-08 17:16:41 +01:00
Francisco Schulz
dadc0a4163 Merge branch 'RED-7958-logging-issues-of-python-services' into 'master'
RED-7958 "Logging issues of python services"

See merge request redactmanager/image-classification-service!6
2023-12-12 11:29:46 +01:00
Francisco Schulz
729ce17de0 use .pdf as integration test file 2023-12-11 11:32:14 +01:00
francisco.schulz
88fbe077e6 fix: poetry install --without=dev 2023-12-11 10:40:06 +01:00
francisco.schulz
f8ecef1054 update dependencies 2023-12-11 10:39:27 +01:00
Francisco Schulz
5f44cc6560 use integration test default branch 2023-12-07 13:23:53 +01:00
francisco.schulz
b60f4d0383 use python 3.10 2023-11-28 15:57:53 +01:00
francisco.schulz
87873cc3a3 update dependencies 2023-11-28 15:57:45 +01:00
francisco.schulz
523ca1db7d use latest CI template 2023-11-28 15:57:36 +01:00
Julius Unverfehrt
c25f6902e0 Merge branch 'feature/RED-6685-support-absolute-paths' into 'master'
Upgrade pyinfra (absolute FP support)

Closes RED-6685

See merge request redactmanager/image-classification-service!5
2023-08-23 15:04:59 +02:00
Julius Unverfehrt
9e336ecc01 Upgrade pyinfra (absolute FP support)
- Update pyinfra with absolute file path support (still supports
  dossierID fileID format)
- Update CI, use new template
2023-08-23 14:53:40 +02:00
Julius Unverfehrt
0efa2127d7 Merge branch 'fix/RED-7388-nack-message-if-processing-failure' into 'master'
Adjust error handling of processing sub-process

Closes RED-7388

See merge request redactmanager/image-classification-service!4
2023-08-17 13:40:11 +02:00
Julius Unverfehrt
501fd48d69 Adjust error handling of processing sub-process
Removes the exception catching when collecting the subprocess result,
which led to the service silently skipping over failed file processing.

Now the sub-process doesn't return any results if it failed. It is
ensured that an empty result is still returned if no images were present
on the file to process.
2023-08-17 13:26:27 +02:00
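The error-handling contract this commit describes distinguishes "failed" from "empty" results. A sketch of that contract — the names and signatures are assumptions, not the service's API:

```python
def collect_result(run_subprocess, message):
    """The worker returns a list of classifications, an empty list when
    the file contained no images, and None (or raises) when processing
    failed - in which case the message is nacked instead of being
    silently dropped."""
    try:
        result = run_subprocess(message)
    except Exception:
        result = None
    if result is None:
        return ("nack", None)   # failure is surfaced to the queue
    return ("ack", result)      # an empty list is still a valid result
```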
Julius Unverfehrt
4a825cb264 Merge branch 'RES-196-red-hotfix-persistent-service-address' into 'master'
Resolve RES-196 "Red hotfix persistent service address"

Closes RES-196

See merge request redactmanager/image-classification-service!2
2023-06-26 12:56:13 +02:00
francisco.schulz
694a6ccb33 copy test dir into container 2023-06-22 12:03:09 +02:00
francisco.schulz
1d043f97fc add ipykernel 2023-06-22 12:02:50 +02:00
francisco.schulz
7cac73f07b update dependencies, pyinfra@1.5.9 2023-06-21 15:43:24 +02:00
francisco.schulz
133fde67ba add startup probe script 2023-06-21 15:42:33 +02:00
francisco.schulz
946cfff630 add docker scripts 2023-06-21 15:42:21 +02:00
francisco.schulz
f73264874e copy scripts 2023-06-21 15:42:14 +02:00
francisco.schulz
d3868efb4e update CI 2023-06-19 11:49:35 +02:00
francisco.schulz
f0c2282197 increment version 2023-06-19 11:35:16 +02:00
francisco.schulz
57e1ec1a14 increment version 2023-06-19 11:26:22 +02:00
francisco.schulz
8b9771373b copy image_prediction folder, not just files 2023-06-19 11:26:15 +02:00
francisco.schulz
cd3ce653e1 formatting & add pymonad 2023-06-19 11:11:29 +02:00
Julius Unverfehrt
d8075aad38 Update pyinfra to support new tenant endpoint 2023-06-15 16:59:47 +02:00
Francisco Schulz
2b3043bc1e Merge branch 'RES-141-migrate-red-image-service' into 'master'
Resolve RES-141 "Migrate red image service"

Closes RES-141

See merge request redactmanager/image-classification-service!1
2023-06-14 16:01:41 +02:00
Matthias Bisping
3ad0345f4e Remove unused dependency 'pdf2img' 2023-06-14 12:55:47 +02:00
francisco.schulz
134156f59d update project name to image-classification-service 2023-06-12 13:02:38 +02:00
francisco.schulz
1205f2e0ed update 2023-06-07 17:44:54 +02:00
francisco.schulz
8ee966c721 add CI 2023-06-07 17:44:41 +02:00
francisco.schulz
892742ef17 update 2023-06-07 17:44:35 +02:00
francisco.schulz
06b1af9f1a update 2023-06-07 17:44:30 +02:00
francisco.schulz
0194ce3f7e add setup convenience script 2023-06-07 17:44:23 +02:00
francisco.schulz
41d08f7b5b update dependencies 2023-06-07 17:42:25 +02:00
francisco.schulz
b91d5a0ab2 move to script folder 2023-06-07 17:42:11 +02:00
francisco.schulz
7b37f3c913 update Dockerfiles 2023-06-07 17:41:54 +02:00
francisco.schulz
c32005b841 remove old CI files 2023-06-07 17:12:14 +02:00
Julius Unverfehrt
6406ce6b25 Pull request #48: RED-6273 multi tenant storage
Merge in RR/image-prediction from RED-6273-multi-tenant-storage to master

* commit '4ecafb29770b7392462c71d79550c5f788cb36e6':
  update pyinfra version with removed falsy dependencies from pyinfra
  update pyinfra for bugfix
  Update pyinfra for multi-tenancy support
2023-03-28 18:11:09 +02:00
Julius Unverfehrt
4ecafb2977 update pyinfra version with removed falsy dependencies from pyinfra 2023-03-28 18:03:38 +02:00
Julius Unverfehrt
967c2fad1b update pyinfra for bugfix 2023-03-28 17:27:57 +02:00
Julius Unverfehrt
b74e79f113 Update pyinfra for multi-tenancy support 2023-03-28 15:36:05 +02:00
Julius Unverfehrt
50c791f6ca Pull request #47: update pyinfra with fixed prometheus port
Merge in RR/image-prediction from bugfix/RED-6205-prometheus-port to master

* commit 'adb363842dff3d43b3a0dc499daa16588d34233c':
  update pyinfra with fixed prometheus port
2023-03-21 16:08:27 +01:00
Julius Unverfehrt
adb363842d update pyinfra with fixed prometheus port 2023-03-21 16:01:39 +01:00
Julius Unverfehrt
81520b1a53 Pull request #46: RED-6205 add prometheus monitoring
Merge in RR/image-prediction from RED-6205-add-prometheus-monitoring to master

Squashed commit of the following:

commit 6932b5ee579a31d0317dc3f76acb8dd2845fdb4b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Mar 16 17:30:57 2023 +0100

    update pyinfra

commit d6e55534623eae2edcddaa6dd333f93171d421dc
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Mar 16 16:30:14 2023 +0100

    set pyinfra subproject to current master commit

commit 030dc660e6060ae326c32fba8c2944a10866fbb6
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Mar 16 16:25:19 2023 +0100

    adapt serve script to advanced pyinfra API including monitoring of the processing time of images.

commit 0fa0c44c376c52653e517d257a35793797f7be31
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Mar 16 15:19:57 2023 +0100

    Update dockerfile to work with the new pyinfra package setup utilizing pyproject.toml instead of setup.py and requirements.txt

commit aad53c4d313f908de93a13e69e2cb150db3be6cb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Mar 16 14:16:04 2023 +0100

    remove no longer needed dependencies
2023-03-17 16:12:59 +01:00
Shamel Hussain
ed25af33ad Pull request #45: RED-5718: Revert user changes to allow using a random id
Merge in RR/image-prediction from RED-5718-revertUser to master

* commit '1967945ff7550d706295a1a46f50393959852773':
  RED-5718: Revert user changes to allow using a random id
2023-02-28 14:56:23 +01:00
shamel-hussain
1967945ff7 RED-5718: Revert user changes to allow using a random id 2023-02-28 12:08:32 +01:00
Shamel Hussain
faf4d7ed0f Pull request #44: RED-5718: Add a specific user to the image-prediction service
Merge in RR/image-prediction from RED-5718-imageUser to master

* commit '7c7b038491b39d2162e901a5a0ef62b2f1ebd4a9':
  RED-5718: Add a specific user to the image-prediction service
2023-02-27 09:26:12 +01:00
shamel-hussain
7c7b038491 RED-5718: Add a specific user to the image-prediction service 2023-02-27 09:19:37 +01:00
Julius Unverfehrt
cd3e215776 Pull request #43: upgrade references
Merge in RR/image-prediction from RED-6118-multi-tenancy-patch to master

* commit 'bc1bd96e6c8fe904f0fc61a5701cd03dd369806c':
  upgrade references
2023-02-16 16:52:23 +01:00
Julius Unverfehrt
bc1bd96e6c upgrade references 2023-02-16 16:50:59 +01:00
Francisco Schulz
2001e9d7f3 Pull request #42: Bugfix/RED-5277 heartbeat
Merge in RR/image-prediction from bugfix/RED-5277-heartbeat to master

* commit '846f127d3ba75c1be124ddc780a4f9c849dc84af':
  update reference
  fix type
  remove commented out code
  import logger from `__init__.py`
  add log config to `__init__.py`
  remove extra stream handler
  update reference
  update reference
  update reference
  update reference
  update reference
  build dev image and push to nexus
  add logging & only return one object from `process_request()`
  cache loaded pipeline & disable tqdm output by default
  format + set verbose to False by default
  update
2023-02-16 09:54:07 +01:00
Francisco Schulz
846f127d3b update reference 2023-02-16 09:50:17 +01:00
Francisco Schulz
d4657f1ab1 fix type 2023-02-15 16:46:47 +01:00
Francisco Schulz
ee99d76aab remove commented out code 2023-02-15 15:51:33 +01:00
Francisco Schulz
00b40c0632 import logger from __init__.py 2023-02-15 15:45:20 +01:00
Francisco Schulz
c1ae8e6a4b add log config to __init__.py 2023-02-15 15:44:56 +01:00
Francisco Schulz
0bdf5a726a remove extra stream handler 2023-02-15 15:25:13 +01:00
Francisco Schulz
d505ac4e50 update reference 2023-02-15 15:01:29 +01:00
Francisco Schulz
7dca05a53d update reference 2023-02-15 11:11:23 +01:00
Francisco Schulz
c1449134ec update reference 2023-02-15 10:23:27 +01:00
Francisco Schulz
29c76e7ebf update reference 2023-02-14 18:02:09 +01:00
Francisco Schulz
ecc9f69d9c update reference 2023-02-14 16:52:56 +01:00
Francisco Schulz
4bcadcd266 build dev image and push to nexus 2023-02-14 16:30:18 +01:00
Francisco Schulz
9065ec1d12 add logging & only return one object from process_request() 2023-02-14 16:29:04 +01:00
Francisco Schulz
d239368d70 cache loaded pipeline & disable tqdm output by default 2023-02-14 16:27:21 +01:00
Francisco Schulz
b5dc5aa777 format + set verbose to False by default 2023-02-14 16:26:24 +01:00
Francisco Schulz
54b7ba24e8 update 2023-02-14 16:25:49 +01:00
Julius Unverfehrt
463f4da92b Pull request #41: RED-6189 bugfix
Merge in RR/image-prediction from RED-6189-bugfix to master

* commit '79455f0dd6da835ef2261393c5a57ba8ef2550ab': (25 commits)
  revert refactoring  changes
  replace image extraction logic final
  introduce normalizing function for image extraction
  refactoring
  adjust behavior of filtering of invalid images
  add log in callback to display which file is processed
  add ad hoc logic for bad xref handling
  beautify
  beautify
  implement ad hoc channel count detection for new image extraction
  improve performance
  refactor scanned page filtering
  refactor scanned page filtering WIP
  refactor scanned page filtering WIP
  refactor scanned page filtering WIP
  refactor scanned page filtering WIP
  refactor scanned page filtering WIP
  refactor scanned page filtering WIP
  refactor scanned page filtering WIP
  refactor
  ...
2023-02-13 17:35:04 +01:00
Julius Unverfehrt
79455f0dd6 Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into RED-6189-bugfix 2023-02-13 17:23:07 +01:00
Julius Unverfehrt
2bc9c24f6a revert refactoring changes
- revert functional refactoring changes to be able
to determine where the error described in the ticket comes from
- change array normalization to dimensionally
sparse arrays to reduce memory consumption
2023-02-13 13:53:35 +01:00
Julius Unverfehrt
ea301b4df2 Pull request #40: replace trace log level by debug
Merge in RR/image-prediction from adjust-falsy-loglevel to master

Squashed commit of the following:

commit 66794acb1a64be6341f98c7c0ce0bc202634a9f4
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Feb 10 10:15:41 2023 +0100

    replace trace log level by debug

    - trace method is not supported by the built-in logging module
2023-02-10 10:18:38 +01:00
Matthias Bisping
5cdf93b923 Pull request #39: RED-6084 Improve image extraction speed
Merge in RR/image-prediction from RED-6084-adhoc-scanned-pages-filtering-refactoring to master

Squashed commit of the following:

commit bd6d83e7363b1c1993babcceb434110a6312c645
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Thu Feb 9 16:08:25 2023 +0100

    Tweak logging

commit 55bdd48d2a3462a8b4a6b7194c4a46b21d74c455
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Thu Feb 9 15:47:31 2023 +0100

    Update dependencies

commit 970275b25708c05e4fbe78b52aa70d791d5ff17a
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Thu Feb 9 15:35:37 2023 +0100

    Refactoring

    Make alpha channel check monadic to streamline error handling

commit e99e97e23fd8ce16f9a421d3e5442fccacf71ead
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 14:32:29 2023 +0100

    Refactoring

    - Rename
    - Refactor image extraction functions

commit 76b1b0ca2401495ec03ba2b6483091b52732eb81
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 11:55:30 2023 +0100

    Refactoring

commit cb1c461049d7c43ec340302f466447da9f95a499
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 11:44:01 2023 +0100

    Refactoring

commit 092069221a85ac7ac19bf838dcbc7ab1fde1e12b
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 10:18:53 2023 +0100

    Add to-do

commit 3cea4dad2d9703b8c79ddeb740b66a3b8255bb2a
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Feb 7 10:11:35 2023 +0100

    Refactoring

    - Rename
    - Add typehints everywhere

commit 865e0819a14c420bc2edff454d41092c11c019a4
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 19:38:57 2023 +0100

    Add type explanation

commit 01d3d5d33f1ccb05aea1cec1d1577572b1a4deaa
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 19:37:49 2023 +0100

    Formatting

commit dffe1c18fc3a322a6b08890d4438844e8122faaf
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 19:34:13 2023 +0100

    [WIP] Either refactoring

    Add alternative formulation for monadic chain

commit 066cf17add404a313520cd794c06e3264cf971c9
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 18:40:30 2023 +0100

    [WIP] Either refactoring

commit f53f0fea298cdab88deb090af328b34d37e0198e
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 18:18:34 2023 +0100

    [WIP] Either refactoring

    Propagate error and metadata

commit 274a5f56d4fcb9c67fac5cf43e9412ec1ab5179e
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 17:51:35 2023 +0100

    [WIP] Either refactoring

    Fix test assertion

commit 3235a857f6e418e50484cbfff152b0f63efb2f53
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 16:57:31 2023 +0100

    [WIP] Either-refactoring

    Replace Maybe with Either to allow passing on error information or
    metadata which otherwise get sucked up by Nothing.

commit 89989543d87490f8b20a0a76055605d34345e8f4
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 16:12:40 2023 +0100

    [WIP] Monadic refactoring

    Integrate image validation step into monadic chain.

    At the moment we lost the error information through this. Refactoring to
    Either monad can bring it back.

commit 022bd4856a51aa085df5fe983fd77b99b53d594c
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:16:41 2023 +0100

    [WIP] Monadic refactoring

commit ca3898cb539607c8c3dd01c57e60211a5fea8a7d
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:10:34 2023 +0100

    [WIP] Monadic refactoring

commit d8f37bed5cbd6bdd2a0b52bae46fcdbb50f9dff2
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:09:51 2023 +0100

    [WIP] Monadic refactoring

commit 906fee0e5df051f38076aa1d2725e52a182ade13
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Feb 6 15:03:35 2023 +0100

    [WIP] Monadic refactoring

... and 35 more commits
2023-02-10 08:33:13 +01:00
Julius Unverfehrt
4d43e385c5 replace image extraction logic final 2023-02-06 09:43:28 +01:00
Julius Unverfehrt
bd0279ddd1 introduce normalizing function for image extraction 2023-02-03 12:25:27 +01:00
Julius Unverfehrt
2995d5ee48 refactoring 2023-02-03 11:14:14 +01:00
Julius Unverfehrt
eff1bb4124 adjust behavior of filtering of invalid images 2023-02-03 09:04:02 +01:00
Julius Unverfehrt
c478333111 add log in callback to display which file is processed 2023-02-03 08:25:36 +01:00
Julius Unverfehrt
978f48e8f9 add ad hoc logic for bad xref handling 2023-02-02 15:39:44 +01:00
Julius Unverfehrt
94652aafe4 beautify 2023-02-02 15:26:33 +01:00
Julius Unverfehrt
c4416636c0 beautify 2023-02-02 14:10:32 +01:00
Julius Unverfehrt
c0b41e77b8 implement ad hoc channel count detection for new image extraction 2023-02-02 13:57:56 +01:00
Julius Unverfehrt
73f7491c8f improve performance
- disable scanned page filter, since dropping these pages disables the
computation of the image hashes and the frontend OCR hint, which are both
wanted
- optimize image extraction by using arrays instead of byte streams for
the conversion to PIL images
2023-02-02 13:37:03 +01:00
Julius Unverfehrt
2385584dcb refactor scanned page filtering 2023-02-01 15:49:36 +01:00
Julius Unverfehrt
b880e892ec refactor scanned page filtering WIP 2023-02-01 15:47:40 +01:00
Julius Unverfehrt
8c7349c2d1 refactor scanned page filtering WIP 2023-02-01 15:36:16 +01:00
Julius Unverfehrt
c55777e339 refactor scanned page filtering WIP 2023-02-01 15:16:12 +01:00
Julius Unverfehrt
0f440bdb09 refactor scanned page filtering WIP 2023-02-01 15:14:27 +01:00
Julius Unverfehrt
436a32ad2b refactor scanned page filtering WIP 2023-02-01 15:07:35 +01:00
Julius Unverfehrt
9ec6cc19ba refactor scanned page filtering WIP 2023-02-01 14:53:26 +01:00
Julius Unverfehrt
2d385b0a73 refactor scanned page filtering WIP 2023-02-01 14:38:55 +01:00
Julius Unverfehrt
5bd5e0cf2b refactor
- reduce code duplication by adapting functions of the module
- use the module's enums for image metadata
- improve readability of the scanned page detection heuristic
2023-02-01 12:43:59 +01:00
Julius Unverfehrt
876260f403 improve the readability of variable names and docstrings 2023-02-01 10:08:36 +01:00
Julius Unverfehrt
368c54a8be clean-up filter logic
- Logic adapted so that it can potentially be
easily removed again from the extraction logic
2023-02-01 08:49:30 +01:00
Julius Unverfehrt
1490d27308 introduce adhoc filter for scanned pages 2023-01-31 17:18:28 +01:00
Julius Unverfehrt
4eb7f3c40a rename publishing flag 2023-01-31 10:37:27 +01:00
Julius Unverfehrt
98dc001123 revert adhoc figure detection changes
- revert pipeline and serve logic to the state before the figure-detection
data changes for image extraction: figure detection data as input is not
supported for now
2023-01-30 12:41:22 +01:00
Francisco Schulz
25fc7d84b9 Pull request #38: update dependencies
Merge in RR/image-prediction from fschulz/update-to-new-pyinfra-version to master

* commit 'd63f8c4eaf39ef7346188b585fb9d968de72db87':
  update dependencies
2022-10-13 15:33:53 +02:00
Francisco Schulz
d63f8c4eaf update dependencies 2022-10-13 15:23:27 +02:00
Viktor Seifert
549b2aac5c Pull request #37: RED-5324: Update pyinfra to include storage-region fix
Merge in RR/image-prediction from RED-5324 to master

* commit 'c72ef26a6caac8d87cdc08dd19dbe235247129d4':
  RED-5324: Update pyinfra to include storage-region fix
2022-09-30 15:27:03 +02:00
Viktor Seifert
c72ef26a6c RED-5324: Update pyinfra to include storage-region fix 2022-09-30 15:24:18 +02:00
Julius Unverfehrt
561a7f527c Pull request #36: RED-4206 wrap queue callback in process to manage memory allocation with the operating system and force deallocation after processing.
Merge in RR/image-prediction from RED-4206-fix-unwanted-restart-bug to master

Squashed commit of the following:

commit 3dfe7b861816ef9019103e16a23efd97a08fb617
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Sep 22 13:53:32 2022 +0200

    RED-4206 wrap queue callback in process to manage memory allocation with the operating system and force deallocation after processing.
2022-09-22 13:56:44 +02:00
Julius Unverfehrt
48dd52131d Pull request #35: update test dockerfile
Merge in RR/image-prediction from make-sec-build-work to master

Squashed commit of the following:

commit 08149d3a99681f4900a7d4b6a5f656b1c25ebdb3
Merge: 76b5a45 0538377
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Sep 21 13:43:24 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into make-sec-build-work

commit 76b5a4504adc709107af9e5958970ec24ae3f5ef
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Sep 21 13:41:46 2022 +0200

    update test dockerfile
2022-09-21 13:47:40 +02:00
Christoph Schabert
053837722b Pull request #34: hotfix: fix key prepare
Merge in RR/image-prediction from hotfix/keyPrep to master

* commit '98e639d83f72f0cde34cb9c009d84ed4e3b0d138':
  hotfix: fix key prepare
2022-09-20 11:36:11 +02:00
cschabert
98e639d83f hotfix: fix key prepare 2022-09-20 11:34:55 +02:00
Julius Unverfehrt
13d4427c78 Pull request #33: RED-5202 port hotfixes
Merge in RR/image-prediction from RED-5202-port-hotfixes to master

Squashed commit of the following:

commit 9674901235264de6b74d679fd39a52775ac4aee1
Merge: ec2ab89 9763d2c
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:55:58 2022 +0200

    Merge remote-tracking branch 'origin' into RED-5202-port-hotfixes

commit ec2ab890b8307942d147d6b8b236f6a3c1d0aebc
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:49:17 2022 +0200

    swap case when the log is printed for env var parsing

commit aaa02ea35e9c1b3b307116d7e3e32c93fd79ef5d
Merge: 5d87066 521222e
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:28:39 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into RED-5202-port-hotfixes

commit 5d87066b40b28f919b1346f5e5396b46445b4e00
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:25:01 2022 +0200

    remove warning log for non existent non default env var

commit 23c61ef49ef918b29952150d4a6e61b99d60ac64
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:14:19 2022 +0200

    make env var parser discrete

commit c1b92270354c764861da0f7782348e9cd0725d76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:28:44 2022 +0200

    fixed statefulness issue with os.environ in tests

commit ad9c5657fe93079d5646ba2b70fa091e8d2daf76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:04:55 2022 +0200

    - Adapted response formatting logic for threshold maps passed via env vars.
    - Added test for reading threshold maps and values from env vars.

commit c60e8cd6781b8e0c3ec69ccd0a25375803de26f0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 11:38:01 2022 +0200

    add parser for environment variables WIP

commit 101b71726c697f30ec9298ba62d2203bd7da2efb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:52:33 2022 +0200

    Add typehints; make the custom page quotient breach function private, since the intention of outsourcing it from build_image_info is to make it testable separately

commit 04aee4e62781e78cd54c6d20e961dcd7bf1fc081
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:25:59 2022 +0200

    DotIndexable default get method exception made more specific

commit 4584e7ba66400033dc5f1a38473b644eeb11e67c
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:55:05 2022 +0200

    RED-5202 port temporary broken image handling so the hotfix won't be lost by upgrading the service. A proper solution is still desirable (see RED-5148)

commit 5f99622646b3f6d3a842aebef91ff8e082072cd6
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:47:02 2022 +0200

    RED-5202 add per class customizable max image to page quotient setting for signatures, default is 0.4. Can be overwritten by , set to null to use default value or set to value that should be used.
2022-09-12 15:59:50 +02:00
Julius Unverfehrt
9763d2ca65 Pull request #32: RED-5202 port hotfixes
Merge in RR/image-prediction from RED-5202-port-hotfixes to master

Squashed commit of the following:

commit aaa02ea35e9c1b3b307116d7e3e32c93fd79ef5d
Merge: 5d87066 521222e
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:28:39 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into RED-5202-port-hotfixes

commit 5d87066b40b28f919b1346f5e5396b46445b4e00
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:25:01 2022 +0200

    remove warning log for non existent non default env var

commit 23c61ef49ef918b29952150d4a6e61b99d60ac64
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 15:14:19 2022 +0200

    make env var parser discrete

commit c1b92270354c764861da0f7782348e9cd0725d76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:28:44 2022 +0200

    fixed statefulness issue with os.environ in tests

commit ad9c5657fe93079d5646ba2b70fa091e8d2daf76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:04:55 2022 +0200

    - Adapted response formatting logic for threshold maps passed via env vars.
    - Added test for reading threshold maps and values from env vars.

commit c60e8cd6781b8e0c3ec69ccd0a25375803de26f0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 11:38:01 2022 +0200

    add parser for environment variables WIP

commit 101b71726c697f30ec9298ba62d2203bd7da2efb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:52:33 2022 +0200

    Add typehints, make custom page quotient breach function private since the intention of outsourcing it from build_image_info is to make it testable separately

commit 04aee4e62781e78cd54c6d20e961dcd7bf1fc081
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:25:59 2022 +0200

    DotIndexable default get method exception made more specific

commit 4584e7ba66400033dc5f1a38473b644eeb11e67c
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:55:05 2022 +0200

    RED-5202 port temporary broken image handling so the hotfix won't be lost by upgrading the service. A proper solution is still desirable (see RED-5148)

commit 5f99622646b3f6d3a842aebef91ff8e082072cd6
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:47:02 2022 +0200

    RED-5202 add per class customizable max image to page quotient setting for signatures, default is 0.4. Can be overwritten by , set to null to use default value or set to value that should be used.
2022-09-12 15:29:47 +02:00
Julius Unverfehrt
521222eb96 Pull request #31: RED-5202 port hotfixes
Merge in RR/image-prediction from RED-5202-port-hotfixes to master

Squashed commit of the following:

commit c1b92270354c764861da0f7782348e9cd0725d76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:28:44 2022 +0200

    fixed statefulness issue with os.environ in tests

commit ad9c5657fe93079d5646ba2b70fa091e8d2daf76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:04:55 2022 +0200

    - Adapted response formatting logic for threshold maps passed via env vars.
    - Added test for reading threshold maps and values from env vars.

commit c60e8cd6781b8e0c3ec69ccd0a25375803de26f0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 11:38:01 2022 +0200

    add parser for environment variables WIP

commit 101b71726c697f30ec9298ba62d2203bd7da2efb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:52:33 2022 +0200

    Add typehints, make custom page quotient breach function private since the intention of outsourcing it from build_image_info is to make it testable separately

commit 04aee4e62781e78cd54c6d20e961dcd7bf1fc081
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:25:59 2022 +0200

    DotIndexable default get method exception made more specific

commit 4584e7ba66400033dc5f1a38473b644eeb11e67c
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:55:05 2022 +0200

    RED-5202 port temporary broken image handling so the hotfix won't be lost by upgrading the service. A proper solution is still desirable (see RED-5148)

commit 5f99622646b3f6d3a842aebef91ff8e082072cd6
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:47:02 2022 +0200

    RED-5202 add per class customizable max image to page quotient setting for signatures, default is 0.4. Can be overwritten by , set to null to use default value or set to value that should be used.
2022-09-12 14:49:56 +02:00
Julius Unverfehrt
ebfdc14265 Pull request #25: RED-5009 update pyinfra to support message rejection on unobtainable files
Merge in RR/image-prediction from RED-5009-update-pyinfra to master

* commit 'e54819e687b4515c0031df431e26bee033359099':
  RED-5009 update pyinfra to support message rejection on unobtainable files
2022-08-24 15:23:59 +02:00
Julius Unverfehrt
e54819e687 RED-5009 update pyinfra to support message rejection on unobtainable files 2022-08-24 15:21:53 +02:00
Julius Unverfehrt
d1190f7efe Pull request #24: queue callback: add storage lookup for input file, add should_publish flag to signal processing success to queue manager
Merge in RR/image-prediction from RED-5009-extend-callback to master

Squashed commit of the following:

commit 5ed02af09812783c46c2fb47832fe3a02344aa03
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 23 10:56:37 2022 +0200

    queue callback: add storage lookup for input file, add should_publish flag to signal processing success to queue manager
2022-08-23 12:47:49 +02:00
Julius Unverfehrt
d13b8436e2 Pull request #23: add pdf2image & pyinfra installation
Merge in RR/image-prediction from update-build-scripts to master

Squashed commit of the following:

commit 4a5b21d6e6e0d76091443ba3faaad15953855bad
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Aug 18 15:08:13 2022 +0200

    add pdf2image & pyinfra installation
2022-08-18 15:09:51 +02:00
Julius Unverfehrt
520eee26e3 Pull request #22: Integrate image extraction new pyinfra
Merge in RR/image-prediction from integrate-image-extraction-new-pyinfra to master

Squashed commit of the following:

commit 8470c065c71ea2a985aadfc399fb32c693e3a90f
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Aug 18 09:19:52 2022 +0200

    add key script

commit 8f6eb1e79083fb32fb7bedac640c10b6fd411899
Merge: 27fd7de c1b9629
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Aug 18 09:17:50 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into integrate-image-extraction-new-pyinfra

commit 27fd7de39a59d0d88fbddb471dd7797b61223ece
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 17 13:15:09 2022 +0200

    update pyinfra

commit ca58f85642598dc15e286074982e7cedae9a1355
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 16 16:16:10 2022 +0200

    update pdf2image-service

commit f43795cee0e211e14ac5f9296b01d440ae759c55
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Aug 15 10:32:02 2022 +0200

    update pipeline script to also work with figure detection metadata

commit 2b2da1b60ce56fb006cf2f6b65aeda9774391b2a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Aug 12 13:37:48 2022 +0200

    add new pyinfra, add optional image classification under key dataCV if figure metadata is present on storage

commit bae25bedbd3a262a9d00e18a1b19f4ee6f1eb924
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 10 13:27:41 2022 +0200

    tidy-up

commit 287b0ebc8a952e506185d13508eaa386d0420704
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 10 12:57:35 2022 +0200

    update server logic for new pyinfra, add extraction from scanned PDF with figure detection logic

commit 3225cefaa25e4559b105397bc06c867a22806ba8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 10 10:37:31 2022 +0200

    integrate new pyinfra logic

commit 46926078342b0680a7416560bb69bec037cf8038
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 3 13:15:27 2022 +0200

    add image extraction for scanned PDFs WIP

commit 1b3b11b6f9044d44cb9a822a78197a2ebc6f306a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 3 09:41:06 2022 +0200

    add pyinfra and pdf2image as git submodule
2022-08-18 09:20:48 +02:00
Christoph Schabert
c1b96290df Pull request #20: RED-4758: adjust to new buildjob
Merge in RR/image-prediction from RED-4758 to master

* commit '3405a34893e0b45e4eabc7d78380b529f5ef2aa4':
  RED-4758: adjust to new buildjob
2022-08-03 15:43:54 +02:00
cschabert
3405a34893 RED-4758: adjust to new buildjob 2022-08-03 14:59:07 +02:00
Matthias Bisping
f787b957f8 Pull request #18: Docstrfix
Merge in RR/image-prediction from docstrfix to master

Squashed commit of the following:

commit 8ccb07037074cc88ba5b72e4bedd5bc346eb0256
Merge: 77cd0a8 5d611d5
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Mon Jul 4 11:50:52 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into docstrfix

commit 77cd0a860a69bfb8f4390dabdca23455b340bd9e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Mon Jul 4 11:46:25 2022 +0200

    fixed docstring

commit eb53464ca9f1ccf881d90ece592ad50226decd7a
Merge: 4efb9c7 fd0e4dc
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Tue Jun 21 15:22:03 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction

commit 4efb9c79b10f23fa556ce43c8e7f05944dae1af6
Merge: 84a8b0a 9f18ef9
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu May 12 11:51:30 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction

commit 84a8b0a290081616240c3876f8db8a1ae8592096
Merge: 1624ee4 6030f40
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu May 12 10:18:56 2022 +0200

    Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction

commit 1624ee40376b84a4519025343f913120c464407a
Author: Matthias Bisping <Matthias.Bisping@iqser.com>
Date:   Mon Apr 25 16:51:13 2022 +0200

    Pull request #11: fixed assignment

    Merge in RR/image-prediction from image_prediction_service_overhaul_xref_and_empty_result_fix_fix to master

    Squashed commit of the following:

    commit 7312e57d1127b081bfdc6e96311e8348d3f8110d
    Author: Matthias Bisping <matthias.bisping@iqser.com>
    Date:   Mon Apr 25 16:45:12 2022 +0200

        logging setup changed

    commit 955e353d74f414ee2d57b234bdf84d32817d14bf
    Author: Matthias Bisping <matthias.bisping@iqser.com>
    Date:   Mon Apr 25 16:37:52 2022 +0200

        fixed assignment
2022-07-04 11:57:01 +02:00
143 changed files with 42151 additions and 1079 deletions

@@ -1,6 +1,8 @@
[core]
remote = vector
remote = azure_remote
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/image-prediction/
port = 22
['remote "azure_remote"']
url = azure://image-classification-dvc/

.gitignore
@@ -1,7 +1,8 @@
.vscode/
*.h5
/venv/
*venv
.idea/
src/data
!.gitignore
*.project
@@ -172,4 +173,4 @@ fabric.properties
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm

.gitlab-ci.yml
@@ -0,0 +1,51 @@
include:
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/dvc.gitlab-ci.yml"
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
variables:
NEXUS_PROJECT_DIR: red
IMAGENAME: "${CI_PROJECT_NAME}"
INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
# TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.
stages:
- data
- setup
- tests
- sonarqube
- versioning
- build
- integration-tests
- release
docker-build:
extends: .docker-build
needs:
- job: dvc-pull
artifacts: true
- !reference [.needs-versioning, needs] # leave this line as is
###################
# INTEGRATION TESTS
trigger-integration-tests:
extends: .integration-tests
# ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
# needs:
# - job: docker-build::model_name
# artifacts: true
rules:
- when: never
#########
# RELEASE
release:
extends: .release
needs:
- !reference [.needs-versioning, needs] # leave this line as is

.gitmodules

.python-version
@@ -0,0 +1 @@
3.10

@@ -1,21 +1,73 @@
FROM image-prediction-base
FROM python:3.10-slim AS builder
WORKDIR /app/service
ARG GITLAB_USER
ARG GITLAB_ACCESS_TOKEN
COPY src src
COPY data data
COPY image_prediction image_prediction
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY config.yaml config.yaml
COPY banner.txt banner.txt
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
# Install dependencies differing from base image.
RUN python3 -m pip install -r requirements.txt
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
RUN python3 -m pip install -e .
ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
ARG VERSION=dev
LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
WORKDIR /app
###########
# ENV SETUP
ENV PYTHONDONTWRITEBYTECODE=true
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN apt-get update && \
apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN curl -sSL https://install.python-poetry.org | python3 -
RUN poetry --version
COPY pyproject.toml poetry.lock ./
RUN poetry config virtualenvs.create true && \
poetry config virtualenvs.in-project true && \
poetry config installer.max-workers 10 && \
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry install --without=dev -vv --no-interaction --no-root
###############
# WORKING IMAGE
FROM python:3.10-slim
WORKDIR /app
# COPY SOURCE CODE FROM BUILDER IMAGE
COPY --from=builder /app /app
# COPY BILL OF MATERIALS (BOM)
COPY bom.json /bom.json
ENV PATH="/app/.venv/bin:$PATH"
###################
# COPY SOURCE CODE
COPY ./src ./src
COPY ./config ./config
COPY ./data ./data
COPY banner.txt ./
EXPOSE 5000
EXPOSE 8080
CMD ["python3", "src/serve.py"]
CMD [ "python", "src/serve.py"]

@@ -1,25 +0,0 @@
FROM python:3.8 as builder1
# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
# Upgrade pip.
RUN python -m pip install --upgrade pip
# Make a directory for the service files and copy the service repo into the container.
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt
# Install dependencies.
RUN python3 -m pip install -r requirements.txt
# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.8
WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"
WORKDIR /app/service

@@ -1,20 +1,40 @@
ARG BASE_ROOT="nexus.iqser.com:5001/red/"
ARG VERSION_TAG="dev"
FROM python:3.10
FROM ${BASE_ROOT}image-prediction:${VERSION_TAG}
ARG USERNAME
ARG TOKEN
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
ARG VERSION=dev
WORKDIR /app/service
LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
COPY src src
COPY data data
COPY image_prediction image_prediction
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY config.yaml config.yaml
WORKDIR /app
# Install module & dependencies
RUN python3 -m pip install -e .
RUN python3 -m pip install -r requirements.txt
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN curl -sSL https://install.python-poetry.org | python3 -
COPY ./data ./data
COPY ./test ./test
COPY ./config ./config
COPY ./src ./src
COPY pyproject.toml poetry.lock banner.txt config.yaml ./
RUN poetry config virtualenvs.create false && \
poetry config installer.max-workers 10 && \
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
poetry install --without=dev -vv --no-interaction --no-root
EXPOSE 5000
EXPOSE 8080
RUN apt update --yes
RUN apt install vim --yes

@@ -2,8 +2,11 @@
Build base image
```bash
docker build -f Dockerfile_base -t image-prediction-base .
docker build -f Dockerfile -t image-prediction .
docker build -t image-classification-image --progress=plain --no-cache \
-f Dockerfile \
--build-arg USERNAME=$GITLAB_USER \
--build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
.
```
### Usage

@@ -1,40 +0,0 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>7.1.2</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<sonar.skip>true</sonar.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

@@ -1,178 +0,0 @@
package buildjob;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
/**
* Plan configuration for Bamboo.
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
*/
@BambooSpec
public class PlanSpec {
private static final String SERVICE_NAME = "image-prediction";
private static final String SERVICE_NAME_BASE = "image-prediction-base";
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
/**
* Run main to publish plan on Bamboo
*/
public static void main(final String[] args) throws Exception {
//By default credentials are read from the '.credentials' file.
BambooServer bambooServer = new BambooServer("http://localhost:8085");
Plan plan = new PlanSpec().createDockerBuildPlan();
bambooServer.publish(plan);
PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
bambooServer.publish(planPermission);
}
private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
Permissions permission = new Permissions()
.userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.loggedInUserPermissions(PermissionType.VIEW)
.anonymousUserPermissionView();
return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
}
private Project project() {
return new Project()
.name("RED")
.key(new BambooKey("RED"));
}
public Plan createDockerBuildPlan() {
return new Plan(
project(),
SERVICE_NAME, new BambooKey(SERVICE_KEY))
.description("Docker build for image-prediction.")
.stages(
new Stage("Build Stage")
.jobs(
new Job("Build Job", new BambooKey("BUILD"))
.tasks(
new CleanWorkingDirectoryTask()
.description("Clean working directory.")
.enabled(true),
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Set config and keys.")
.inlineBody("mkdir -p ~/.ssh\n" +
"echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
"echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
"echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
"chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
new ScriptTask()
.description("Build Docker container.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
.argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
.volume("/var/run/docker.sock", "/var/run/docker.sock"))),
new Stage("Sonar Stage")
.jobs(
new Job("Sonar Job", new BambooKey("SONAR"))
.tasks(
new CleanWorkingDirectoryTask()
.description("Clean working directory.")
.enabled(true),
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Set config and keys.")
.inlineBody("mkdir -p ~/.ssh\n" +
"echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
"echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
"echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
"chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
new ScriptTask()
.description("Run Sonarqube scan.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
.argument(SERVICE_NAME))
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
.volume("/var/run/docker.sock", "/var/run/docker.sock"))),
new Stage("Licence Stage")
.jobs(
new Job("Git Tag Job", new BambooKey("GITTAG"))
.tasks(
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build git tag.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/git-tag.sh"),
new InjectVariablesTask()
.description("Inject git tag.")
.path("git.tag")
.namespace("g")
.scope(InjectVariablesScope.LOCAL),
new VcsTagTask()
.description("${bamboo.g.gitTag}")
.tagName("${bamboo.g.gitTag}")
.defaultRepository())
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
new Job("Licence Job", new BambooKey("LICENCE"))
.enabled(false)
.tasks(
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build licence.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
.linkedRepositories("RR / " + SERVICE_NAME)
.linkedRepositories("RR / redai_image")
.triggers(new BitbucketServerTrigger())
.planBranchManagement(new PlanBranchManagement()
.createForVcsBranch()
.delete(new BranchCleanup()
.whenInactiveInRepositoryAfterDays(14))
.notificationForCommitters());
}
}

@@ -1,19 +0,0 @@
#!/bin/bash
set -e
if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
then
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-f ${bamboo_build_working_directory}/pom.xml \
versions:set \
-DnewVersion=${bamboo_version_tag}
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-f ${bamboo_build_working_directory}/pom.xml \
-B clean deploy \
-e -DdeployAtEnd=true \
-Dmaven.wagon.http.ssl.insecure=true \
-Dmaven.wagon.http.ssl.allowall=true \
-Dmaven.wagon.http.ssl.ignore.validity.dates=true \
-DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi

@@ -1,20 +0,0 @@
#!/bin/bash
set -e
SERVICE_NAME=$1
SERVICE_NAME_BASE=$2
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
pip install dvc
pip install 'dvc[ssh]'
echo "Pulling dvc data"
dvc pull
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}

@@ -1,9 +0,0 @@
#!/bin/bash
set -e
if [[ "${bamboo_version_tag}" = "dev" ]]
then
echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
else
echo "gitTag=${bamboo_version_tag}" > git.tag
fi

@@ -1,57 +0,0 @@
#!/bin/bash
set -e
export JAVA_HOME=/usr/bin/sonar-scanner/jre
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install dependency-check
python3 -m pip install coverage
echo "coverage report generation"
bash run_tests.sh
if [ ! -f reports/coverage.xml ]
then
exit 1
fi
SERVICE_NAME=$1
echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
--exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
if [[ -z "${bamboo_repository_pr_key}" ]]
then
echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
/usr/bin/sonar-scanner/bin/sonar-scanner \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.sources=image_prediction \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.branch.name=${bamboo_planRepository_1_branch} \
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
else
echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
/usr/bin/sonar-scanner/bin/sonar-scanner \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.sources=image_prediction \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi

@@ -1,16 +0,0 @@
package buildjob;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;
public class PlanSpecTest {
@Test
public void checkYourPlanOffline() throws PropertiesValidationException {
Plan plan = new PlanSpec().createDockerBuildPlan();
EntityPropertiesBuilders.build(plan);
}
}

bom.json
(File diff suppressed because it is too large.)

@@ -1,26 +0,0 @@
webserver:
host: $SERVER_HOST|"127.0.0.1" # webserver address
port: $SERVER_PORT|5000 # webserver port
service:
logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
verbose: $VERBOSE|True # Service prints document processing progress to stdout
batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
filters:
image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
image_width_to_height_quotient: # Image width to height ratio
min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
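The comments above describe how the per-filter results roll up into the `filters.allPassed` response field. A minimal, stdlib-only sketch of that semantics (the function names and exact response shape are illustrative assumptions, not the service's real implementation), using the default bounds from this config:

```python
def check_range(value, min_value=None, max_value=None):
    # True if value lies within the permissible [min, max] range.
    if min_value is not None and value < min_value:
        return False
    if max_value is not None and value > max_value:
        return False
    return True

def evaluate_filters(image_to_page, width_to_height, confidence):
    # Bounds mirror the defaults in the config above.
    results = {
        "imageToPageQuotient": check_range(image_to_page, 0.05, 0.75),
        "imageWidthToHeightQuotient": check_range(width_to_height, 0.1, 10),
        "confidence": check_range(confidence, min_value=0.5),
    }
    # "allPassed" is false if any filter value missed its required range.
    results["allPassed"] = all(results.values())
    return results
```

Each filter reports its own boolean and `allPassed` is simply their conjunction, so a consumer can still see which specific bound failed.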

config/pyinfra.toml Normal file

@@ -0,0 +1,68 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true
[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"
[tracing]
enabled = true
# possible values: "opentelemetry" | "azure_monitor" (expects the APPLICATIONINSIGHTS_CONNECTION_STRING environment variable)
type = "azure_monitor"
[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"
[webserver]
host = "0.0.0.0"
port = 8080
[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too large, since queue interactions (like receiving new messages) only happen at these intervals
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "image_request_queue"
service_request_exchange_name = "image_request_exchange"
service_response_exchange_name = "image_response_exchange"
service_dlq_name = "image_dlq"
[storage]
backend = "s3"
[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"
[storage.azure]
container = "redaction"
connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
[kubernetes]
pod_name = "test_pod"
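The `heartbeat` / `connection_sleep` constraint noted in the comments above can be checked with a small sketch (the function name is hypothetical):

```python
def validate_rabbitmq_timing(heartbeat: int, connection_sleep: int) -> None:
    """Enforce the constraint stated in the config comments: connection_sleep
    must be a positive divisor of heartbeat, since queue interactions only
    happen every connection_sleep seconds."""
    if connection_sleep <= 0 or heartbeat % connection_sleep != 0:
        raise ValueError(
            f"connection_sleep={connection_sleep} must be a positive "
            f"divisor of heartbeat={heartbeat}"
        )

validate_rabbitmq_timing(60, 5)  # the defaults above pass
```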

config/settings.toml Normal file

@@ -0,0 +1,42 @@
[logging]
level = "INFO"
[service]
# Print document processing progress to stdout
verbose = false
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5
# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75
[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0
# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10
# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4
[filters.overrides.logo.image_to_page_quotient]
min = 0.06


@@ -1,40 +0,0 @@
"""Implements a config object with dot-indexing syntax."""
from envyaml import EnvYAML
from image_prediction.locations import CONFIG_FILE
def _get_item_and_maybe_make_dotindexable(container, item):
ret = container[item]
return DotIndexable(ret) if isinstance(ret, dict) else ret
class DotIndexable:
def __init__(self, x):
self.x = x
def __getattr__(self, item):
return _get_item_and_maybe_make_dotindexable(self.x, item)
def __repr__(self):
return self.x.__repr__()
def __getitem__(self, item):
return self.__getattr__(item)
class Config:
def __init__(self, config_path):
self.__config = EnvYAML(config_path)
def __getattr__(self, item):
if item in self.__config:
return _get_item_and_maybe_make_dotindexable(self.__config, item)
def __getitem__(self, item):
return self.__getattr__(item)
CONFIG = Config(CONFIG_FILE)
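For illustration, the dot-indexing behaviour above can be reproduced minimally with a plain dict standing in for `EnvYAML` (the config values are made up):

```python
class DotIndexable:
    """Wraps a mapping so nested keys are reachable with attribute syntax."""
    def __init__(self, x):
        self.x = x
    def __getattr__(self, item):
        # Only called for attributes not found normally, so self.x is safe.
        ret = self.x[item]
        return DotIndexable(ret) if isinstance(ret, dict) else ret
    def __getitem__(self, item):
        return self.__getattr__(item)

config = DotIndexable({"service": {"logging_level": "INFO", "batch_size": 16}})
assert config.service.batch_size == 16
assert config["service"]["logging_level"] == "INFO"
```

Dot access and bracket access are interchangeable, which is exactly what lets `CONFIG.service.batch_size` work elsewhere in the codebase.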


@@ -1,186 +0,0 @@
import atexit
import io
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter
from typing import List
import fitz
from PIL import Image
from funcy import rcompose, merge, pluck, curry, compose
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
from image_prediction.utils import get_logger
from image_prediction.utils.generic import lift
logger = get_logger()
class ParsablePDFImageExtractor(ImageExtractor):
def __init__(self, verbose=False, tolerance=0):
"""
Args:
verbose: Whether to show progressbar
tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched together
"""
self.doc: fitz.fitz.Document = None
self.verbose = verbose
self.tolerance = tolerance
def extract(self, pdf: bytes, page_range: range = None):
self.doc = fitz.Document(stream=pdf)
pages = extract_pages(self.doc, page_range) if page_range else self.doc
image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))
yield from image_metadata_pairs
def __process_images_on_page(self, page: fitz.fitz.Page):
images = get_images_on_page(self.doc, page)
metadata = get_metadata_for_images_on_page(self.doc, page)
clear_caches()
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
yield from image_metadata_pairs
def extract_pages(doc, page_range):
page_range = range(page_range.start + 1, page_range.stop + 1)
pages = map(doc.load_page, page_range)
yield from pages
@lru_cache(maxsize=None)
def get_images_on_page(doc, page: fitz.Page):
image_infos = get_image_infos(page)
xrefs = map(itemgetter("xref"), image_infos)
images = map(partial(xref_to_image, doc), xrefs)
yield from images
def get_metadata_for_images_on_page(doc, page: fitz.Page):
metadata = map(get_image_metadata, get_image_infos(page))
metadata = validate_coords_and_passthrough(metadata)
metadata = filter_out_tiny_images(metadata)
metadata = validate_size_and_passthrough(metadata)
metadata = add_page_metadata(page, metadata)
metadata = add_alpha_channel_info(doc, page, metadata)
yield from metadata
@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
return page.get_image_info(xrefs=True)
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
maybe_image = load_image_handle_from_xref(doc, xref)
return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None
def get_image_metadata(image_info):
x1, y1, x2, y2 = map(rounder, image_info["bbox"])
width = abs(x2 - x1)
height = abs(y2 - y1)
return {
Info.WIDTH: width,
Info.HEIGHT: height,
Info.X1: x1,
Info.X2: x2,
Info.Y1: y1,
Info.Y2: y2,
}
def validate_coords_and_passthrough(metadata):
yield from map(validate_box_coords, metadata)
def filter_out_tiny_images(metadata):
yield from filterfalse(tiny, metadata)
def validate_size_and_passthrough(metadata):
yield from map(validate_box_size, metadata)
def add_page_metadata(page, metadata):
yield from map(partial(merge, get_page_metadata(page)), metadata)
def add_alpha_channel_info(doc, page, metadata):
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
xref_to_alpha = partial(has_alpha_channel, doc)
page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)
metadata = starmap(merge, zip(page_to_alpha_mapping_per_image(page), metadata))
yield from metadata
@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
return doc.extract_image(xref)
rounder = rcompose(round, int)
def get_page_metadata(page):
page_width, page_height = map(rounder, page.mediabox_size)
return {
Info.PAGE_WIDTH: page_width,
Info.PAGE_HEIGHT: page_height,
Info.PAGE_IDX: page.number,
}
def has_alpha_channel(doc, xref):
maybe_image = load_image_handle_from_xref(doc, xref)
maybe_smask = maybe_image["smask"] if maybe_image else None
if maybe_smask:
return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
else:
try:
return bool(fitz.Pixmap(doc, xref).alpha)
except ValueError:
logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
return False
def tiny(metadata):
return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4
def clear_caches():
get_image_infos.cache_clear()
load_image_handle_from_xref.cache_clear()
get_images_on_page.cache_clear()
xref_to_image.cache_clear()
atexit.register(clear_caches)


@@ -1,17 +0,0 @@
"""Defines constant paths relative to the module root path."""
from pathlib import Path
MODULE_DIR = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
DATA_DIR = PACKAGE_ROOT_DIR / "data"
MLRUNS_DIR = str(DATA_DIR / "mlruns")
TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data"


@@ -1,15 +0,0 @@
from itertools import starmap
from funcy import iterate, first, curry, map
def until(cond, func, *args, **kwargs):
return first(filter(cond, iterate(func, *args, **kwargs)))
def lift(fn):
return curry(map)(fn)
def starlift(fn):
return curry(starmap)(fn)
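A stdlib-only sketch of what `lift` and `starlift` do, with `functools.partial` standing in for funcy's `curry`:

```python
from functools import partial
from itertools import starmap

def lift(fn):
    # Turn an element-wise function into one over iterables (a lazy map).
    return partial(map, fn)

def starlift(fn):
    # Same, but unpacks each tuple element into fn's positional arguments.
    return partial(starmap, fn)

doubled = lift(lambda x: 2 * x)
assert list(doubled([1, 2, 3])) == [2, 4, 6]
added = starlift(lambda a, b: a + b)
assert list(added([(1, 2), (3, 4)])) == [3, 7]
```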


@@ -1,27 +0,0 @@
import logging
from image_prediction.config import CONFIG
def make_logger_getter():
logger = logging.getLogger("imclf")
logger.propagate = False
handler = logging.StreamHandler()
handler.setLevel(CONFIG.service.logging_level)
log_format = "%(asctime)s %(levelname)-8s %(message)s"
formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(CONFIG.service.logging_level)
def get_logger():
return logger
return get_logger
get_logger = make_logger_getter()


poetry.lock generated Normal file

File diff suppressed because it is too large

pyproject.toml Normal file

@@ -0,0 +1,73 @@
[tool.poetry]
name = "image-classification-service"
version = "2.17.0"
description = ""
authors = ["Team Research <research@knecon.com>"]
readme = "README.md"
packages = [{ include = "image_prediction", from = "src" }]
[tool.poetry.dependencies]
python = ">=3.10,<3.11"
# FIXME: This should be a recent pyinfra version, but the recent protobuf packages are not compatible with
# tensorflow 2.9.0; also see RED-9948.
pyinfra = { version = "3.4.2", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
dvc = "^2.34.0"
dvc-ssh = "^2.20.0"
dvc-azure = "^2.21.2"
Flask = "^2.1.1"
requests = "^2.27.1"
iteration-utilities = "^0.11.0"
waitress = "^2.1.1"
envyaml = "^1.10.211231"
dependency-check = "^0.6.0"
mlflow = "^1.24.0"
numpy = "^1.22.3"
tqdm = "^4.64.0"
pandas = "^1.4.2"
# FIXME: Our current model significantly changes its prediction behaviour when using newer tensorflow (/ protobuf)
# versions, which are introduced by pyinfra updates pulling in newer protobuf versions, see RED-9948.
tensorflow = "2.9.0"
protobuf = "^3.20"
pytest = "^7.1.0"
funcy = "^2"
PyMuPDF = "^1.19.6"
fpdf = "^1.7.2"
coverage = "^6.3.2"
Pillow = "^9.1.0"
pdf2image = "^1.16.0"
frozendict = "^2.3.0"
fsspec = "^2022.11.0"
PyMonad = "^2.4.0"
pdfnetpython3 = "9.4.2"
loguru = "^0.7.0"
cyclonedx-bom = "^4.5.0"
[tool.poetry.group.dev.dependencies]
pytest = "^7.0.1"
pymonad = "^2.4.0"
pylint = "^2.17.4"
ipykernel = "^6.23.2"
[tool.pytest.ini_options]
testpaths = ["test"]
addopts = "--ignore=data"
filterwarnings = ["ignore:.*:DeprecationWarning"]
[[tool.poetry.source]]
name = "PyPI"
priority = "primary"
[[tool.poetry.source]]
name = "gitlab-research"
url = "https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi/simple"
priority = "explicit"
[[tool.poetry.source]]
name = "gitlab-red"
url = "https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi/simple"
priority = "explicit"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"


@@ -1,5 +0,0 @@
[pytest]
norecursedirs = incl
filterwarnings =
ignore:.*:DeprecationWarning
ignore:.*:DeprecationWarning


@@ -1,25 +0,0 @@
Flask==2.1.1
requests==2.27.1
iteration-utilities==0.11.0
dvc==2.10.0
dvc[ssh]
waitress==2.1.1
envyaml==1.10.211231
dependency-check==0.6.*
mlflow==1.24.0
numpy==1.22.3
tqdm==4.64.0
pandas==1.4.2
tensorflow==2.8.0
PyYAML==6.0
pytest~=7.1.0
funcy==1.17
PyMuPDF==1.19.6
fpdf==1.7.2
coverage==6.3.2
Pillow==9.1.0
PDFNetPython3==9.1.0
pdf2image==1.16.0
frozendict==2.3.0
protobuf<=3.20.*
prometheus-client==0.13.1

scripts/debug/debug.py Normal file

@@ -0,0 +1,46 @@
"""Script to debug RED-9948. The predictions unexpectedly changed for some images, and we need to understand why."""
import json
import random
from pathlib import Path
import numpy as np
import tensorflow as tf
from kn_utils.logging import logger
from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline
def process_pdf(pipeline, pdf_path, page_range=None):
with open(pdf_path, "rb") as f:
logger.info(f"Processing {pdf_path}")
predictions = list(pipeline(f.read(), page_range=page_range))
return predictions
def ensure_seeds():
seed = 42
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)
def debug_info():
devices = tf.config.list_physical_devices()
print("Available devices:", devices)
if __name__ == "__main__":
# For in container debugging, copy the file and adjust the path.
debug_file_path = Path(__file__).parents[2] / "test" / "data" / "RED-9948" / "SYNGENTA_EFSA_sanitisation_GFL_v2"
ensure_seeds()
debug_info()
pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
predictions = process_pdf(pipeline, debug_file_path)
# This is the image that has the wrong prediction mentioned in RED-9948. The predictions should be inconclusive,
# and the allPassed flag should be false.
predictions = [x for x in predictions if x["representation"] == "FA30F080F0C031CE17E8CF237"]
print(json.dumps(predictions, indent=2))

scripts/devenvsetup.sh Normal file

@@ -0,0 +1,30 @@
#!/bin/bash
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3
# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
# cd $latest_dir
pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version
pip install --upgrade pip
pip install poetry
poetry config installer.max-workers 10
# research package registry
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
# redactmanager package registry
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}
poetry env use $(pyenv which python)
poetry install --with=dev
poetry update
source .venv/bin/activate


@@ -0,0 +1,6 @@
docker build --platform linux/amd64 -t image-classification-service:$(poetry version -s)-dev \
-f Dockerfile \
--build-arg GITLAB_USER=$GITLAB_USER \
--build-arg GITLAB_ACCESS_TOKEN=$GITLAB_ACCESS_TOKEN \
. && \
docker run -it --rm image-classification-service:$(poetry version -s)-dev


@@ -0,0 +1,3 @@
docker tag image-classification-service:$(poetry version -s)-dev $NEXUS_REGISTRY/red/image-classification-service:$(poetry version -s)-dev
docker push $NEXUS_REGISTRY/red/image-classification-service:$(poetry version -s)-dev


@@ -0,0 +1,6 @@
from pyinfra.k8s_probes import startup
from loguru import logger
if __name__ == "__main__":
logger.debug("running health check")
startup.run_checks()


@@ -3,12 +3,15 @@ import json
import os
from glob import glob
from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline
from image_prediction.utils import get_logger
from image_prediction.utils.pdf_annotation import annotate_pdf
logger = get_logger()
logger.setLevel("DEBUG")
def parse_args():
parser = argparse.ArgumentParser()
@@ -35,7 +38,7 @@ def process_pdf(pipeline, pdf_path, page_range=None):
def main(args):
pipeline = load_pipeline(verbose=True, tolerance=3)
pipeline = load_pipeline(verbose=CONFIG.service.verbose, batch_size=CONFIG.service.batch_size, tolerance=CONFIG.service.image_stiching_tolerance)
if os.path.isfile(args.input):
pdf_paths = [args.input]


@@ -1,13 +0,0 @@
#!/usr/bin/env python
from distutils.core import setup
setup(
name="image_prediction",
version="0.1.0",
description="",
author="",
author_email="",
url="",
packages=["image_prediction"],
)


@@ -1,4 +0,0 @@
sonar.exclusions=bamboo-specs/**, **/test_data/**
sonar.c.file.suffixes=-
sonar.cpp.file.suffixes=-
sonar.objc.file.suffixes=-


@@ -0,0 +1,13 @@
import logging
import sys
# log config
LOG_FORMAT = "%(asctime)s [%(levelname)s] - [%(filename)s -> %(funcName)s() -> %(lineno)s] : %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
stream_handler = logging.StreamHandler(sys.stdout)
stream_handler_format = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)
stream_handler.setFormatter(stream_handler_format)
logger = logging.getLogger(__name__)
logger.propagate = False
logger.addHandler(stream_handler)


@@ -0,0 +1,7 @@
from pathlib import Path
from pyinfra.config.loader import load_settings
from image_prediction.locations import PROJECT_ROOT_DIR
CONFIG = load_settings(root_path=PROJECT_ROOT_DIR, settings_path="config")


@@ -13,7 +13,7 @@ class HashEncoder(Encoder):
yield from self.encode(images)
def hash_image(image: Image.Image):
def hash_image(image: Image.Image) -> str:
"""See: https://stackoverflow.com/a/49692185/3578468"""
image = image.resize((10, 10), Image.ANTIALIAS)
image = image.convert("L")
@@ -21,4 +21,6 @@ def hash_image(image: Image.Image):
avg_pixel = sum(pixel_data) / len(pixel_data)
bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
return hex_representation
# Note: For each 4 leading zeros, the hex representation will be shorter by one character.
# To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
return hex_representation.zfill(25)
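The padding comment above can be verified directly: a 10x10 average hash produces 100 bits, i.e. at most 25 hex characters, and `hex()` drops leading zero bits:

```python
# 100 bits with 4 leading zeros, mimicking an average hash whose first
# four pixels fall below the mean brightness.
bits = "0000" + "1" * 96
# Same conversion as hash_image above: int -> hex -> strip "0x" -> reverse -> upper.
hex_repr = str(hex(int(bits, 2)))[2:][::-1].upper()
assert len(hex_repr) == 24        # one character short of 25
assert len(hex_repr.zfill(25)) == 25  # zfill restores the fixed length
```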


@@ -32,3 +32,11 @@ class IntentionalTestException(RuntimeError):
class InvalidBox(Exception):
pass
class ParsingError(Exception):
pass
class BadXref(ValueError):
pass


@@ -1,38 +1,14 @@
import multiprocessing
import traceback
from typing import Callable
from flask import Flask, request, jsonify
from prometheus_client import generate_latest, CollectorRegistry, Summary
from image_prediction.utils import get_logger
from image_prediction.utils.process_wrapping import wrap_in_process
logger = get_logger()
def run_in_process(func):
p = multiprocessing.Process(target=func)
p.start()
p.join()
def wrap_in_process(func_to_wrap):
def build_function_and_run_in_process(*args, **kwargs):
def func():
try:
result = func_to_wrap(*args, **kwargs)
return_dict["result"] = result
except:
logger.error(traceback.format_exc())
manager = multiprocessing.Manager()
return_dict = manager.dict()
run_in_process(func)
return return_dict.get("result", None)
return build_function_and_run_in_process
def make_prediction_server(predict_fn: Callable):
app = Flask(__name__)
registry = CollectorRegistry(auto_describe=True)


@@ -0,0 +1,300 @@
import atexit
import json
import traceback
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse, tee
from operator import itemgetter, truth
from typing import Iterable, Iterator, List, Union
import fitz
import numpy as np
from PIL import Image
from funcy import merge, pluck, compose, rcompose, remove, keep
from scipy.stats import gmean
from image_prediction.config import CONFIG
from image_prediction.exceptions import InvalidBox
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box
from image_prediction.transformer.transformers.response import compute_geometric_quotient
from image_prediction.utils import get_logger
logger = get_logger()
class ParsablePDFImageExtractor(ImageExtractor):
def __init__(self, verbose=False, tolerance=0):
"""
Args:
verbose: Whether to show progressbar
tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
together
"""
self.doc: fitz.Document = None
self.verbose = verbose
self.tolerance = tolerance
def extract(self, pdf: bytes, page_range: range = None):
self.doc = fitz.Document(stream=pdf)
pages = extract_pages(self.doc, page_range) if page_range else self.doc
image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))
yield from image_metadata_pairs
def __process_images_on_page(self, page: fitz.Page):
metadata = extract_valid_metadata(self.doc, page)
images = get_images_on_page(self.doc, metadata)
clear_caches()
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
# validation here. Invalid images can then be split into a different stream and joined with the intact images
# again for the formatting step.
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
yield from image_metadata_pairs
@staticmethod
def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
"""See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
corrupt and is dropped.
TODO: find cleaner solution
"""
try:
image.resize((100, 100)).convert("RGB")
return ImageMetadataPair(image, metadata)
except Exception:
metadata = json.dumps(EnumFormatter()(metadata), indent=2)
logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
return None
def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
"""See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
of the width and height of the page. If the ratio is below the threshold, the image is dropped.
"""
def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
tolerance = CONFIG.filters.is_scanned_page.tolerance
width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
height_ratio = (
image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
)
return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
min_ratio = CONFIG.filters.image_to_page_quotient.min
metadatum = image_metadata_pair.metadata
image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
ratio = image_gmean / page_gmean
return ratio >= min_ratio
pairs, pairs_copy = tee(image_metadata_pairs)
if any(map(image_is_a_scanned_page, pairs_copy)):
logger.debug("Scanned page detected, filtering out small images ...")
return filter(image_fits_geometric_mean_ratio, pairs)
else:
return pairs
image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))
def extract_pages(doc, page_range):
page_range = range(page_range.start + 1, page_range.stop + 1)
pages = map(doc.load_page, page_range)
yield from pages
def get_images_on_page(doc, metadata):
xrefs = pluck(Info.XREF, metadata)
images = map(partial(xref_to_image, doc), xrefs)
yield from images
def extract_valid_metadata(doc: fitz.Document, page: fitz.Page):
metadata = get_metadata_for_images_on_page(page)
metadata = filter_valid_metadata(metadata)
metadata = add_alpha_channel_info(doc, metadata)
return list(metadata)
def get_metadata_for_images_on_page(page: fitz.Page):
metadata = map(get_image_metadata, get_image_infos(page))
metadata = add_page_metadata(page, metadata)
yield from metadata
def filter_valid_metadata(metadata):
yield from compose(
# TODO: Disabled for now, since the backend currently needs the metadata and the hash of every image, even
# scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images
# and giving the user the ability to reclassify false positives with a separate call.
# filter_out_page_sized_images,
filter_out_tiny_images,
filter_out_invalid_metadata,
)(metadata)
def filter_out_invalid_metadata(metadata):
def __validate_box(box):
try:
return validate_box(box)
except InvalidBox as err:
logger.debug(f"Dropping invalid metadatum, reason: {err}")
yield from keep(__validate_box, metadata)
def filter_out_page_sized_images(metadata):
yield from remove(breaches_image_to_page_quotient, metadata)
def filter_out_tiny_images(metadata):
yield from filterfalse(tiny, metadata)
@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
return page.get_image_info(xrefs=True)
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Union[Image.Image, None]:
# NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
try:
pixmap = fitz.Pixmap(doc, xref)
array = convert_pixmap_to_array(pixmap)
return Image.fromarray(array)
except ValueError:
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
return
def convert_pixmap_to_array(pixmap: fitz.Pixmap):
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
array = _normalize_channels(array)
return array
def _normalize_channels(array: np.ndarray):
if array.shape[-1] == 1:
array = array[:, :, 0]
elif array.shape[-1] == 4:
array = array[..., :3]
elif array.shape[-1] != 3:
logger.warning(f"Unexpected image format: {array.shape}.")
raise ValueError(f"Unexpected image format: {array.shape}.")
return array
def get_image_metadata(image_info):
xref, coords = itemgetter("xref", "bbox")(image_info)
x1, y1, x2, y2 = map(rounder, coords)
width = abs(x2 - x1)
height = abs(y2 - y1)
return {
Info.WIDTH: width,
Info.HEIGHT: height,
Info.X1: x1,
Info.X2: x2,
Info.Y1: y1,
Info.Y2: y2,
Info.XREF: xref,
}
def add_page_metadata(page, metadata):
yield from map(partial(merge, get_page_metadata(page)), metadata)
def add_alpha_channel_info(doc, metadata):
def add_alpha_value_to_metadatum(metadatum):
alpha = metadatum_to_alpha_value(metadatum)
return {**metadatum, Info.ALPHA: alpha}
xref_to_alpha = partial(has_alpha_channel, doc)
metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))
yield from map(add_alpha_value_to_metadatum, metadata)
@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
try:
return doc.extract_image(xref)
except ValueError:
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
return
rounder = rcompose(round, int)
def get_page_metadata(page):
page_width, page_height = map(rounder, page.mediabox_size)
return {
Info.PAGE_WIDTH: page_width,
Info.PAGE_HEIGHT: page_height,
Info.PAGE_IDX: page.number,
}
def has_alpha_channel(doc, xref):
maybe_image = load_image_handle_from_xref(doc, xref)
maybe_smask = maybe_image["smask"] if maybe_image else None
if maybe_smask:
return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
else:
try:
return bool(fitz.Pixmap(doc, xref).alpha)
except ValueError:
logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
return False
def tiny(metadata):
return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4
def clear_caches():
get_image_infos.cache_clear()
load_image_handle_from_xref.cache_clear()
xref_to_image.cache_clear()
atexit.register(clear_caches)
def breaches_image_to_page_quotient(metadatum):
page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
)(metadatum)
geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
return quotient_breached
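The quotient of geometric means described in the docstrings above reduces to `sqrt(w*h) / sqrt(W*H)` for an image of size `w x h` on a page of size `W x H`. A scipy-free sketch with made-up dimensions:

```python
from math import sqrt

def image_to_page_quotient(width, height, page_width, page_height):
    # gmean([width, height]) / gmean([page_width, page_height]):
    # the geometric mean of two values is simply sqrt of their product.
    return sqrt(width * height) / sqrt(page_width * page_height)

# A 60x80 pt image on a 595x842 pt page (hypothetical values):
q = image_to_page_quotient(60, 80, 595, 842)
assert 0.05 <= q <= 0.75  # within the default image_to_page_quotient bounds
```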


@@ -12,3 +12,4 @@ class Info(Enum):
Y1 = "y1"
Y2 = "y2"
ALPHA = "alpha"
XREF = "xref"


@@ -0,0 +1,18 @@
"""Defines constant paths relative to the module root path."""
from pathlib import Path
# FIXME: move these paths to config, only depending on 'ROOT_PATH' environment variable.
MODULE_DIR = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
PROJECT_ROOT_DIR = PACKAGE_ROOT_DIR.parents[0]
CONFIG_FILE = PROJECT_ROOT_DIR / "config" / "settings.toml"
BANNER_FILE = PROJECT_ROOT_DIR / "banner.txt"
DATA_DIR = PROJECT_ROOT_DIR / "data"
MLRUNS_DIR = str(DATA_DIR / "mlruns")
TEST_DIR = PROJECT_ROOT_DIR / "test"
TEST_DATA_DIR = TEST_DIR / "data"
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"


@@ -1,8 +1,10 @@
import os
from functools import partial
from functools import lru_cache, partial
from itertools import chain, tee
from typing import Iterable, Any
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from kn_utils.logging import logger
from tqdm import tqdm
from image_prediction.config import CONFIG
@@ -19,7 +21,9 @@ from image_prediction.utils.generic import lift, starlift
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@lru_cache(maxsize=None)
def load_pipeline(**kwargs):
logger.info(f"Loading pipeline with kwargs: {kwargs}")
model_loader = get_mlflow_model_loader(MLRUNS_DIR)
model_identifier = CONFIG.service.mlflow_run_id
@@ -37,7 +41,7 @@ def star(f):
class Pipeline:
def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs):
self.verbose = verbose
extract = get_extractor(**kwargs)
@@ -51,7 +55,7 @@ class Pipeline:
join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
# />--classify--\
# --extract-->--split--+->--encode---->+--join-->reformat
# --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
# \>--identity--/
self.pipe = rcompose(
@@ -60,6 +64,7 @@
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
join, # ... the streams by zipping
reformat, # ... the items
filter_duplicates, # ... filter out duplicate images
)
def __call__(self, pdf: bytes, page_range: range = None):
@@ -69,3 +74,32 @@ class Pipeline:
unit=" images",
disable=not self.verbose,
)
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
"""Filter out duplicate images by `position` (image coordinates) and page, preferring the one with
`allPassed` set to True.
See RED-10765 (RM-241), "Removed redactions reappear", for why this is necessary.
"""
keep = dict()
for image_meta in metadata:
key: tuple[int, int, int, int, int] = (
image_meta["position"]["x1"],
image_meta["position"]["x2"],
image_meta["position"]["y1"],
image_meta["position"]["y2"],
image_meta["position"]["pageNumber"],
)
if key in keep:
logger.warning(
f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
)
if image_meta["filters"]["allPassed"]:
logger.warning("Keeping the image with the allPassed flag set to True")
keep[key] = image_meta
else:
logger.warning("Keeping the previous image since the current image has the allPassed flag set to False")
else:
keep[key] = image_meta
yield from keep.values()
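The deduplication above can be exercised with a self-contained sketch (the dict shapes mirror the response fields used above; ids and coordinates are made up):

```python
def filter_duplicates(metadata):
    # Same keying as above: coordinates plus page number identify an image.
    keep = {}
    for m in metadata:
        p = m["position"]
        key = (p["x1"], p["x2"], p["y1"], p["y2"], p["pageNumber"])
        if key in keep:
            # On a collision, an entry with allPassed=True replaces the
            # previous one; otherwise the previous entry is kept.
            if m["filters"]["allPassed"]:
                keep[key] = m
        else:
            keep[key] = m
    yield from keep.values()

pos = {"x1": 0, "x2": 10, "y1": 0, "y2": 10, "pageNumber": 1}
a = {"position": pos, "filters": {"allPassed": False}, "id": "a"}
b = {"position": pos, "filters": {"allPassed": True}, "id": "b"}
# Regardless of order, the allPassed=True entry wins.
assert [m["id"] for m in filter_duplicates([a, b])] == ["b"]
assert [m["id"] for m in filter_duplicates([b, a])] == ["b"]
```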

Some files were not shown because too many files have changed in this diff