Compare commits
166 Commits
release/1. ... master
| Author | SHA1 | Date |
|---|---|---|
|  | 0027421628 |  |
|  | 00740c91b8 |  |
|  | a3d79eb9af |  |
|  | 373f9f2d01 |  |
|  | 2429d90dd5 |  |
|  | 2b85999258 |  |
|  | 4b15d2c2ca |  |
|  | bf1ca8d6f9 |  |
|  | 9a4b8cad2b |  |
|  | 28adb50330 |  |
|  | 7a3fdf8fa4 |  |
|  | 3fbcd65e9b |  |
|  | 90a60b4b7c |  |
|  | 526de8984c |  |
|  | 99cbf3c9bf |  |
|  | 986137e729 |  |
|  | f950b96cfb |  |
|  | 2385d19bc2 |  |
|  | 16f2f0d557 |  |
|  | afa6fc34cb |  |
|  | a192e05be2 |  |
|  | d23034e38a |  |
|  | 4bc53cf88b |  |
|  | e737f64ed2 |  |
|  | 4b099f0106 |  |
|  | b3a58d6777 |  |
|  | c888453cc6 |  |
|  | bf9ab4b1a2 |  |
|  | 9ff88a1e5d |  |
|  | c852434b75 |  |
|  | 8655e25ec0 |  |
|  | 103c19d4cd |  |
|  | 530001a0af |  |
|  | a6c11a9db5 |  |
|  | 1796c1bcbb |  |
|  | f4b9ff54aa |  |
|  | 278b42e368 |  |
|  | 9600e4ca23 |  |
|  | 8485345dd1 |  |
|  | d1a523c7d6 |  |
|  | 278f54eaa7 |  |
|  | 443c2614f9 |  |
|  | 4102a564a3 |  |
|  | 7f49642ba0 |  |
|  | ba8d1dfdfe |  |
|  | 150d0d64e5 |  |
|  | a024ddfcf7 |  |
|  | 13cbfa4ddf |  |
|  | 75af55dbda |  |
|  | 499c501acf |  |
|  | 6163e29d6b |  |
|  | dadc0a4163 |  |
|  | 729ce17de0 |  |
|  | 88fbe077e6 |  |
|  | f8ecef1054 |  |
|  | 5f44cc6560 |  |
|  | b60f4d0383 |  |
|  | 87873cc3a3 |  |
|  | 523ca1db7d |  |
|  | c25f6902e0 |  |
|  | 9e336ecc01 |  |
|  | 0efa2127d7 |  |
|  | 501fd48d69 |  |
|  | 4a825cb264 |  |
|  | 694a6ccb33 |  |
|  | 1d043f97fc |  |
|  | 7cac73f07b |  |
|  | 133fde67ba |  |
|  | 946cfff630 |  |
|  | f73264874e |  |
|  | d3868efb4e |  |
|  | f0c2282197 |  |
|  | 57e1ec1a14 |  |
|  | 8b9771373b |  |
|  | cd3ce653e1 |  |
|  | d8075aad38 |  |
|  | 2b3043bc1e |  |
|  | 3ad0345f4e |  |
|  | 134156f59d |  |
|  | 1205f2e0ed |  |
|  | 8ee966c721 |  |
|  | 892742ef17 |  |
|  | 06b1af9f1a |  |
|  | 0194ce3f7e |  |
|  | 41d08f7b5b |  |
|  | b91d5a0ab2 |  |
|  | 7b37f3c913 |  |
|  | c32005b841 |  |
|  | 6406ce6b25 |  |
|  | 4ecafb2977 |  |
|  | 967c2fad1b |  |
|  | b74e79f113 |  |
|  | 50c791f6ca |  |
|  | adb363842d |  |
|  | 81520b1a53 |  |
|  | ed25af33ad |  |
|  | 1967945ff7 |  |
|  | faf4d7ed0f |  |
|  | 7c7b038491 |  |
|  | cd3e215776 |  |
|  | bc1bd96e6c |  |
|  | 2001e9d7f3 |  |
|  | 846f127d3b |  |
|  | d4657f1ab1 |  |
|  | ee99d76aab |  |
|  | 00b40c0632 |  |
|  | c1ae8e6a4b |  |
|  | 0bdf5a726a |  |
|  | d505ac4e50 |  |
|  | 7dca05a53d |  |
|  | c1449134ec |  |
|  | 29c76e7ebf |  |
|  | ecc9f69d9c |  |
|  | 4bcadcd266 |  |
|  | 9065ec1d12 |  |
|  | d239368d70 |  |
|  | b5dc5aa777 |  |
|  | 54b7ba24e8 |  |
|  | 463f4da92b |  |
|  | 79455f0dd6 |  |
|  | 2bc9c24f6a |  |
|  | ea301b4df2 |  |
|  | 5cdf93b923 |  |
|  | 4d43e385c5 |  |
|  | bd0279ddd1 |  |
|  | 2995d5ee48 |  |
|  | eff1bb4124 |  |
|  | c478333111 |  |
|  | 978f48e8f9 |  |
|  | 94652aafe4 |  |
|  | c4416636c0 |  |
|  | c0b41e77b8 |  |
|  | 73f7491c8f |  |
|  | 2385584dcb |  |
|  | b880e892ec |  |
|  | 8c7349c2d1 |  |
|  | c55777e339 |  |
|  | 0f440bdb09 |  |
|  | 436a32ad2b |  |
|  | 9ec6cc19ba |  |
|  | 2d385b0a73 |  |
|  | 5bd5e0cf2b |  |
|  | 876260f403 |  |
|  | 368c54a8be |  |
|  | 1490d27308 |  |
|  | 4eb7f3c40a |  |
|  | 98dc001123 |  |
|  | 25fc7d84b9 |  |
|  | d63f8c4eaf |  |
|  | 549b2aac5c |  |
|  | c72ef26a6c |  |
|  | 561a7f527c |  |
|  | 48dd52131d |  |
|  | 053837722b |  |
|  | 98e639d83f |  |
|  | 13d4427c78 |  |
|  | 9763d2ca65 |  |
|  | 521222eb96 |  |
|  | ebfdc14265 |  |
|  | e54819e687 |  |
|  | d1190f7efe |  |
|  | d13b8436e2 |  |
|  | 520eee26e3 |  |
|  | c1b96290df |  |
|  | 3405a34893 |  |
|  | f787b957f8 |  |
@@ -1,6 +1,8 @@
 [core]
-remote = vector
+remote = azure_remote
 autostage = true
 ['remote "vector"']
 url = ssh://vector.iqser.com/research/image-prediction/
 port = 22
+['remote "azure_remote"']
+url = azure://image-classification-dvc/
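The hunk above switches the default DVC remote from the SSH-based `vector` remote to a new `azure_remote` backed by `azure://image-classification-dvc/`, while keeping the old remote in place. As a minimal sketch (assuming the `dvc` CLI with the `dvc-azure` plugin from `pyproject.toml` is installed and Azure credentials are configured in the environment), data could be pulled from either remote explicitly:

```python
import subprocess


def dvc_pull(remote: str = "azure_remote") -> None:
    """Pull DVC-tracked data from a named remote; names follow the config hunk above."""
    # `dvc pull --remote <name>` selects a remote other than the configured default.
    subprocess.run(["dvc", "pull", "--remote", remote], check=True)


if __name__ == "__main__":
    dvc_pull()             # new Azure-backed remote
    # dvc_pull("vector")   # legacy SSH remote that is still configured
```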
5 .gitignore vendored
@@ -1,7 +1,8 @@
 .vscode/
 *.h5
-/venv/
+*venv
 .idea/
+src/data
 
 !.gitignore
 *.project
@@ -172,4 +173,4 @@ fabric.properties
 # https://plugins.jetbrains.com/plugin/12206-codestream
 .idea/codestream.xml
 
 # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
51 .gitlab-ci.yml Normal file
@@ -0,0 +1,51 @@
+include:
+  - project: "Gitlab/gitlab"
+    ref: main
+    file: "/ci-templates/research/dvc.gitlab-ci.yml"
+  - project: "Gitlab/gitlab"
+    ref: main
+    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
+
+variables:
+  NEXUS_PROJECT_DIR: red
+  IMAGENAME: "${CI_PROJECT_NAME}"
+  INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
+  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
+  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
+  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
+  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.
+
+stages:
+  - data
+  - setup
+  - tests
+  - sonarqube
+  - versioning
+  - build
+  - integration-tests
+  - release
+
+docker-build:
+  extends: .docker-build
+  needs:
+    - job: dvc-pull
+      artifacts: true
+    - !reference [.needs-versioning, needs] # leave this line as is
+
+###################
+# INTEGRATION TESTS
+trigger-integration-tests:
+  extends: .integration-tests
+  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
+  # needs:
+  #   - job: docker-build::model_name
+  #     artifacts: true
+  rules:
+    - when: never
+
+#########
+# RELEASE
+release:
+  extends: .release
+  needs:
+    - !reference [.needs-versioning, needs] # leave this line as is
0 .gitmodules vendored
1 .python-version Normal file
@@ -0,0 +1 @@
+3.10
78 Dockerfile
@ -1,21 +1,73 @@
|
|||||||
FROM image-prediction-base
|
FROM python:3.10-slim AS builder
|
||||||
|
|
||||||
WORKDIR /app/service
|
ARG GITLAB_USER
|
||||||
|
ARG GITLAB_ACCESS_TOKEN
|
||||||
|
|
||||||
COPY src src
|
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
|
||||||
COPY data data
|
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
|
||||||
COPY image_prediction image_prediction
|
|
||||||
COPY setup.py setup.py
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
COPY config.yaml config.yaml
|
|
||||||
COPY banner.txt banner.txt
|
|
||||||
|
|
||||||
# Install dependencies differing from base image.
|
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
|
||||||
RUN python3 -m pip install -r requirements.txt
|
ARG POETRY_SOURCE_REF_RED=gitlab-red
|
||||||
|
|
||||||
RUN python3 -m pip install -e .
|
ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
|
||||||
|
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
|
||||||
|
|
||||||
|
ARG VERSION=dev
|
||||||
|
|
||||||
|
LABEL maintainer="Research <research@knecon.com>"
|
||||||
|
LABEL version="${VERSION}"
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
###########
|
||||||
|
# ENV SETUP
|
||||||
|
ENV PYTHONDONTWRITEBYTECODE=true
|
||||||
|
ENV PYTHONUNBUFFERED=true
|
||||||
|
ENV POETRY_HOME=/opt/poetry
|
||||||
|
ENV PATH="$POETRY_HOME/bin:$PATH"
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
RUN curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
RUN poetry --version
|
||||||
|
|
||||||
|
COPY pyproject.toml poetry.lock ./
|
||||||
|
|
||||||
|
RUN poetry config virtualenvs.create true && \
|
||||||
|
poetry config virtualenvs.in-project true && \
|
||||||
|
poetry config installer.max-workers 10 && \
|
||||||
|
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
|
||||||
|
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||||
|
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
|
||||||
|
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||||
|
poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
|
||||||
|
poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||||
|
poetry install --without=dev -vv --no-interaction --no-root
|
||||||
|
|
||||||
|
###############
|
||||||
|
# WORKING IMAGE
|
||||||
|
FROM python:3.10-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# COPY SOURCE CODE FROM BUILDER IMAGE
|
||||||
|
COPY --from=builder /app /app
|
||||||
|
# COPY BILL OF MATERIALS (BOM)
|
||||||
|
COPY bom.json /bom.json
|
||||||
|
|
||||||
|
ENV PATH="/app/.venv/bin:$PATH"
|
||||||
|
|
||||||
|
###################
|
||||||
|
# COPY SOURCE CODE
|
||||||
|
COPY ./src ./src
|
||||||
|
COPY ./config ./config
|
||||||
|
COPY ./data ./data
|
||||||
|
COPY banner.txt ./
|
||||||
|
|
||||||
EXPOSE 5000
|
EXPOSE 5000
|
||||||
EXPOSE 8080
|
EXPOSE 8080
|
||||||
|
|
||||||
CMD ["python3", "src/serve.py"]
|
CMD [ "python", "src/serve.py"]
|
||||||
|
|||||||
@ -1,25 +0,0 @@
|
|||||||
FROM python:3.8 as builder1
|
|
||||||
|
|
||||||
# Use a virtual environment.
|
|
||||||
RUN python -m venv /app/venv
|
|
||||||
ENV PATH="/app/venv/bin:$PATH"
|
|
||||||
|
|
||||||
# Upgrade pip.
|
|
||||||
RUN python -m pip install --upgrade pip
|
|
||||||
|
|
||||||
# Make a directory for the service files and copy the service repo into the container.
|
|
||||||
WORKDIR /app/service
|
|
||||||
COPY ./requirements.txt ./requirements.txt
|
|
||||||
|
|
||||||
# Install dependencies.
|
|
||||||
RUN python3 -m pip install -r requirements.txt
|
|
||||||
|
|
||||||
# Make a new container and copy all relevant files over to filter out temporary files
|
|
||||||
# produced during setup to reduce the final container's size.
|
|
||||||
FROM python:3.8
|
|
||||||
|
|
||||||
WORKDIR /app/
|
|
||||||
COPY --from=builder1 /app .
|
|
||||||
ENV PATH="/app/venv/bin:$PATH"
|
|
||||||
|
|
||||||
WORKDIR /app/service
|
|
||||||
@ -1,20 +1,40 @@
|
|||||||
ARG BASE_ROOT="nexus.iqser.com:5001/red/"
|
FROM python:3.10
|
||||||
ARG VERSION_TAG="dev"
|
|
||||||
|
|
||||||
FROM ${BASE_ROOT}image-prediction:${VERSION_TAG}
|
ARG USERNAME
|
||||||
|
ARG TOKEN
|
||||||
|
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
|
||||||
|
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
|
||||||
|
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
|
||||||
|
ARG POETRY_SOURCE_REF_RED=gitlab-red
|
||||||
|
ARG VERSION=dev
|
||||||
|
|
||||||
WORKDIR /app/service
|
LABEL maintainer="Research <research@knecon.com>"
|
||||||
|
LABEL version="${VERSION}"
|
||||||
|
|
||||||
COPY src src
|
WORKDIR /app
|
||||||
COPY data data
|
|
||||||
COPY image_prediction image_prediction
|
|
||||||
COPY setup.py setup.py
|
|
||||||
COPY requirements.txt requirements.txt
|
|
||||||
COPY config.yaml config.yaml
|
|
||||||
|
|
||||||
# Install module & dependencies
|
ENV PYTHONUNBUFFERED=true
|
||||||
RUN python3 -m pip install -e .
|
ENV POETRY_HOME=/opt/poetry
|
||||||
RUN python3 -m pip install -r requirements.txt
|
ENV PATH="$POETRY_HOME/bin:$PATH"
|
||||||
|
|
||||||
|
RUN curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
|
||||||
|
COPY ./data ./data
|
||||||
|
COPY ./test ./test
|
||||||
|
COPY ./config ./config
|
||||||
|
COPY ./src ./src
|
||||||
|
COPY pyproject.toml poetry.lock banner.txt config.yaml./
|
||||||
|
|
||||||
|
RUN poetry config virtualenvs.create false && \
|
||||||
|
poetry config installer.max-workers 10 && \
|
||||||
|
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
|
||||||
|
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
|
||||||
|
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
|
||||||
|
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
|
||||||
|
poetry install --without=dev -vv --no-interaction --no-root
|
||||||
|
|
||||||
|
EXPOSE 5000
|
||||||
|
EXPOSE 8080
|
||||||
|
|
||||||
RUN apt update --yes
|
RUN apt update --yes
|
||||||
RUN apt install vim --yes
|
RUN apt install vim --yes
|
||||||
@@ -2,8 +2,11 @@
 
 Build base image
 ```bash
-docker build -f Dockerfile_base -t image-prediction-base .
-docker build -f Dockerfile -t image-prediction .
+docker build -t image-classification-image --progress=plain --no-cache \
+    -f Dockerfile \
+    --build-arg USERNAME=$GITLAB_USER \
+    --build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
+    .
 ```
 
 ### Usage
|
|||||||
@ -1,40 +0,0 @@
|
|||||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
|
||||||
<modelVersion>4.0.0</modelVersion>
|
|
||||||
|
|
||||||
<parent>
|
|
||||||
<groupId>com.atlassian.bamboo</groupId>
|
|
||||||
<artifactId>bamboo-specs-parent</artifactId>
|
|
||||||
<version>7.1.2</version>
|
|
||||||
<relativePath/>
|
|
||||||
</parent>
|
|
||||||
|
|
||||||
<artifactId>bamboo-specs</artifactId>
|
|
||||||
<version>1.0.0-SNAPSHOT</version>
|
|
||||||
<packaging>jar</packaging>
|
|
||||||
|
|
||||||
<properties>
|
|
||||||
<sonar.skip>true</sonar.skip>
|
|
||||||
</properties>
|
|
||||||
|
|
||||||
<dependencies>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.atlassian.bamboo</groupId>
|
|
||||||
<artifactId>bamboo-specs-api</artifactId>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
|
||||||
<groupId>com.atlassian.bamboo</groupId>
|
|
||||||
<artifactId>bamboo-specs</artifactId>
|
|
||||||
</dependency>
|
|
||||||
|
|
||||||
<!-- Test dependencies -->
|
|
||||||
<dependency>
|
|
||||||
<groupId>junit</groupId>
|
|
||||||
<artifactId>junit</artifactId>
|
|
||||||
<scope>test</scope>
|
|
||||||
</dependency>
|
|
||||||
</dependencies>
|
|
||||||
|
|
||||||
<!-- run 'mvn test' to perform offline validation of the plan -->
|
|
||||||
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
|
|
||||||
</project>
|
|
||||||
@ -1,178 +0,0 @@
|
|||||||
package buildjob;
|
|
||||||
|
|
||||||
import com.atlassian.bamboo.specs.api.BambooSpec;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.BambooKey;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.plan.Job;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.project.Project;
|
|
||||||
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
|
|
||||||
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
|
|
||||||
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
|
|
||||||
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
|
|
||||||
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
|
|
||||||
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
|
|
||||||
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
|
|
||||||
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.Variable;
|
|
||||||
import com.atlassian.bamboo.specs.util.BambooServer;
|
|
||||||
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
|
|
||||||
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Plan configuration for Bamboo.
|
|
||||||
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
|
|
||||||
*/
|
|
||||||
@BambooSpec
|
|
||||||
public class PlanSpec {
|
|
||||||
|
|
||||||
private static final String SERVICE_NAME = "image-prediction";
|
|
||||||
private static final String SERVICE_NAME_BASE = "image-prediction-base";
|
|
||||||
|
|
||||||
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Run main to publish plan on Bamboo
|
|
||||||
*/
|
|
||||||
public static void main(final String[] args) throws Exception {
|
|
||||||
//By default credentials are read from the '.credentials' file.
|
|
||||||
BambooServer bambooServer = new BambooServer("http://localhost:8085");
|
|
||||||
|
|
||||||
Plan plan = new PlanSpec().createDockerBuildPlan();
|
|
||||||
bambooServer.publish(plan);
|
|
||||||
PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
|
|
||||||
bambooServer.publish(planPermission);
|
|
||||||
}
|
|
||||||
|
|
||||||
private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
|
|
||||||
Permissions permission = new Permissions()
|
|
||||||
.userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
|
|
||||||
.groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
|
|
||||||
.groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
|
|
||||||
.groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
|
|
||||||
.loggedInUserPermissions(PermissionType.VIEW)
|
|
||||||
.anonymousUserPermissionView();
|
|
||||||
return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Project project() {
|
|
||||||
return new Project()
|
|
||||||
.name("RED")
|
|
||||||
.key(new BambooKey("RED"));
|
|
||||||
}
|
|
||||||
|
|
||||||
public Plan createDockerBuildPlan() {
|
|
||||||
return new Plan(
|
|
||||||
project(),
|
|
||||||
SERVICE_NAME, new BambooKey(SERVICE_KEY))
|
|
||||||
.description("Docker build for image-prediction.")
|
|
||||||
.stages(
|
|
||||||
new Stage("Build Stage")
|
|
||||||
.jobs(
|
|
||||||
new Job("Build Job", new BambooKey("BUILD"))
|
|
||||||
.tasks(
|
|
||||||
new CleanWorkingDirectoryTask()
|
|
||||||
.description("Clean working directory.")
|
|
||||||
.enabled(true),
|
|
||||||
new VcsCheckoutTask()
|
|
||||||
.description("Checkout default repository.")
|
|
||||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
|
||||||
new ScriptTask()
|
|
||||||
.description("Set config and keys.")
|
|
||||||
.inlineBody("mkdir -p ~/.ssh\n" +
|
|
||||||
"echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
|
|
||||||
"echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
|
|
||||||
"echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
|
|
||||||
"chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
|
|
||||||
new ScriptTask()
|
|
||||||
.description("Build Docker container.")
|
|
||||||
.location(Location.FILE)
|
|
||||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
|
|
||||||
.argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
|
|
||||||
.dockerConfiguration(
|
|
||||||
new DockerConfiguration()
|
|
||||||
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
|
|
||||||
.volume("/var/run/docker.sock", "/var/run/docker.sock"))),
|
|
||||||
new Stage("Sonar Stage")
|
|
||||||
.jobs(
|
|
||||||
new Job("Sonar Job", new BambooKey("SONAR"))
|
|
||||||
.tasks(
|
|
||||||
new CleanWorkingDirectoryTask()
|
|
||||||
.description("Clean working directory.")
|
|
||||||
.enabled(true),
|
|
||||||
new VcsCheckoutTask()
|
|
||||||
.description("Checkout default repository.")
|
|
||||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
|
||||||
new ScriptTask()
|
|
||||||
.description("Set config and keys.")
|
|
||||||
.inlineBody("mkdir -p ~/.ssh\n" +
|
|
||||||
"echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
|
|
||||||
"echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
|
|
||||||
"echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
|
|
||||||
"chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
|
|
||||||
new ScriptTask()
|
|
||||||
.description("Run Sonarqube scan.")
|
|
||||||
.location(Location.FILE)
|
|
||||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
|
|
||||||
.argument(SERVICE_NAME))
|
|
||||||
.dockerConfiguration(
|
|
||||||
new DockerConfiguration()
|
|
||||||
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
|
|
||||||
.volume("/var/run/docker.sock", "/var/run/docker.sock"))),
|
|
||||||
new Stage("Licence Stage")
|
|
||||||
.jobs(
|
|
||||||
new Job("Git Tag Job", new BambooKey("GITTAG"))
|
|
||||||
.tasks(
|
|
||||||
new VcsCheckoutTask()
|
|
||||||
.description("Checkout default repository.")
|
|
||||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
|
||||||
new ScriptTask()
|
|
||||||
.description("Build git tag.")
|
|
||||||
.location(Location.FILE)
|
|
||||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/git-tag.sh"),
|
|
||||||
new InjectVariablesTask()
|
|
||||||
.description("Inject git tag.")
|
|
||||||
.path("git.tag")
|
|
||||||
.namespace("g")
|
|
||||||
.scope(InjectVariablesScope.LOCAL),
|
|
||||||
new VcsTagTask()
|
|
||||||
.description("${bamboo.g.gitTag}")
|
|
||||||
.tagName("${bamboo.g.gitTag}")
|
|
||||||
.defaultRepository())
|
|
||||||
.dockerConfiguration(
|
|
||||||
new DockerConfiguration()
|
|
||||||
.image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
|
|
||||||
new Job("Licence Job", new BambooKey("LICENCE"))
|
|
||||||
.enabled(false)
|
|
||||||
.tasks(
|
|
||||||
new VcsCheckoutTask()
|
|
||||||
.description("Checkout default repository.")
|
|
||||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
|
||||||
new ScriptTask()
|
|
||||||
.description("Build licence.")
|
|
||||||
.location(Location.FILE)
|
|
||||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
|
|
||||||
.dockerConfiguration(
|
|
||||||
new DockerConfiguration()
|
|
||||||
.image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
|
|
||||||
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
|
|
||||||
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
|
|
||||||
.linkedRepositories("RR / " + SERVICE_NAME)
|
|
||||||
.linkedRepositories("RR / redai_image")
|
|
||||||
.triggers(new BitbucketServerTrigger())
|
|
||||||
.planBranchManagement(new PlanBranchManagement()
|
|
||||||
.createForVcsBranch()
|
|
||||||
.delete(new BranchCleanup()
|
|
||||||
.whenInactiveInRepositoryAfterDays(14))
|
|
||||||
.notificationForCommitters());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,19 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
|
|
||||||
then
|
|
||||||
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
|
|
||||||
-f ${bamboo_build_working_directory}/pom.xml \
|
|
||||||
versions:set \
|
|
||||||
-DnewVersion=${bamboo_version_tag}
|
|
||||||
|
|
||||||
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
|
|
||||||
-f ${bamboo_build_working_directory}/pom.xml \
|
|
||||||
-B clean deploy \
|
|
||||||
-e -DdeployAtEnd=true \
|
|
||||||
-Dmaven.wagon.http.ssl.insecure=true \
|
|
||||||
-Dmaven.wagon.http.ssl.allowall=true \
|
|
||||||
-Dmaven.wagon.http.ssl.ignore.validity.dates=true \
|
|
||||||
-DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
|
|
||||||
fi
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
SERVICE_NAME=$1
|
|
||||||
SERVICE_NAME_BASE=$2
|
|
||||||
|
|
||||||
python3 -m venv build_venv
|
|
||||||
source build_venv/bin/activate
|
|
||||||
python3 -m pip install --upgrade pip
|
|
||||||
|
|
||||||
pip install dvc
|
|
||||||
pip install 'dvc[ssh]'
|
|
||||||
echo "Pulling dvc data"
|
|
||||||
dvc pull
|
|
||||||
|
|
||||||
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
|
|
||||||
docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
|
|
||||||
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} .
|
|
||||||
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
|
|
||||||
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
if [[ "${bamboo_version_tag}" = "dev" ]]
|
|
||||||
then
|
|
||||||
echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
|
|
||||||
else
|
|
||||||
echo "gitTag=${bamboo_version_tag}" > git.tag
|
|
||||||
fi
|
|
||||||
@ -1,57 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
export JAVA_HOME=/usr/bin/sonar-scanner/jre
|
|
||||||
|
|
||||||
python3 -m venv build_venv
|
|
||||||
source build_venv/bin/activate
|
|
||||||
python3 -m pip install --upgrade pip
|
|
||||||
python3 -m pip install dependency-check
|
|
||||||
python3 -m pip install coverage
|
|
||||||
|
|
||||||
echo "coverage report generation"
|
|
||||||
|
|
||||||
bash run_tests.sh
|
|
||||||
|
|
||||||
if [ ! -f reports/coverage.xml ]
|
|
||||||
then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
SERVICE_NAME=$1
|
|
||||||
|
|
||||||
echo "dependency-check:aggregate"
|
|
||||||
mkdir -p reports
|
|
||||||
dependency-check --enableExperimental -f JSON -f HTML -f XML \
|
|
||||||
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
|
|
||||||
--exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
|
|
||||||
|
|
||||||
if [[ -z "${bamboo_repository_pr_key}" ]]
|
|
||||||
then
|
|
||||||
echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
|
|
||||||
/usr/bin/sonar-scanner/bin/sonar-scanner \
|
|
||||||
-Dsonar.projectKey=RED_$SERVICE_NAME \
|
|
||||||
-Dsonar.sources=image_prediction \
|
|
||||||
-Dsonar.host.url=https://sonarqube.iqser.com \
|
|
||||||
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
|
|
||||||
-Dsonar.branch.name=${bamboo_planRepository_1_branch} \
|
|
||||||
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
|
|
||||||
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
|
|
||||||
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
|
|
||||||
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
|
|
||||||
|
|
||||||
else
|
|
||||||
echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
|
|
||||||
/usr/bin/sonar-scanner/bin/sonar-scanner \
|
|
||||||
-Dsonar.projectKey=RED_$SERVICE_NAME \
|
|
||||||
-Dsonar.sources=image_prediction \
|
|
||||||
-Dsonar.host.url=https://sonarqube.iqser.com \
|
|
||||||
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
|
|
||||||
-Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
|
|
||||||
-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
|
|
||||||
-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
|
|
||||||
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
|
|
||||||
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
|
|
||||||
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
|
|
||||||
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
|
|
||||||
fi
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
package buildjob;
|
|
||||||
|
|
||||||
|
|
||||||
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
|
|
||||||
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
|
|
||||||
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class PlanSpecTest {
|
|
||||||
@Test
|
|
||||||
public void checkYourPlanOffline() throws PropertiesValidationException {
|
|
||||||
Plan plan = new PlanSpec().createDockerBuildPlan();
|
|
||||||
|
|
||||||
EntityPropertiesBuilders.build(plan);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
26 config.yaml
@@ -1,26 +0,0 @@
-webserver:
-  host: $SERVER_HOST|"127.0.0.1" # webserver address
-  port: $SERVER_PORT|5000 # webserver port
-
-service:
-  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
-  verbose: $VERBOSE|True # Service prints document processing progress to stdout
-  batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
-  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
-
-
-# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
-# The filter result values are reported in the service responses. For convenience the response to a request contains a
-# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
-# specified required value.
-filters:
-
-  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
-    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
-    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
-
-  image_width_to_height_quotient: # Image width to height ratio
-    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
-    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
-
-  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
68 config/pyinfra.toml Normal file
@@ -0,0 +1,68 @@
+
+[asyncio]
+max_concurrent_tasks = 10
+
+[dynamic_tenant_queues]
+enabled = true
+
+[metrics.prometheus]
+enabled = true
+prefix = "redactmanager_image_service"
+
+[tracing]
+enabled = true
+# possible values "opentelemetry" | "azure_monitor" (Excpects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
+type = "azure_monitor"
+
+[tracing.opentelemetry]
+endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
+service_name = "redactmanager_image_service"
+exporter = "otlp"
+
+[webserver]
+host = "0.0.0.0"
+port = 8080
+
+[rabbitmq]
+host = "localhost"
+port = 5672
+username = ""
+password = ""
+heartbeat = 60
+# Has to be a divider of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
+# This is also the minimum time the service needs to process a message
+connection_sleep = 5
+input_queue = "request_queue"
+output_queue = "response_queue"
+dead_letter_queue = "dead_letter_queue"
+
+tenant_event_queue_suffix = "_tenant_event_queue"
+tenant_event_dlq_suffix = "_tenant_events_dlq"
+tenant_exchange_name = "tenants-exchange"
+queue_expiration_time = 300000 # 5 minutes in milliseconds
+
+service_request_queue_prefix = "image_request_queue"
+service_request_exchange_name = "image_request_exchange"
+service_response_exchange_name = "image_response_exchange"
+service_dlq_name = "image_dlq"
+
+[storage]
+backend = "s3"
+
+[storage.s3]
+bucket = "redaction"
+endpoint = "http://127.0.0.1:9000"
+key = ""
+secret = ""
+region = "eu-central-1"
+
+[storage.azure]
+container = "redaction"
+connection_string = ""
+
+[storage.tenant_server]
+public_key = ""
+endpoint = "http://tenant-user-management:8081/internal-api/tenants"
+
+[kubernetes]
+pod_name = "test_pod"
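The `[rabbitmq]` block above encodes a timing constraint in its comments: `connection_sleep` must evenly divide `heartbeat`, and it is also the minimum latency for any queue interaction. A small, hypothetical startup check (not code from this repository) that enforces the documented constraint could look like this:

```python
def validate_rabbitmq_timing(heartbeat: int, connection_sleep: int) -> None:
    """Fail fast if the timing values violate the constraint documented in pyinfra.toml."""
    if connection_sleep <= 0:
        raise ValueError("connection_sleep must be a positive number of seconds")
    if heartbeat % connection_sleep != 0:
        # Queue interactions only happen every `connection_sleep` seconds, so the
        # heartbeat interval must be an exact multiple of it.
        raise ValueError(
            f"connection_sleep ({connection_sleep}s) must divide heartbeat ({heartbeat}s)"
        )


validate_rabbitmq_timing(heartbeat=60, connection_sleep=5)  # values from the config above
```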
42 config/settings.toml Normal file
@@ -0,0 +1,42 @@
+[logging]
+level = "INFO"
+
+[service]
+# Print document processing progress to stdout
+verbose = false
+batch_size = 6
+image_stiching_tolerance = 1 # in pixels
+mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
+
+# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
+# The filter result values are reported in the service responses. For convenience the response to a request contains a
+# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
+# specified required value.
+[filters.confidence]
+# Minimum permissible prediction confidence
+min = 0.5
+
+# Image size to page size ratio (ratio of geometric means of areas)
+[filters.image_to_page_quotient]
+min = 0.05
+max = 0.75
+
+[filters.is_scanned_page]
+# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
+# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
+# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
+tolerance = 0
+
+# Image width to height ratio
+[filters.image_width_to_height_quotient]
+min = 0.1
+max = 10
+
+# put class specific filters here ['signature', 'formula', 'logo']
+[filters.overrides.signature.image_to_page_quotient]
+max = 0.4
+
+[filters.overrides.logo.image_to_page_quotient]
+min = 0.06
+
+
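The `[filters.*]` tables above define global min/max thresholds plus per-class overrides (for example a tighter `max` for `signature` images). As a rough illustration of how such a layered lookup could be resolved (a hypothetical helper, not code from this repository), the class-specific override can simply be merged over the global bounds before a value is checked:

```python
FILTERS = {
    "image_to_page_quotient": {"min": 0.05, "max": 0.75},
    "image_width_to_height_quotient": {"min": 0.1, "max": 10},
    "overrides": {
        "signature": {"image_to_page_quotient": {"max": 0.4}},
        "logo": {"image_to_page_quotient": {"min": 0.06}},
    },
}


def passes(filter_name: str, value: float, predicted_class: str | None = None) -> bool:
    """Check a value against the global bounds, patched by any class-specific override."""
    bounds = dict(FILTERS[filter_name])
    override = FILTERS["overrides"].get(predicted_class or "", {}).get(filter_name, {})
    bounds.update(override)
    return bounds.get("min", float("-inf")) <= value <= bounds.get("max", float("inf"))


passes("image_to_page_quotient", 0.5, "signature")  # False: 0.5 exceeds the signature max of 0.4
passes("image_to_page_quotient", 0.5)               # True under the global bounds
```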
@ -1,40 +0,0 @@
|
|||||||
"""Implements a config object with dot-indexing syntax."""
|
|
||||||
|
|
||||||
|
|
||||||
from envyaml import EnvYAML
|
|
||||||
|
|
||||||
from image_prediction.locations import CONFIG_FILE
|
|
||||||
|
|
||||||
|
|
||||||
def _get_item_and_maybe_make_dotindexable(container, item):
|
|
||||||
ret = container[item]
|
|
||||||
return DotIndexable(ret) if isinstance(ret, dict) else ret
|
|
||||||
|
|
||||||
|
|
||||||
class DotIndexable:
|
|
||||||
def __init__(self, x):
|
|
||||||
self.x = x
|
|
||||||
|
|
||||||
def __getattr__(self, item):
|
|
||||||
return _get_item_and_maybe_make_dotindexable(self.x, item)
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return self.x.__repr__()
|
|
||||||
|
|
||||||
def __getitem__(self, item):
|
|
||||||
return self.__getattr__(item)
|
|
||||||
|
|
||||||
|
|
||||||
class Config:
|
|
||||||
def __init__(self, config_path):
|
|
||||||
self.__config = EnvYAML(config_path)
|
|
||||||
|
|
||||||
def __getattr__(self, item):
|
|
||||||
if item in self.__config:
|
|
||||||
return _get_item_and_maybe_make_dotindexable(self.__config, item)
|
|
||||||
|
|
||||||
def __getitem__(self, item):
|
|
||||||
return self.__getattr__(item)
|
|
||||||
|
|
||||||
|
|
||||||
CONFIG = Config(CONFIG_FILE)
|
|
||||||
@ -1,186 +0,0 @@
|
|||||||
import atexit
|
|
||||||
import io
|
|
||||||
from functools import partial, lru_cache
|
|
||||||
from itertools import chain, starmap, filterfalse
|
|
||||||
from operator import itemgetter
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import fitz
|
|
||||||
from PIL import Image
|
|
||||||
from funcy import rcompose, merge, pluck, curry, compose
|
|
||||||
|
|
||||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
|
||||||
from image_prediction.info import Info
|
|
||||||
from image_prediction.stitching.stitching import stitch_pairs
|
|
||||||
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
|
|
||||||
from image_prediction.utils import get_logger
|
|
||||||
from image_prediction.utils.generic import lift
|
|
||||||
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
class ParsablePDFImageExtractor(ImageExtractor):
|
|
||||||
def __init__(self, verbose=False, tolerance=0):
|
|
||||||
"""
|
|
||||||
|
|
||||||
Args:
|
|
||||||
verbose: Whether to show progressbar
|
|
||||||
tolerance: The tolerance in pixels for the distance images beyond which they will not be stitched together
|
|
||||||
"""
|
|
||||||
self.doc: fitz.fitz.Document = None
|
|
||||||
self.verbose = verbose
|
|
||||||
self.tolerance = tolerance
|
|
||||||
|
|
||||||
def extract(self, pdf: bytes, page_range: range = None):
|
|
||||||
self.doc = fitz.Document(stream=pdf)
|
|
||||||
|
|
||||||
pages = extract_pages(self.doc, page_range) if page_range else self.doc
|
|
||||||
|
|
||||||
image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))
|
|
||||||
|
|
||||||
yield from image_metadata_pairs
|
|
||||||
|
|
||||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
|
||||||
images = get_images_on_page(self.doc, page)
|
|
||||||
metadata = get_metadata_for_images_on_page(self.doc, page)
|
|
||||||
clear_caches()
|
|
||||||
|
|
||||||
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
|
||||||
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
|
||||||
|
|
||||||
yield from image_metadata_pairs
|
|
||||||
|
|
||||||
|
|
||||||
def extract_pages(doc, page_range):
|
|
||||||
page_range = range(page_range.start + 1, page_range.stop + 1)
|
|
||||||
pages = map(doc.load_page, page_range)
|
|
||||||
|
|
||||||
yield from pages
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
|
||||||
def get_images_on_page(doc, page: fitz.Page):
|
|
||||||
image_infos = get_image_infos(page)
|
|
||||||
xrefs = map(itemgetter("xref"), image_infos)
|
|
||||||
images = map(partial(xref_to_image, doc), xrefs)
|
|
||||||
|
|
||||||
yield from images
|
|
||||||
|
|
||||||
|
|
||||||
def get_metadata_for_images_on_page(doc, page: fitz.Page):
|
|
||||||
|
|
||||||
metadata = map(get_image_metadata, get_image_infos(page))
|
|
||||||
metadata = validate_coords_and_passthrough(metadata)
|
|
||||||
|
|
||||||
metadata = filter_out_tiny_images(metadata)
|
|
||||||
metadata = validate_size_and_passthrough(metadata)
|
|
||||||
|
|
||||||
metadata = add_page_metadata(page, metadata)
|
|
||||||
|
|
||||||
metadata = add_alpha_channel_info(doc, page, metadata)
|
|
||||||
|
|
||||||
yield from metadata
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
|
||||||
def get_image_infos(page: fitz.Page) -> List[dict]:
|
|
||||||
return page.get_image_info(xrefs=True)
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
|
||||||
def xref_to_image(doc, xref) -> Image:
|
|
||||||
maybe_image = load_image_handle_from_xref(doc, xref)
|
|
||||||
return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None
|
|
||||||
|
|
||||||
|
|
||||||
def get_image_metadata(image_info):
|
|
||||||
|
|
||||||
x1, y1, x2, y2 = map(rounder, image_info["bbox"])
|
|
||||||
|
|
||||||
width = abs(x2 - x1)
|
|
||||||
height = abs(y2 - y1)
|
|
||||||
|
|
||||||
return {
|
|
||||||
Info.WIDTH: width,
|
|
||||||
Info.HEIGHT: height,
|
|
||||||
Info.X1: x1,
|
|
||||||
Info.X2: x2,
|
|
||||||
Info.Y1: y1,
|
|
||||||
Info.Y2: y2,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def validate_coords_and_passthrough(metadata):
|
|
||||||
yield from map(validate_box_coords, metadata)
|
|
||||||
|
|
||||||
|
|
||||||
def filter_out_tiny_images(metadata):
|
|
||||||
yield from filterfalse(tiny, metadata)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_size_and_passthrough(metadata):
|
|
||||||
yield from map(validate_box_size, metadata)
|
|
||||||
|
|
||||||
|
|
||||||
def add_page_metadata(page, metadata):
|
|
||||||
yield from map(partial(merge, get_page_metadata(page)), metadata)
|
|
||||||
|
|
||||||
|
|
||||||
def add_alpha_channel_info(doc, page, metadata):
|
|
||||||
|
|
||||||
page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
|
|
||||||
xref_to_alpha = partial(has_alpha_channel, doc)
|
|
||||||
page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
|
|
||||||
alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
|
|
||||||
page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)
|
|
||||||
|
|
||||||
metadata = starmap(merge, zip(page_to_alpha_mapping_per_image(page), metadata))
|
|
||||||
|
|
||||||
yield from metadata
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
|
||||||
def load_image_handle_from_xref(doc, xref):
|
|
||||||
return doc.extract_image(xref)
|
|
||||||
|
|
||||||
|
|
||||||
rounder = rcompose(round, int)
|
|
||||||
|
|
||||||
|
|
||||||
def get_page_metadata(page):
|
|
||||||
page_width, page_height = map(rounder, page.mediabox_size)
|
|
||||||
|
|
||||||
return {
|
|
||||||
Info.PAGE_WIDTH: page_width,
|
|
||||||
Info.PAGE_HEIGHT: page_height,
|
|
||||||
Info.PAGE_IDX: page.number,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def has_alpha_channel(doc, xref):
|
|
||||||
|
|
||||||
maybe_image = load_image_handle_from_xref(doc, xref)
|
|
||||||
maybe_smask = maybe_image["smask"] if maybe_image else None
|
|
||||||
|
|
||||||
if maybe_smask:
|
|
||||||
return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
return bool(fitz.Pixmap(doc, xref).alpha)
|
|
||||||
except ValueError:
|
|
||||||
logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def tiny(metadata):
|
|
||||||
return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4
|
|
||||||
|
|
||||||
|
|
||||||
def clear_caches():
|
|
||||||
get_image_infos.cache_clear()
|
|
||||||
load_image_handle_from_xref.cache_clear()
|
|
||||||
get_images_on_page.cache_clear()
|
|
||||||
xref_to_image.cache_clear()
|
|
||||||
|
|
||||||
|
|
||||||
atexit.register(clear_caches)
|
|
||||||
@ -1,17 +0,0 @@
|
|||||||
"""Defines constant paths relative to the module root path."""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
MODULE_DIR = Path(__file__).resolve().parents[0]
|
|
||||||
|
|
||||||
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
|
|
||||||
|
|
||||||
CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
|
|
||||||
|
|
||||||
BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
|
|
||||||
|
|
||||||
DATA_DIR = PACKAGE_ROOT_DIR / "data"
|
|
||||||
|
|
||||||
MLRUNS_DIR = str(DATA_DIR / "mlruns")
|
|
||||||
|
|
||||||
TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data"
|
|
||||||
@ -1,15 +0,0 @@
|
|||||||
from itertools import starmap
|
|
||||||
|
|
||||||
from funcy import iterate, first, curry, map
|
|
||||||
|
|
||||||
|
|
||||||
def until(cond, func, *args, **kwargs):
|
|
||||||
return first(filter(cond, iterate(func, *args, **kwargs)))
|
|
||||||
|
|
||||||
|
|
||||||
def lift(fn):
|
|
||||||
return curry(map)(fn)
|
|
||||||
|
|
||||||
|
|
||||||
def starlift(fn):
|
|
||||||
return curry(starmap)(fn)
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
import logging
|
|
||||||
|
|
||||||
from image_prediction.config import CONFIG
|
|
||||||
|
|
||||||
|
|
||||||
def make_logger_getter():
|
|
||||||
logger = logging.getLogger("imclf")
|
|
||||||
logger.propagate = False
|
|
||||||
|
|
||||||
handler = logging.StreamHandler()
|
|
||||||
handler.setLevel(CONFIG.service.logging_level)
|
|
||||||
|
|
||||||
log_format = "%(asctime)s %(levelname)-8s %(message)s"
|
|
||||||
formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
|
|
||||||
|
|
||||||
handler.setFormatter(formatter)
|
|
||||||
logger.addHandler(handler)
|
|
||||||
|
|
||||||
logger.setLevel(CONFIG.service.logging_level)
|
|
||||||
|
|
||||||
def get_logger():
|
|
||||||
return logger
|
|
||||||
|
|
||||||
return get_logger
|
|
||||||
|
|
||||||
|
|
||||||
get_logger = make_logger_getter()
|
|
||||||
7267 poetry.lock generated Normal file
File diff suppressed because it is too large
73 pyproject.toml Normal file
@@ -0,0 +1,73 @@
+[tool.poetry]
+name = "image-classification-service"
+version = "2.17.0"
+description = ""
+authors = ["Team Research <research@knecon.com>"]
+readme = "README.md"
+packages = [{ include = "image_prediction", from = "src" }]
+
+[tool.poetry.dependencies]
+python = ">=3.10,<3.11"
+# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
+# see RED-9948.
+pyinfra = { version = "3.4.2", source = "gitlab-research" }
+kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
+dvc = "^2.34.0"
+dvc-ssh = "^2.20.0"
+dvc-azure = "^2.21.2"
+Flask = "^2.1.1"
+requests = "^2.27.1"
+iteration-utilities = "^0.11.0"
+waitress = "^2.1.1"
+envyaml = "^1.10.211231"
+dependency-check = "^0.6.0"
+mlflow = "^1.24.0"
+numpy = "^1.22.3"
+tqdm = "^4.64.0"
+pandas = "^1.4.2"
+# FIXME: Our current model significantly changes the prediction behaviour when using newer tensorflow (/ protobuf)
+# versions which is introduuced by pyinfra updates using newer protobuf versions, see RED-9948.
+tensorflow = "2.9.0"
+protobuf = "^3.20"
+pytest = "^7.1.0"
+funcy = "^2"
+PyMuPDF = "^1.19.6"
+fpdf = "^1.7.2"
+coverage = "^6.3.2"
+Pillow = "^9.1.0"
+pdf2image = "^1.16.0"
+frozendict = "^2.3.0"
+fsspec = "^2022.11.0"
+PyMonad = "^2.4.0"
+pdfnetpython3 = "9.4.2"
+loguru = "^0.7.0"
+cyclonedx-bom = "^4.5.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.0.1"
+pymonad = "^2.4.0"
+pylint = "^2.17.4"
+ipykernel = "^6.23.2"
+
+[tool.pytest.ini_options]
+testpaths = ["test"]
+addopts = "--ignore=data"
+filterwarnings = ["ignore:.*:DeprecationWarning"]
+
+[[tool.poetry.source]]
+name = "PyPI"
+priority = "primary"
+
+[[tool.poetry.source]]
+name = "gitlab-research"
+url = "https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi/simple"
+priority = "explicit"
+
+[[tool.poetry.source]]
+name = "gitlab-red"
+url = "https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi/simple"
+priority = "explicit"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
@@ -1,5 +0,0 @@
-[pytest]
-norecursedirs = incl
-filterwarnings =
-    ignore:.*:DeprecationWarning
-    ignore:.*:DeprecationWarning
@@ -1,25 +0,0 @@
-Flask==2.1.1
-requests==2.27.1
-iteration-utilities==0.11.0
-dvc==2.10.0
-dvc[ssh]
-waitress==2.1.1
-envyaml==1.10.211231
-dependency-check==0.6.*
-mlflow==1.24.0
-numpy==1.22.3
-tqdm==4.64.0
-pandas==1.4.2
-tensorflow==2.8.0
-PyYAML==6.0
-pytest~=7.1.0
-funcy==1.17
-PyMuPDF==1.19.6
-fpdf==1.7.2
-coverage==6.3.2
-Pillow==9.1.0
-PDFNetPython3==9.1.0
-pdf2image==1.16.0
-frozendict==2.3.0
-protobuf<=3.20.*
-prometheus-client==0.13.1
46 scripts/debug/debug.py Normal file
@ -0,0 +1,46 @@
|
|||||||
|
"""Script to debug RED-9948. The predictions unexpectedly changed for some images, and we need to understand why."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
from kn_utils.logging import logger
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
|
from image_prediction.pipeline import load_pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def process_pdf(pipeline, pdf_path, page_range=None):
|
||||||
|
with open(pdf_path, "rb") as f:
|
||||||
|
logger.info(f"Processing {pdf_path}")
|
||||||
|
predictions = list(pipeline(f.read(), page_range=page_range))
|
||||||
|
|
||||||
|
return predictions
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_seeds():
|
||||||
|
seed = 42
|
||||||
|
np.random.seed(seed)
|
||||||
|
random.seed(seed)
|
||||||
|
tf.random.set_seed(seed)
|
||||||
|
|
||||||
|
|
||||||
|
def debug_info():
|
||||||
|
devices = tf.config.list_physical_devices()
|
||||||
|
print("Available devices:", devices)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# For in container debugging, copy the file and adjust the path.
|
||||||
|
debug_file_path = Path(__file__).parents[2] / "test" / "data" / "RED-9948" / "SYNGENTA_EFSA_sanitisation_GFL_v2"
|
||||||
|
ensure_seeds()
|
||||||
|
debug_info()
|
||||||
|
|
||||||
|
pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
|
||||||
|
predictions = process_pdf(pipeline, debug_file_path)
|
||||||
|
# This is the image that has the wrong prediction mentioned in RED-9948. The predictions should inconclusive, and
|
||||||
|
# the flag all passed should be false.
|
||||||
|
predictions = [x for x in predictions if x["representation"] == "FA30F080F0C031CE17E8CF237"]
|
||||||
|
print(json.dumps(predictions, indent=2))
|
||||||
30 scripts/devenvsetup.sh Normal file
@ -0,0 +1,30 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
python_version=$1
|
||||||
|
gitlab_user=$2
|
||||||
|
gitlab_personal_access_token=$3
|
||||||
|
|
||||||
|
# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
|
||||||
|
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
|
||||||
|
|
||||||
|
# cd $latest_dir
|
||||||
|
|
||||||
|
pyenv install $python_version
|
||||||
|
pyenv local $python_version
|
||||||
|
pyenv shell $python_version
|
||||||
|
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install poetry
|
||||||
|
|
||||||
|
poetry config installer.max-workers 10
|
||||||
|
# research package registry
|
||||||
|
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
|
||||||
|
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
|
||||||
|
# redactmanager package registry
|
||||||
|
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
|
||||||
|
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}
|
||||||
|
|
||||||
|
poetry env use $(pyenv which python)
|
||||||
|
poetry install --with=dev
|
||||||
|
poetry update
|
||||||
|
|
||||||
|
source .venv/bin/activate
|
||||||
6 scripts/docker_build_run.sh Normal file
@@ -0,0 +1,6 @@
+docker build -t --platform linux/amd64 image-clsasification-service:$(poetry version -s)-dev \
+    -f Dockerfile \
+    --build-arg GITLAB_USER=$GITLAB_USER \
+    --build-arg GITLAB_ACCESS_TOKEN=$GITLAB_ACCESS_TOKEN \
+    . && \
+    docker run -it --rm image-clsasification-service:$(poetry version -s)-dev
3 scripts/docker_tag_push.sh Normal file
@@ -0,0 +1,3 @@
+docker tag image-clsasification-service:$(poetry version -s)-dev $NEXUS_REGISTRY/red/image-clsasification-service:$(poetry version -s)-dev
+
+docker push $NEXUS_REGISTRY/red/image-clsasification-service:$(poetry version -s)-dev
6 scripts/k8s_startup_probe.py Normal file
@@ -0,0 +1,6 @@
+from pyinfra.k8s_probes import startup
+from loguru import logger
+
+if __name__ == "__main__":
+    logger.debug("running health check")
+    startup.run_checks()
@ -3,12 +3,15 @@ import json
|
|||||||
import os
|
import os
|
||||||
from glob import glob
|
from glob import glob
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
from image_prediction.pipeline import load_pipeline
|
from image_prediction.pipeline import load_pipeline
|
||||||
from image_prediction.utils import get_logger
|
from image_prediction.utils import get_logger
|
||||||
from image_prediction.utils.pdf_annotation import annotate_pdf
|
from image_prediction.utils.pdf_annotation import annotate_pdf
|
||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
logger.setLevel("DEBUG")
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
@ -35,7 +38,7 @@ def process_pdf(pipeline, pdf_path, page_range=None):
|
|||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
pipeline = load_pipeline(verbose=True, tolerance=3)
|
pipeline = load_pipeline(verbose=CONFIG.service.verbose, batch_size=CONFIG.service.batch_size, tolerance=CONFIG.service.image_stiching_tolerance)
|
||||||
|
|
||||||
if os.path.isfile(args.input):
|
if os.path.isfile(args.input):
|
||||||
pdf_paths = [args.input]
|
pdf_paths = [args.input]
|
||||||
|
|||||||
13 setup.py
@@ -1,13 +0,0 @@
-#!/usr/bin/env python
-
-from distutils.core import setup
-
-setup(
-    name="image_prediction",
-    version="0.1.0",
-    description="",
-    author="",
-    author_email="",
-    url="",
-    packages=["image_prediction"],
-)
@@ -1,4 +0,0 @@
-sonar.exclusions=bamboo-specs/**, **/test_data/**
-sonar.c.file.suffixes=-
-sonar.cpp.file.suffixes=-
-sonar.objc.file.suffixes=-
13 src/image_prediction/__init__.py Normal file
@ -0,0 +1,13 @@
|
|||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# log config
|
||||||
|
LOG_FORMAT = "%(asctime)s [%(levelname)s] - [%(filename)s -> %(funcName)s() -> %(lineno)s] : %(message)s"
|
||||||
|
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
||||||
|
stream_handler = logging.StreamHandler(sys.stdout)
|
||||||
|
stream_handler_format = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)
|
||||||
|
stream_handler.setFormatter(stream_handler_format)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.propagate = False
|
||||||
|
logger.addHandler(stream_handler)
|
||||||
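A minimal usage sketch of this logging setup, assuming the get_logger helper used throughout this diff simply returns the package logger configured here:

import logging

# `logging.getLogger(__name__)` inside the package __init__ resolves to "image_prediction",
# so any module can fetch the same pre-configured logger by name.
logger = logging.getLogger("image_prediction")
logger.setLevel(logging.DEBUG)
logger.debug("stream handler and format were attached on package import")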
src/image_prediction/config.py (new file)
@@ -0,0 +1,7 @@
from pathlib import Path

from pyinfra.config.loader import load_settings

from image_prediction.locations import PROJECT_ROOT_DIR

CONFIG = load_settings(root_path=PROJECT_ROOT_DIR, settings_path="config")
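For reference, these are the settings the rest of this diff reads from CONFIG; the attribute names are taken from those call sites, while the inline descriptions are assumptions (the actual values live in config/settings.toml per locations.py and are not shown here):

from image_prediction.config import CONFIG

# Settings accessed elsewhere in this diff; comments are descriptive assumptions.
CONFIG.service.verbose                      # show progress bars
CONFIG.service.batch_size                   # classifier batch size
CONFIG.service.image_stiching_tolerance     # pixel tolerance for stitching images
CONFIG.service.mlflow_run_id                # MLflow run to load the model from
CONFIG.filters.is_scanned_page.tolerance    # how close to page size counts as a scanned page
CONFIG.filters.image_to_page_quotient.min   # minimum image/page size ratio on scanned pages
CONFIG.filters.image_to_page_quotient.max   # quotient above which an image counts as page-sized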
@@ -13,7 +13,7 @@ class HashEncoder(Encoder):
         yield from self.encode(images)
 
 
-def hash_image(image: Image.Image):
+def hash_image(image: Image.Image) -> str:
     """See: https://stackoverflow.com/a/49692185/3578468"""
     image = image.resize((10, 10), Image.ANTIALIAS)
     image = image.convert("L")
@@ -21,4 +21,6 @@ def hash_image(image: Image.Image):
     avg_pixel = sum(pixel_data) / len(pixel_data)
     bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
     hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
-    return hex_representation
+    # Note: For each 4 leading zeros, the hex representation will be shorter by one character.
+    # To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
+    return hex_representation.zfill(25)
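Why the padding matters: the 10x10 grayscale thumbnail yields 100 threshold bits, and every four leading zero bits shorten the hex form by one character, so zfill(25) pins every hash to 25 characters. A minimal sketch of just that arithmetic, with illustrative bit strings:

# 100 threshold bits always become exactly 25 hex characters once zero-padded.
bits_high = "1" + "0" * 99            # leading 1 bit -> full-length hex
bits_low = "0" * 7 + "1" + "0" * 92   # 7 leading zero bits -> shorter hex

for bits in (bits_high, bits_low):
    hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
    print(len(hex_representation), len(hex_representation.zfill(25)))  # prints "25 25", then "24 25"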
@@ -32,3 +32,11 @@ class IntentionalTestException(RuntimeError):
 
 class InvalidBox(Exception):
     pass
+
+
+class ParsingError(Exception):
+    pass
+
+
+class BadXref(ValueError):
+    pass
@@ -1,38 +1,14 @@
-import multiprocessing
-import traceback
 from typing import Callable
 
 from flask import Flask, request, jsonify
 from prometheus_client import generate_latest, CollectorRegistry, Summary
 
 from image_prediction.utils import get_logger
+from image_prediction.utils.process_wrapping import wrap_in_process
 
 logger = get_logger()
 
 
-def run_in_process(func):
-    p = multiprocessing.Process(target=func)
-    p.start()
-    p.join()
-
-
-def wrap_in_process(func_to_wrap):
-    def build_function_and_run_in_process(*args, **kwargs):
-        def func():
-            try:
-                result = func_to_wrap(*args, **kwargs)
-                return_dict["result"] = result
-            except:
-                logger.error(traceback.format_exc())
-
-        manager = multiprocessing.Manager()
-        return_dict = manager.dict()
-        run_in_process(func)
-        return return_dict.get("result", None)
-
-    return build_function_and_run_in_process
-
-
 def make_prediction_server(predict_fn: Callable):
     app = Flask(__name__)
     registry = CollectorRegistry(auto_describe=True)
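The process-isolation helpers removed here are now imported from image_prediction.utils.process_wrapping, which is not part of this diff. A hypothetical sketch of that module, assuming it keeps the semantics of the deleted code (run the call in a child process, hand the result back through a manager dict, log tracebacks instead of raising), presumably so a crash in the prediction code cannot take down the Flask worker:

# Hypothetical sketch of image_prediction/utils/process_wrapping.py; the real module
# is not shown in this diff, this only mirrors the behaviour of the code deleted above.
import multiprocessing
import traceback

from image_prediction.utils import get_logger

logger = get_logger()


def wrap_in_process(func_to_wrap):
    """Run `func_to_wrap` in a child process and return its result via a manager dict."""

    def build_function_and_run_in_process(*args, **kwargs):
        manager = multiprocessing.Manager()
        return_dict = manager.dict()

        def func():
            try:
                return_dict["result"] = func_to_wrap(*args, **kwargs)
            except Exception:
                # Swallow the error but keep the traceback in the service log.
                logger.error(traceback.format_exc())

        p = multiprocessing.Process(target=func)
        p.start()
        p.join()
        return return_dict.get("result", None)

    return build_function_and_run_in_process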
src/image_prediction/image_extractor/extractors/parsable.py (new file)
@@ -0,0 +1,300 @@
import atexit
import json
import traceback
from _operator import itemgetter
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse, tee
from operator import itemgetter, truth
from typing import Iterable, Iterator, List, Union

import fitz
import numpy as np
from PIL import Image
from funcy import merge, pluck, compose, rcompose, remove, keep
from scipy.stats import gmean

from image_prediction.config import CONFIG
from image_prediction.exceptions import InvalidBox
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box
from image_prediction.transformer.transformers.response import compute_geometric_quotient
from image_prediction.utils import get_logger

logger = get_logger()


class ParsablePDFImageExtractor(ImageExtractor):
    def __init__(self, verbose=False, tolerance=0):
        """

        Args:
            verbose: Whether to show progressbar
            tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
                together
        """
        self.doc: fitz.Document = None
        self.verbose = verbose
        self.tolerance = tolerance

    def extract(self, pdf: bytes, page_range: range = None):
        self.doc = fitz.Document(stream=pdf)

        pages = extract_pages(self.doc, page_range) if page_range else self.doc

        image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))

        yield from image_metadata_pairs

    def __process_images_on_page(self, page: fitz.Page):
        metadata = extract_valid_metadata(self.doc, page)
        images = get_images_on_page(self.doc, metadata)

        clear_caches()

        image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
        # TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
        # validation here. Invalid images can then be split into a different stream and joined with the intact images
        # again for the formatting step.
        image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
        image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)

        yield from image_metadata_pairs

    @staticmethod
    def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
        def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
            """See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
            filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
            corrupt and is dropped.
            TODO: find cleaner solution
            """
            try:
                image.resize((100, 100)).convert("RGB")
                return ImageMetadataPair(image, metadata)
            except (OSError, Exception) as err:
                metadata = json.dumps(EnumFormatter()(metadata), indent=2)
                logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
                return None

        def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
            """See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
            heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.

            The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
            of the width and height of the page. If the ratio is below the threshold, the image is dropped.
            """

            def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
                tolerance = CONFIG.filters.is_scanned_page.tolerance
                width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
                height_ratio = (
                    image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
                )
                return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance

            def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
                min_ratio = CONFIG.filters.image_to_page_quotient.min
                metadatum = image_metadata_pair.metadata
                image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
                page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
                ratio = image_gmean / page_gmean
                return ratio >= min_ratio

            pairs, pairs_copy = tee(image_metadata_pairs)

            if any(map(image_is_a_scanned_page, pairs_copy)):
                logger.debug("Scanned page detected, filtering out small images ...")
                return filter(image_fits_geometric_mean_ratio, pairs)
            else:
                return pairs

        image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)

        return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))


def extract_pages(doc, page_range):
    page_range = range(page_range.start + 1, page_range.stop + 1)
    pages = map(doc.load_page, page_range)

    yield from pages


def get_images_on_page(doc, metadata):
    xrefs = pluck(Info.XREF, metadata)
    images = map(partial(xref_to_image, doc), xrefs)

    yield from images


def extract_valid_metadata(doc: fitz.Document, page: fitz.Page):
    metadata = get_metadata_for_images_on_page(page)
    metadata = filter_valid_metadata(metadata)
    metadata = add_alpha_channel_info(doc, metadata)

    return list(metadata)


def get_metadata_for_images_on_page(page: fitz.Page):
    metadata = map(get_image_metadata, get_image_infos(page))
    metadata = add_page_metadata(page, metadata)

    yield from metadata


def filter_valid_metadata(metadata):
    yield from compose(
        # TODO: Disabled for now, since atm since the backend needs atm the metadata and the hash of every image, even
        # scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images
        # and giving the user the ability to reclassify false positives with a separate call.
        # filter_out_page_sized_images,
        filter_out_tiny_images,
        filter_out_invalid_metadata,
    )(metadata)


def filter_out_invalid_metadata(metadata):
    def __validate_box(box):
        try:
            return validate_box(box)
        except InvalidBox as err:
            logger.debug(f"Dropping invalid metadatum, reason: {err}")

    yield from keep(__validate_box, metadata)


def filter_out_page_sized_images(metadata):
    yield from remove(breaches_image_to_page_quotient, metadata)


def filter_out_tiny_images(metadata):
    yield from filterfalse(tiny, metadata)


@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    return page.get_image_info(xrefs=True)


@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Union[Image.Image, None]:
    # NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
    try:
        pixmap = fitz.Pixmap(doc, xref)
        array = convert_pixmap_to_array(pixmap)
        return Image.fromarray(array)
    except ValueError:
        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
        return


def convert_pixmap_to_array(pixmap: fitz.Pixmap):
    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
    array = _normalize_channels(array)
    return array


def _normalize_channels(array: np.ndarray):
    if array.shape[-1] == 1:
        array = array[:, :, 0]
    elif array.shape[-1] == 4:
        array = array[..., :3]
    elif array.shape[-1] != 3:
        logger.warning(f"Unexpected image format: {array.shape}.")
        raise ValueError(f"Unexpected image format: {array.shape}.")

    return array


def get_image_metadata(image_info):
    xref, coords = itemgetter("xref", "bbox")(image_info)
    x1, y1, x2, y2 = map(rounder, coords)

    width = abs(x2 - x1)
    height = abs(y2 - y1)

    return {
        Info.WIDTH: width,
        Info.HEIGHT: height,
        Info.X1: x1,
        Info.X2: x2,
        Info.Y1: y1,
        Info.Y2: y2,
        Info.XREF: xref,
    }


def add_page_metadata(page, metadata):
    yield from map(partial(merge, get_page_metadata(page)), metadata)


def add_alpha_channel_info(doc, metadata):
    def add_alpha_value_to_metadatum(metadatum):
        alpha = metadatum_to_alpha_value(metadatum)
        return {**metadatum, Info.ALPHA: alpha}

    xref_to_alpha = partial(has_alpha_channel, doc)
    metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))

    yield from map(add_alpha_value_to_metadatum, metadata)


@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
    try:
        return doc.extract_image(xref)
    except ValueError:
        logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
        return


rounder = rcompose(round, int)


def get_page_metadata(page):
    page_width, page_height = map(rounder, page.mediabox_size)

    return {
        Info.PAGE_WIDTH: page_width,
        Info.PAGE_HEIGHT: page_height,
        Info.PAGE_IDX: page.number,
    }


def has_alpha_channel(doc, xref):
    maybe_image = load_image_handle_from_xref(doc, xref)
    maybe_smask = maybe_image["smask"] if maybe_image else None

    if maybe_smask:
        return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
    else:
        try:
            return bool(fitz.Pixmap(doc, xref).alpha)
        except ValueError:
            logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
            return False


def tiny(metadata):
    return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4


def clear_caches():
    get_image_infos.cache_clear()
    load_image_handle_from_xref.cache_clear()
    xref_to_image.cache_clear()


atexit.register(clear_caches)


def breaches_image_to_page_quotient(metadatum):
    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
        Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
    )(metadatum)
    geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
    quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
    return quotient_breached
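The size heuristic in filter_small_images_on_scanned_pages is easiest to see with concrete numbers; a small standalone sketch, where the page size and the minimum ratio are illustrative assumptions (the real threshold comes from CONFIG.filters.image_to_page_quotient.min):

from scipy.stats import gmean

page_w, page_h = 595, 842    # roughly an A4 page in points (illustrative)
image_w, image_h = 40, 60    # a small, stamp-sized image (illustrative)

# Geometric mean of the image dimensions divided by that of the page dimensions.
ratio = gmean([image_w, image_h]) / gmean([page_w, page_h])
print(round(ratio, 3))       # ~0.069 -> dropped if a scanned page was detected and min_ratio were 0.1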
@@ -12,3 +12,4 @@ class Info(Enum):
     Y1 = "y1"
     Y2 = "y2"
     ALPHA = "alpha"
+    XREF = "xref"
src/image_prediction/locations.py (new file)
@@ -0,0 +1,18 @@
"""Defines constant paths relative to the module root path."""

from pathlib import Path

# FIXME: move these paths to config, only depending on 'ROOT_PATH' environment variable.
MODULE_DIR = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
PROJECT_ROOT_DIR = PACKAGE_ROOT_DIR.parents[0]

CONFIG_FILE = PROJECT_ROOT_DIR / "config" / "settings.toml"
BANNER_FILE = PROJECT_ROOT_DIR / "banner.txt"

DATA_DIR = PROJECT_ROOT_DIR / "data"
MLRUNS_DIR = str(DATA_DIR / "mlruns")

TEST_DIR = PROJECT_ROOT_DIR / "test"
TEST_DATA_DIR = TEST_DIR / "data"
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
@@ -1,8 +1,10 @@
 import os
-from functools import partial
+from functools import lru_cache, partial
 from itertools import chain, tee
+from typing import Iterable, Any
 
 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
+from kn_utils.logging import logger
 from tqdm import tqdm
 
 from image_prediction.config import CONFIG
@@ -19,7 +21,9 @@ from image_prediction.utils.generic import lift, starlift
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
 
+@lru_cache(maxsize=None)
 def load_pipeline(**kwargs):
+    logger.info(f"Loading pipeline with kwargs: {kwargs}")
     model_loader = get_mlflow_model_loader(MLRUNS_DIR)
     model_identifier = CONFIG.service.mlflow_run_id
 
@@ -37,7 +41,7 @@ def star(f):
 
 
 class Pipeline:
-    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
+    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs):
         self.verbose = verbose
 
         extract = get_extractor(**kwargs)
@@ -51,7 +55,7 @@ class Pipeline:
         join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
 
         # />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat
+        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
         # \>--identity--/
 
         self.pipe = rcompose(
@@ -60,6 +64,7 @@ class Pipeline:
             pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
             join,  # ... the streams by zipping
             reformat,  # ... the items
+            filter_duplicates,  # ... filter out duplicate images
         )
 
     def __call__(self, pdf: bytes, page_range: range = None):
@@ -69,3 +74,32 @@ class Pipeline:
             unit=" images",
             disable=not self.verbose,
         )
+
+
+def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
+    """Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
+    `allPassed` set to True.
+    See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
+    """
+    keep = dict()
+    for image_meta in metadata:
+        key: tuple[int, int, int, int, int] = (
+            image_meta["position"]["x1"],
+            image_meta["position"]["x2"],
+            image_meta["position"]["y1"],
+            image_meta["position"]["y2"],
+            image_meta["position"]["pageNumber"],
+        )
+        if key in keep:
+            logger.warning(
+                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
+            )
+            if image_meta["filters"]["allPassed"]:
+                logger.warning("Setting the image with allPassed flag set to True")
+                keep[key] = image_meta
+            else:
+                logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
+        else:
+            keep[key] = image_meta
+
+    yield from keep.values()
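A quick behavioural sketch of the new filter_duplicates step with two hand-made entries that share position and page; the field layout mirrors the keys accessed above, and all values are illustrative:

# Two entries at the same coordinates on the same page; only the one with allPassed=True survives.
duplicate_a = {
    "position": {"x1": 10, "x2": 200, "y1": 20, "y2": 120, "pageNumber": 3},
    "filters": {"allPassed": False},
}
duplicate_b = {**duplicate_a, "filters": {"allPassed": True}}

survivors = list(filter_duplicates([duplicate_a, duplicate_b]))
assert len(survivors) == 1 and survivors[0]["filters"]["allPassed"] is True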
Some files were not shown because too many files have changed in this diff.