Compare commits
178 Commits
refactorin...master
SHA1:
0027421628, 00740c91b8, a3d79eb9af, 373f9f2d01, 2429d90dd5, 2b85999258, 4b15d2c2ca, bf1ca8d6f9, 9a4b8cad2b, 28adb50330,
7a3fdf8fa4, 3fbcd65e9b, 90a60b4b7c, 526de8984c, 99cbf3c9bf, 986137e729, f950b96cfb, 2385d19bc2, 16f2f0d557, afa6fc34cb,
a192e05be2, d23034e38a, 4bc53cf88b, e737f64ed2, 4b099f0106, b3a58d6777, c888453cc6, bf9ab4b1a2, 9ff88a1e5d, c852434b75,
8655e25ec0, 103c19d4cd, 530001a0af, a6c11a9db5, 1796c1bcbb, f4b9ff54aa, 278b42e368, 9600e4ca23, 8485345dd1, d1a523c7d6,
278f54eaa7, 443c2614f9, 4102a564a3, 7f49642ba0, ba8d1dfdfe, 150d0d64e5, a024ddfcf7, 13cbfa4ddf, 75af55dbda, 499c501acf,
6163e29d6b, dadc0a4163, 729ce17de0, 88fbe077e6, f8ecef1054, 5f44cc6560, b60f4d0383, 87873cc3a3, 523ca1db7d, c25f6902e0,
9e336ecc01, 0efa2127d7, 501fd48d69, 4a825cb264, 694a6ccb33, 1d043f97fc, 7cac73f07b, 133fde67ba, 946cfff630, f73264874e,
d3868efb4e, f0c2282197, 57e1ec1a14, 8b9771373b, cd3ce653e1, d8075aad38, 2b3043bc1e, 3ad0345f4e, 134156f59d, 1205f2e0ed,
8ee966c721, 892742ef17, 06b1af9f1a, 0194ce3f7e, 41d08f7b5b, b91d5a0ab2, 7b37f3c913, c32005b841, 6406ce6b25, 4ecafb2977,
967c2fad1b, b74e79f113, 50c791f6ca, adb363842d, 81520b1a53, ed25af33ad, 1967945ff7, faf4d7ed0f, 7c7b038491, cd3e215776,
bc1bd96e6c, 2001e9d7f3, 846f127d3b, d4657f1ab1, ee99d76aab, 00b40c0632, c1ae8e6a4b, 0bdf5a726a, d505ac4e50, 7dca05a53d,
c1449134ec, 29c76e7ebf, ecc9f69d9c, 4bcadcd266, 9065ec1d12, d239368d70, b5dc5aa777, 54b7ba24e8, 463f4da92b, 79455f0dd6,
2bc9c24f6a, ea301b4df2, 5cdf93b923, 4d43e385c5, bd0279ddd1, 2995d5ee48, eff1bb4124, c478333111, 978f48e8f9, 94652aafe4,
c4416636c0, c0b41e77b8, 73f7491c8f, 2385584dcb, b880e892ec, 8c7349c2d1, c55777e339, 0f440bdb09, 436a32ad2b, 9ec6cc19ba,
2d385b0a73, 5bd5e0cf2b, 876260f403, 368c54a8be, 1490d27308, 4eb7f3c40a, 98dc001123, 25fc7d84b9, d63f8c4eaf, 549b2aac5c,
c72ef26a6c, 561a7f527c, 48dd52131d, 053837722b, 98e639d83f, 13d4427c78, 9763d2ca65, 521222eb96, ebfdc14265, e54819e687,
d1190f7efe, d13b8436e2, 520eee26e3, c1b96290df, 3405a34893, f787b957f8, 5d611d5fae, c14d00cac8, fd0e4dc3cf, 9f18ef9cd1,
6030f4055a, eb050a588b, d55f77e1fa, 1e65d672d7, e7d229c0d7, ddd8d4685e, eb18ae8719, a9d60654f5
63  .coveragerc  (new file)
@@ -0,0 +1,63 @@
# .coveragerc to control coverage.py
[run]
branch = True
parallel = True
command_line = -m pytest
concurrency = multiprocessing
omit =
    */site-packages/*
    */distutils/*
    */test/*
    */__init__.py
    */setup.py
    */venv/*
    */env/*
    */build_venv/*
    */build_env/*
    */utils/banner.py
    */utils/logger.py
    */src/*
source =
    image_prediction
relative_files = True
data_file = .coverage

[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    # Have to re-enable the standard pragma
    pragma: no cover

    # Don't complain about missing debug-only code:
    def __repr__
    if self\.debug

    # Don't complain if tests don't hit defensive assertion code:
    raise AssertionError
    raise NotImplementedError

    # Don't complain if non-runnable code isn't run:
    if 0:
    if __name__ == .__main__.:
omit =
    */site-packages/*
    */distutils/*
    */test/*
    */__init__.py
    */setup.py
    */venv/*
    */env/*
    */build_venv/*
    */build_env/*
    */utils/banner.py
    */utils/logger.py
    */src/*
    */pdf_annotation.py

ignore_errors = True

[html]
directory = reports

[xml]
output = reports/coverage.xml
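The configuration above is normally exercised through the coverage CLI (see the test command in README.md and Dockerfile_tests). Purely as an illustration, the same settings can also be driven from coverage.py's Python API; this is a minimal sketch, not code from the repository, and it assumes it is run from the repository root where .coveragerc lives.

```python
# Minimal sketch: run the test suite under coverage.py using the .coveragerc above.
import coverage
import pytest

cov = coverage.Coverage(config_file=".coveragerc")  # picks up the [run]/[report]/[xml] sections
cov.start()
exit_code = pytest.main(["test", "--tb=native", "-q"])  # run the tests in-process so they are traced
cov.stop()
cov.save()

cov.combine()                  # merge the per-process data files written because parallel = True
cov.report(show_missing=True)  # console report, like `coverage report -m`
cov.xml_report()               # writes reports/coverage.xml as configured in [xml]
print("pytest exit code:", exit_code)
```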
@@ -1,5 +1,8 @@
 [core]
-    remote = vector
+    remote = azure_remote
+    autostage = true
 ['remote "vector"']
-    url = ssh://vector.iqser.com/research/image_service/
+    url = ssh://vector.iqser.com/research/image-prediction/
     port = 22
+['remote "azure_remote"']
+    url = azure://image-classification-dvc/
11  .gitignore  (vendored)
@@ -1,7 +1,8 @@
 .vscode/
 *.h5
-/venv/
+*venv
 .idea/
+src/data
 
 !.gitignore
 *.project
@@ -32,6 +33,9 @@
 **/classpath-data.json
 **/dependencies-and-licenses-overview.txt
+
+.coverage
+.coverage\.*\.*
 
 
 *__pycache__
 *.egg-info*
@@ -44,7 +48,6 @@
 *misc
 
 /coverage_html_report/
-.coverage
 
 # Created by https://www.toptal.com/developers/gitignore/api/linux,pycharm
 # Edit at https://www.toptal.com/developers/gitignore?templates=linux,pycharm
@@ -170,6 +173,4 @@ fabric.properties
 # https://plugins.jetbrains.com/plugin/12206-codestream
 .idea/codestream.xml
 
 # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
-/image_prediction/data/mlruns/
-/data/mlruns/
51  .gitlab-ci.yml  (new file)
@@ -0,0 +1,51 @@
include:
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/dvc.gitlab-ci.yml"
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"

variables:
  NEXUS_PROJECT_DIR: red
  IMAGENAME: "${CI_PROJECT_NAME}"
  INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.

stages:
  - data
  - setup
  - tests
  - sonarqube
  - versioning
  - build
  - integration-tests
  - release

docker-build:
  extends: .docker-build
  needs:
    - job: dvc-pull
      artifacts: true
    - !reference [.needs-versioning, needs] # leave this line as is

###################
# INTEGRATION TESTS
trigger-integration-tests:
  extends: .integration-tests
  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
  # needs:
  #   - job: docker-build::model_name
  #     artifacts: true
  rules:
    - when: never

#########
# RELEASE
release:
  extends: .release
  needs:
    - !reference [.needs-versioning, needs] # leave this line as is
3  .gitmodules  (vendored, deleted)
@@ -1,3 +0,0 @@
[submodule "incl/redai_image"]
    path = incl/redai_image
    url = ssh://git@git.iqser.com:2222/rr/redai_image.git
1  .python-version  (new file)
@@ -0,0 +1 @@
3.10
80  Dockerfile
@@ -1,25 +1,73 @@
-ARG BASE_ROOT="nexus.iqser.com:5001/red/"
-ARG VERSION_TAG="latest"
-
-FROM ${BASE_ROOT}image-prediction-base:${VERSION_TAG}
-
-WORKDIR /app/service
-
-COPY src src
-COPY data data
-COPY image_prediction image_prediction
-COPY incl/redai_image/redai incl/redai_image/redai
-COPY setup.py setup.py
-COPY requirements.txt requirements.txt
-COPY config.yaml config.yaml
-
-# Install dependencies differing from base image.
-RUN python3 -m pip install -r requirements.txt
-RUN python3 -m pip install -e .
-RUN python3 -m pip install -e incl/redai_image/redai
+FROM python:3.10-slim AS builder
+
+ARG GITLAB_USER
+ARG GITLAB_ACCESS_TOKEN
+
+ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
+ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
+
+ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
+ARG POETRY_SOURCE_REF_RED=gitlab-red
+
+ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
+ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
+
+ARG VERSION=dev
+
+LABEL maintainer="Research <research@knecon.com>"
+LABEL version="${VERSION}"
+
+WORKDIR /app
+
+###########
+# ENV SETUP
+ENV PYTHONDONTWRITEBYTECODE=true
+ENV PYTHONUNBUFFERED=true
+ENV POETRY_HOME=/opt/poetry
+ENV PATH="$POETRY_HOME/bin:$PATH"
+
+RUN apt-get update && \
+    apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -sSL https://install.python-poetry.org | python3 -
+RUN poetry --version
+
+COPY pyproject.toml poetry.lock ./
+
+RUN poetry config virtualenvs.create true && \
+    poetry config virtualenvs.in-project true && \
+    poetry config installer.max-workers 10 && \
+    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry install --without=dev -vv --no-interaction --no-root
+
+###############
+# WORKING IMAGE
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# COPY SOURCE CODE FROM BUILDER IMAGE
+COPY --from=builder /app /app
+# COPY BILL OF MATERIALS (BOM)
+COPY bom.json /bom.json
+
+ENV PATH="/app/.venv/bin:$PATH"
+
+###################
+# COPY SOURCE CODE
+COPY ./src ./src
+COPY ./config ./config
+COPY ./data ./data
+COPY banner.txt ./
+
 EXPOSE 5000
 EXPOSE 8080
 
-CMD ["python3", "src/serve.py"]
+CMD [ "python", "src/serve.py"]
Deleted file:
@@ -1,25 +0,0 @@
FROM python:3.8 as builder1

# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"

# Upgrade pip.
RUN python -m pip install --upgrade pip

# Make a directory for the service files and copy the service repo into the container.
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt

# Install dependencies.
RUN python3 -m pip install -r requirements.txt

# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.8

WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"

WORKDIR /app/service
43  Dockerfile_tests  (new file)
@@ -0,0 +1,43 @@
FROM python:3.10

ARG USERNAME
ARG TOKEN
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
ARG VERSION=dev

LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"

WORKDIR /app

ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"

RUN curl -sSL https://install.python-poetry.org | python3 -

COPY ./data ./data
COPY ./test ./test
COPY ./config ./config
COPY ./src ./src
COPY pyproject.toml poetry.lock banner.txt config.yaml ./

RUN poetry config virtualenvs.create false && \
    poetry config installer.max-workers 10 && \
    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
    poetry install --without=dev -vv --no-interaction --no-root

EXPOSE 5000
EXPOSE 8080

RUN apt update --yes
RUN apt install vim --yes
RUN apt install poppler-utils --yes

CMD coverage run -m pytest test/ --tb=native -q -s -vvv -x && coverage combine && coverage report -m && coverage xml
136  README.md
@@ -1,25 +1,143 @@
-### Building
+### Setup
 
 Build base image
 ```bash
-setup/docker.sh
-```
-
-Build head image
-```bash
-docker build -f Dockerfile -t image-prediction . --build-arg BASE_ROOT=""
+docker build -t image-classification-image --progress=plain --no-cache \
+    -f Dockerfile \
+    --build-arg USERNAME=$GITLAB_USER \
+    --build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
+    .
 ```
 
 ### Usage
 
+#### Without Docker
+
+```bash
+py scripts/run_pipeline.py /path/to/a/pdf
+```
+
+#### With Docker
+
 Shell 1
 
 ```bash
-docker run --rm --net=host --rm image-prediction
+docker run --rm --net=host image-prediction
 ```
 
 Shell 2
 
 ```bash
-python scripts/pyinfra_mock.py --pdf_path /path/to/a/pdf
+python scripts/pyinfra_mock.py /path/to/a/pdf
 ```
+
+### Tests
+
+Run for example this command to execute all tests and get a coverage report:
+
+```bash
+coverage run -m pytest test --tb=native -q -s -vvv -x && coverage combine && coverage report -m
+```
+
+After having built the service container as specified above, you can also run tests in a container as follows:
+
+```bash
+./run_tests.sh
+```
+
+### Message Body Formats
+
+#### Request Format
+
+The request messages need to provide the fields `"dossierId"` and `"fileId"`. A request should look like this:
+
+```json
+{
+  "dossierId": "<string identifier>",
+  "fileId": "<string identifier>"
+}
+```
+
+Any additional keys are ignored.
+#### Response Format
+
+Response bodies contain information about the identified class of the image, the confidence of the classification, the
+position and size of the image as well as the results of additional convenience filters which can be configured through
+environment variables. A response body looks like this:
+
+```json
+{
+  "dossierId": "debug",
+  "fileId": "13ffa9851740c8d20c4c7d1706d72f2a",
+  "data": [...]
+}
+```
+
+An image metadata record (entry in `"data"` field of a response body) looks like this:
+
+```json
+{
+  "classification": {
+    "label": "logo",
+    "probabilities": {
+      "logo": 1.0,
+      "signature": 1.1599173226749333e-17,
+      "other": 2.994595513398207e-23,
+      "formula": 4.352109377281029e-31
+    }
+  },
+  "position": {
+    "x1": 475.95,
+    "x2": 533.4,
+    "y1": 796.47,
+    "y2": 827.62,
+    "pageNumber": 6
+  },
+  "geometry": {
+    "width": 57.44999999999999,
+    "height": 31.149999999999977
+  },
+  "alpha": false,
+  "filters": {
+    "geometry": {
+      "imageSize": {
+        "quotient": 0.05975350599135938,
+        "tooLarge": false,
+        "tooSmall": false
+      },
+      "imageFormat": {
+        "quotient": 1.8443017656500813,
+        "tooTall": false,
+        "tooWide": false
+      }
+    },
+    "probability": {
+      "unconfident": false
+    },
+    "allPassed": true
+  }
+}
+```
+
+## Configuration
+
+A configuration file is located under `config.yaml`. All relevant variables can be configured via
+exporting environment variables.
+
+| __Environment Variable__ | Default | Description |
+|---|---|---|
+| __LOGGING_LEVEL_ROOT__ | "INFO" | Logging level for log file messages |
+| __VERBOSE__ | *true* | Service prints document processing progress to stdout |
+| __BATCH_SIZE__ | 16 | Number of images in memory simultaneously per service instance |
+| __RUN_ID__ | "fabfb1f192c745369b88cab34471aba7" | The ID of the mlflow run to load the image classifier from |
+| __MIN_REL_IMAGE_SIZE__ | 0.05 | Minimally permissible image size to page size ratio |
+| __MAX_REL_IMAGE_SIZE__ | 0.75 | Maximally permissible image size to page size ratio |
+| __MIN_IMAGE_FORMAT__ | 0.1 | Minimally permissible image width to height ratio |
+| __MAX_IMAGE_FORMAT__ | 10 | Maximally permissible image width to height ratio |
+
+See also: https://git.iqser.com/projects/RED/repos/helm/browse/redaction/templates/image-service-v2
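As a rough illustration of the request and response formats documented above, the sketch below publishes a single request to the service's input queue and counts which returned records pass all filters. It is not part of the repository: it assumes a locally reachable RabbitMQ broker, the default request_queue/response_queue names from config/pyinfra.toml, and the third-party pika client, which is not among the project's dependencies.

```python
# Illustrative only: send one classification request and inspect a reply.
import json
import time

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()

request = {"dossierId": "debug", "fileId": "13ffa9851740c8d20c4c7d1706d72f2a"}
channel.basic_publish(exchange="", routing_key="request_queue", body=json.dumps(request))

# Check the response queue a few times; the service answers asynchronously.
body = None
for _ in range(30):
    _method, _properties, body = channel.basic_get(queue="response_queue", auto_ack=True)
    if body is not None:
        break
    time.sleep(1)

if body is not None:
    response = json.loads(body)
    records = response.get("data", [])
    passed = [rec for rec in records if rec["filters"]["allPassed"]]
    print(f"{len(passed)} of {len(records)} detected images passed all filters")

connection.close()
```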
Deleted file:
@@ -1,40 +0,0 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>com.atlassian.bamboo</groupId>
        <artifactId>bamboo-specs-parent</artifactId>
        <version>7.1.2</version>
        <relativePath/>
    </parent>

    <artifactId>bamboo-specs</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <sonar.skip>true</sonar.skip>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.atlassian.bamboo</groupId>
            <artifactId>bamboo-specs-api</artifactId>
        </dependency>
        <dependency>
            <groupId>com.atlassian.bamboo</groupId>
            <artifactId>bamboo-specs</artifactId>
        </dependency>

        <!-- Test dependencies -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <!-- run 'mvn test' to perform offline validation of the plan -->
    <!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>
Deleted file:
@@ -1,182 +0,0 @@
package buildjob;

import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;

/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "image-prediction";
    private static final String SERVICE_NAME_BASE = "image-prediction-base";

    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");

    /**
     * Run main to publish plan on Bamboo
     */
    public static void main(final String[] args) throws Exception {
        //By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");

        Plan plan = new PlanSpec().createDockerBuildPlan();
        bambooServer.publish(plan);
        PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);
    }

    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
                .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .loggedInUserPermissions(PermissionType.VIEW)
                .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }

    private Project project() {
        return new Project()
                .name("RED")
                .key(new BambooKey("RED"));
    }

    public Plan createDockerBuildPlan() {
        return new Plan(
                project(),
                SERVICE_NAME, new BambooKey(SERVICE_KEY))
                .description("Docker build for image-prediction.")
                // .variables()
                .stages(new Stage("Build Stage")
                        .jobs(
                            new Job("Build Job", new BambooKey("BUILD"))
                                .tasks(
                                    new CleanWorkingDirectoryTask()
                                        .description("Clean working directory.")
                                        .enabled(true),
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new VcsCheckoutTask()
                                        .description("Checkout redai_image research repository.")
                                        .checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
                                    new ScriptTask()
                                        .description("Set config and keys.")
                                        .inlineBody("mkdir -p ~/.ssh\n" +
                                            "echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
                                            "echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
                                            "echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
                                            "chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
                                    new ScriptTask()
                                        .description("Build Docker container.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
                                        .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                                        .volume("/var/run/docker.sock", "/var/run/docker.sock")),
                            new Job("Sonar Job", new BambooKey("SONAR"))
                                .tasks(
                                    new CleanWorkingDirectoryTask()
                                        .description("Clean working directory.")
                                        .enabled(true),
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new VcsCheckoutTask()
                                        .description("Checkout redai_image repository.")
                                        .checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
                                    new ScriptTask()
                                        .description("Set config and keys.")
                                        .inlineBody("mkdir -p ~/.ssh\n" +
                                            "echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
                                            "echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
                                            "echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
                                            "chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
                                    new ScriptTask()
                                        .description("Run Sonarqube scan.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
                                        .argument(SERVICE_NAME))
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                                        .volume("/var/run/docker.sock", "/var/run/docker.sock"))),
                    new Stage("Licence Stage")
                        .jobs(
                            new Job("Git Tag Job", new BambooKey("GITTAG"))
                                .tasks(
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new ScriptTask()
                                        .description("Build git tag.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/git-tag.sh"),
                                    new InjectVariablesTask()
                                        .description("Inject git tag.")
                                        .path("git.tag")
                                        .namespace("g")
                                        .scope(InjectVariablesScope.LOCAL),
                                    new VcsTagTask()
                                        .description("${bamboo.g.gitTag}")
                                        .tagName("${bamboo.g.gitTag}")
                                        .defaultRepository())
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
                            new Job("Licence Job", new BambooKey("LICENCE"))
                                .enabled(false)
                                .tasks(
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new ScriptTask()
                                        .description("Build licence.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                                        .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                                        .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
                .linkedRepositories("RR / " + SERVICE_NAME)
                .linkedRepositories("RR / redai_image")
                .triggers(new BitbucketServerTrigger())
                .planBranchManagement(new PlanBranchManagement()
                        .createForVcsBranch()
                        .delete(new BranchCleanup()
                                .whenInactiveInRepositoryAfterDays(14))
                        .notificationForCommitters());
    }

}
Deleted file:
@@ -1,19 +0,0 @@
#!/bin/bash
set -e

if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
then
    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
        -f ${bamboo_build_working_directory}/pom.xml \
        versions:set \
        -DnewVersion=${bamboo_version_tag}

    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
        -f ${bamboo_build_working_directory}/pom.xml \
        -B clean deploy \
        -e -DdeployAtEnd=true \
        -Dmaven.wagon.http.ssl.insecure=true \
        -Dmaven.wagon.http.ssl.allowall=true \
        -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
        -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi
Deleted file:
@@ -1,19 +0,0 @@
#!/bin/bash
set -e

SERVICE_NAME=$1
SERVICE_NAME_BASE=$2

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
dvc pull

echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
Deleted file:
@@ -1,9 +0,0 @@
#!/bin/bash
set -e

if [[ "${bamboo_version_tag}" = "dev" ]]
then
    echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
else
    echo "gitTag=${bamboo_version_tag}" > git.tag
fi
Deleted file:
@@ -1,51 +0,0 @@
#!/bin/bash
set -e

export JAVA_HOME=/usr/bin/sonar-scanner/jre

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

echo "dev setup for unit test and coverage 💖"

pip install -e .
pip install -r requirements.txt

SERVICE_NAME=$1

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
    --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
    --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"

if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    /usr/bin/sonar-scanner/bin/sonar-scanner \
        -Dsonar.projectKey=RED_$SERVICE_NAME \
        -Dsonar.sources=image_prediction \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
        -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
        -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
        -Dsonar.python.coverage.reportPaths=reports/coverage.xml

else
    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
    /usr/bin/sonar-scanner/bin/sonar-scanner \
        -Dsonar.projectKey=RED_$SERVICE_NAME \
        -Dsonar.sources=image_prediction \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
        -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
        -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
        -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
        -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
        -Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi
Deleted file:
@@ -1,16 +0,0 @@
package buildjob;


import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;

public class PlanSpecTest {
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        Plan plan = new PlanSpec().createDockerBuildPlan();

        EntityPropertiesBuilders.build(plan);
    }
}
11  banner.txt  (new file)
@@ -0,0 +1,11 @@
+----------------------------------------------------+
| ___ |
| __/_ `. .-"""-. |
|_._ _,-'""`-._ \_,` | \-' / )`-')|
|(,-.`._,'( |\`-/| "") `"` \ ((`"` |
| `-.-' \ )-`( , o o) ___Y , .'7 /| |
| `- \`_`"'- (_,___/...-` (_/_/ |
| |
+----------------------------------------------------+
| Image Classification Service |
+----------------------------------------------------+
28  config.yaml  (deleted)
@@ -1,28 +0,0 @@
webserver:
  host: $SERVER_HOST|"127.0.0.1" # webserver address
  port: $SERVER_PORT|5000 # webserver port
  mode: $SERVER_MODE|production # webserver mode: {development, production}

service:
  logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for service logger
  batch_size: $BATCH_SIZE|32 # Number of images in memory simultaneously
  verbose: $VERBOSE|True # Service prints document processing progress to stdout
  run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the model from


# These variables control filters that are applied to either images, image metadata or model predictions. The filter
# result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the filters returned values did not meet its specified
# required value.
filters:

  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible

  image_width_to_height_quotient: # Image width to height ratio
    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible

  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
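The removed config.yaml relied on EnvYAML's "$VARIABLE|default" placeholders, which is why every setting could be overridden simply by exporting an environment variable of the same name. A minimal sketch of that mechanism, with a made-up override value:

```python
# Minimal sketch of the EnvYAML override mechanism used by the removed config.yaml.
import os

from envyaml import EnvYAML

os.environ["BATCH_SIZE"] = "16"      # overrides the "$BATCH_SIZE|32" default
config = EnvYAML("config.yaml")      # resolves $VAR|default placeholders on load

print(config["service.batch_size"])  # -> 16
print(config["webserver.port"])      # -> 5000, falls back to the default since SERVER_PORT is unset
```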
68  config/pyinfra.toml  (new file)
@@ -0,0 +1,68 @@

[asyncio]
max_concurrent_tasks = 10

[dynamic_tenant_queues]
enabled = true

[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"

[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"

[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"

[webserver]
host = "0.0.0.0"
port = 8080

[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"

tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds

service_request_queue_prefix = "image_request_queue"
service_request_exchange_name = "image_request_exchange"
service_response_exchange_name = "image_response_exchange"
service_dlq_name = "image_dlq"

[storage]
backend = "s3"

[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"

[storage.azure]
container = "redaction"
connection_string = ""

[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

[kubernetes]
pod_name = "test_pod"
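The rabbitmq comments above state an operational constraint: connection_sleep has to divide the heartbeat interval. A small illustrative check of the shipped values, assuming the third-party tomli package, since Python 3.10 has no standard-library TOML parser:

```python
# Illustrative only: load config/pyinfra.toml and sanity-check the RabbitMQ timing values.
import tomli  # third-party; assumed here because Python 3.10 ships no stdlib TOML reader

with open("config/pyinfra.toml", "rb") as handle:
    config = tomli.load(handle)

rabbitmq = config["rabbitmq"]
heartbeat = rabbitmq["heartbeat"]
connection_sleep = rabbitmq["connection_sleep"]

# connection_sleep should divide the heartbeat interval (see the comment in the config).
assert heartbeat % connection_sleep == 0, "connection_sleep must divide heartbeat"
print(f"RabbitMQ heartbeat={heartbeat}s, connection_sleep={connection_sleep}s")
```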
42  config/settings.toml  (new file)
@@ -0,0 +1,42 @@
[logging]
level = "INFO"

[service]
# Print document processing progress to stdout
verbose = false
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"

# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5

# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75

[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0

# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10

# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4

[filters.overrides.logo.image_to_page_quotient]
min = 0.06
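The [filters.overrides.*] tables layer class-specific limits on top of the global filter settings. The diff does not show how the service merges them, so the following sketch only illustrates the intended precedence (global values first, then per-label overrides):

```python
# Illustrative sketch of per-class filter overrides (not the service's actual code):
# class-specific values take precedence over the global filter settings.
global_filters = {"image_to_page_quotient": {"min": 0.05, "max": 0.75}}
overrides = {
    "signature": {"image_to_page_quotient": {"max": 0.4}},
    "logo": {"image_to_page_quotient": {"min": 0.06}},
}


def effective_filters(label: str) -> dict:
    """Merge the global filters with the overrides for one predicted class."""
    merged = {name: dict(values) for name, values in global_filters.items()}
    for name, values in overrides.get(label, {}).items():
        merged.setdefault(name, {}).update(values)
    return merged


print(effective_filters("signature"))  # {'image_to_page_quotient': {'min': 0.05, 'max': 0.4}}
print(effective_filters("other"))      # falls back to the global values
```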
1  data/.gitignore  (vendored, new file)
@@ -0,0 +1 @@
/mlruns
Deleted file:
@@ -1,4 +0,0 @@
outs:
- md5: 6d0186c1f25e889d531788f168fa6cf0
  size: 16727296
  path: base_weights.h5
@@ -1,5 +1,5 @@
 outs:
-- md5: d1c708270bab6fcd344d4a8b05d1103d.dir
-  size: 150225383
-  nfiles: 178
+- md5: ad061d607f615afc149643f62dbf37cc.dir
+  size: 166952700
+  nfiles: 179
   path: mlruns
1  doc/tests.drawio  (new file)
@@ -0,0 +1 @@
<mxfile host="app.diagrams.net" modified="2022-03-17T15:35:10.371Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36" etag="b-CbBXg6FXQ9T3Px-oLc" version="17.1.1" type="device"><diagram id="tS3WR_Pr6QhNVK3FqSUP" name="Page-1">1ZZRT6QwEMc/DY8mQHdRX93z9JLbmNzGmNxbQ0daLQzpDrL46a/IsCzinneJcd0XaP+dtsN/fkADscg3V06WeokKbBCHahOIb0Ecnydzf22FphPmyXknZM6oTooGYWWegcWQ1cooWI8CCdGSKcdiikUBKY006RzW47B7tONdS5nBRFil0k7VO6NId+rZPBz0azCZ7neOQh7JZR/MwlpLhfWOJC4DsXCI1LXyzQJs613vSzfv+57RbWIOCvqXCZqW9PBref27aZ7xsQ5vTn/cnvAqT9JW/MCwJuNzR8dZU9Nb4bAqFLSrhYG4qLUhWJUybUdrX3uvacqt70W+yeuCI9jsTTja2uDxAcyBXONDeILonWN04hn366EQUR+jd4qQsCa59tl26cEe32CH/sOt+TueoCONGRbS/kQs2YkHIGoYbFkRvuUTqAmFr1zyu2LlUvhLdjG/HtJlQO/VfOq6AyvJPI3z+HAL4wlwpbp/2V0qODxzUTJmLjo4c8nEkxaWFXcLLPzt4ithKI4BQzHBMOc/l8UvAeLrj9/hQTw9NhBnxwDibB+IB+ZvdvZ5/PnucAx6Gds5S4rLPw==</diagram></mxfile>
Deleted file:
@@ -1,40 +0,0 @@
"""Implements a config object with dot-indexing syntax."""


from envyaml import EnvYAML

from image_prediction.locations import CONFIG_FILE


def _get_item_and_maybe_make_dotindexable(container, item):
    ret = container[item]
    return DotIndexable(ret) if isinstance(ret, dict) else ret


class DotIndexable:
    def __init__(self, x):
        self.x = x

    def __getattr__(self, item):
        return _get_item_and_maybe_make_dotindexable(self.x, item)

    def __setitem__(self, key, value):
        self.x[key] = value

    def __repr__(self):
        return self.x.__repr__()


class Config:
    def __init__(self, config_path):
        self.__config = EnvYAML(config_path)

    def __getattr__(self, item):
        if item in self.__config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)

    def __getitem__(self, item):
        return self.__getattr__(item)


CONFIG = Config(CONFIG_FILE)
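For reference, the removed Config wrapper gave dot-indexed access to the (likewise removed) config.yaml; a short illustration of how the other removed modules used it:

```python
# Illustration of the removed dot-indexing wrapper (values refer to the removed config.yaml).
from image_prediction.config import CONFIG

print(CONFIG.webserver.port)      # nested keys resolved through DotIndexable; 5000 unless SERVER_PORT is exported
print(CONFIG.service.batch_size)  # the same pattern appears in the removed predictor and response modules
```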
Deleted file:
@@ -1,14 +0,0 @@
from os import path

MODULE_DIR = path.dirname(path.abspath(__file__))
PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR)
REPO_ROOT_DIR = path.dirname(path.dirname(PACKAGE_ROOT_DIR))

DOCKER_COMPOSE_FILE = path.join(REPO_ROOT_DIR, "docker-compose.yaml")

CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
LOG_FILE = "/tmp/log.log"

DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
MLRUNS_DIR = path.join(DATA_DIR, "mlruns")
BASE_WEIGHTS = path.join(DATA_DIR, "base_weights.h5")
Deleted file:
@@ -1,116 +0,0 @@
import logging
from itertools import chain
from operator import itemgetter
from typing import List, Dict, Iterable

import numpy as np

from image_prediction.config import CONFIG
from image_prediction.locations import MLRUNS_DIR, BASE_WEIGHTS
from incl.redai_image.redai.redai.backend.model.model_handle import ModelHandle
from incl.redai_image.redai.redai.backend.pdf.image_extraction import extract_and_stitch
from incl.redai_image.redai.redai.utils.mlflow_reader import MlflowModelReader
from incl.redai_image.redai.redai.utils.shared import chunk_iterable


class Predictor:
    """`ModelHandle` wrapper. Forwards to wrapped model handle for prediction and produces structured output that is
    interpretable independently of the wrapped model (e.g. with regard to a .classes_ attribute).
    """

    def __init__(self, model_handle: ModelHandle = None):
        """Initializes a ServiceEstimator.

        Args:
            model_handle: ModelHandle object to forward to for prediction. By default, a model handle is loaded from the
                mlflow database via CONFIG.service.run_id.
        """
        try:
            if model_handle is None:
                reader = MlflowModelReader(run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR)
                self.model_handle = reader.get_model_handle(BASE_WEIGHTS)
            else:
                self.model_handle = model_handle

            self.classes = self.model_handle.model.classes_
            self.classes_readable = np.array(self.model_handle.classes)
            self.classes_readable_aligned = self.classes_readable[self.classes[list(range(len(self.classes)))]]
        except Exception as e:
            logging.info(f"Service estimator initialization failed: {e}")

    def __make_predictions_human_readable(self, probs: np.ndarray) -> List[Dict[str, float]]:
        """Translates an n x m matrix of probabilities over classes into an n-element list of mappings from classes to
        probabilities.

        Args:
            probs: probability matrix (items x classes)

        Returns:
            list of mappings from classes to probabilities.
        """
        classes = np.argmax(probs, axis=1)
        classes = self.classes[classes]
        classes_readable = [self.model_handle.classes[c] for c in classes]
        return classes_readable

    def predict(self, images: List, probabilities: bool = False, **kwargs):
        """Gathers predictions for list of images. Assigns each image a class and optionally a probability distribution
        over all classes.

        Args:
            images (List[PIL.Image]) : Images to gather predictions for.
            probabilities: Whether to return dictionaries of the following form instead of strings:
                {
                    "class": predicted class,
                    "probabilities": {
                        "class 1" : class 1 probability,
                        "class 2" : class 2 probability,
                        ...
                    }
                }

        Returns:
            By default the return value is a list of classes (meaningful class name strings). Alternatively a list of
            dictionaries with an additional probability field for estimated class probabilities per image can be
            returned.
        """
        X = self.model_handle.prep_images(list(images))

        probs_per_item = self.model_handle.model.predict_proba(X, **kwargs).astype(float)
        classes = self.__make_predictions_human_readable(probs_per_item)

        class2prob_per_item = [dict(zip(self.classes_readable_aligned, probs)) for probs in probs_per_item]
        class2prob_per_item = [
            dict(sorted(c2p.items(), key=itemgetter(1), reverse=True)) for c2p in class2prob_per_item
        ]

        predictions = [{"class": c, "probabilities": c2p} for c, c2p in zip(classes, class2prob_per_item)]

        return predictions if probabilities else classes


def extract_image_metadata_pairs(pdf_path: str, **kwargs):
    def image_is_large_enough(metadata: dict):
        x1, x2, y1, y2 = itemgetter("x1", "x2", "y1", "y2")(metadata)

        return abs(x1 - x2) > 2 and abs(y1 - y2) > 2

    yield from extract_and_stitch(pdf_path, convert_to_rgb=True, filter_fn=image_is_large_enough, **kwargs)


def classify_images(predictor, image_metadata_pairs: Iterable, batch_size: int = CONFIG.service.batch_size):
    def process_chunk(chunk):
        images, metadata = zip(*chunk)
        predictions = predictor.predict(images, probabilities=True)
        return predictions, metadata

    def predict(image_metadata_pair_generator):
        chunks = chunk_iterable(image_metadata_pair_generator, n=batch_size)
        return map(chain.from_iterable, zip(*map(process_chunk, chunks)))

    try:
        predictions, metadata = predict(image_metadata_pairs)
        return predictions, metadata

    except ValueError:
        return [], []
Deleted file:
@@ -1,71 +0,0 @@
"""Defines functions for constructing service responses."""


from itertools import starmap
from operator import itemgetter

import numpy as np

from image_prediction.config import CONFIG


def build_response(predictions: list, metadata: list) -> list:
    return list(starmap(build_image_info, zip(predictions, metadata)))


def build_image_info(prediction: dict, metadata: dict) -> dict:
    def compute_geometric_quotient():
        page_area_sqrt = np.sqrt(abs(page_width * page_height))
        image_area_sqrt = np.sqrt(abs(x2 - x1) * abs(y2 - y1))
        return image_area_sqrt / page_area_sqrt

    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
    )(metadata)

    quotient = compute_geometric_quotient()

    min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
    max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
    min_image_width_to_height_quotient_breached = bool(
        width / height < CONFIG.filters.image_width_to_height_quotient.min
    )
    max_image_width_to_height_quotient_breached = bool(
        width / height > CONFIG.filters.image_width_to_height_quotient.max
    )

    min_confidence_breached = bool(max(prediction["probabilities"].values()) < CONFIG.filters.min_confidence)
    prediction["label"] = prediction.pop("class")  # "class" as field name causes problem for Java objectmapper
    prediction["probabilities"] = {klass: np.round(prob, 6) for klass, prob in prediction["probabilities"].items()}

    image_info = {
        "classification": prediction,
        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": metadata["page_idx"] + 1},
        "geometry": {"width": width, "height": height},
        "filters": {
            "geometry": {
                "imageSize": {
                    "quotient": quotient,
                    "tooLarge": max_image_to_page_quotient_breached,
                    "tooSmall": min_image_to_page_quotient_breached,
                },
                "imageFormat": {
                    "quotient": width / height,
                    "tooTall": min_image_width_to_height_quotient_breached,
                    "tooWide": max_image_width_to_height_quotient_breached,
                },
            },
            "probability": {"unconfident": min_confidence_breached},
            "allPassed": not any(
                [
                    max_image_to_page_quotient_breached,
                    min_image_to_page_quotient_breached,
                    min_image_width_to_height_quotient_breached,
                    max_image_width_to_height_quotient_breached,
                    min_confidence_breached,
                ]
            ),
        },
    }

    return image_info
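The geometric quotient computed by the removed build_image_info helper is simply the square root of the image area divided by the square root of the page area. A quick worked example using the positions from the README's sample record; the page size is an assumed, roughly A4-sized value in points and is not taken from the repository:

```python
# Worked example of the geometric quotient: sqrt(image area) / sqrt(page area).
from math import sqrt

page_width, page_height = 595.0, 842.0          # assumed page size in points, for illustration only
x1, x2, y1, y2 = 475.95, 533.4, 796.47, 827.62  # positions from the README example record

image_area_sqrt = sqrt(abs(x2 - x1) * abs(y2 - y1))
page_area_sqrt = sqrt(abs(page_width * page_height))
quotient = image_area_sqrt / page_area_sqrt

print(round(quotient, 4))  # ~0.06, inside the global 0.05 to 0.75 bounds, so the size filter passes
```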
Deleted submodule reference:
@@ -1 +0,0 @@
Subproject commit 4c3b26d7673457aaa99e0663dad6950cd36da967
7267 poetry.lock (generated, Normal file)
File diff suppressed because it is too large
73 pyproject.toml (Normal file)
@@ -0,0 +1,73 @@
[tool.poetry]
name = "image-classification-service"
version = "2.17.0"
description = ""
authors = ["Team Research <research@knecon.com>"]
readme = "README.md"
packages = [{ include = "image_prediction", from = "src" }]

[tool.poetry.dependencies]
python = ">=3.10,<3.11"
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
# see RED-9948.
pyinfra = { version = "3.4.2", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
dvc = "^2.34.0"
dvc-ssh = "^2.20.0"
dvc-azure = "^2.21.2"
Flask = "^2.1.1"
requests = "^2.27.1"
iteration-utilities = "^0.11.0"
waitress = "^2.1.1"
envyaml = "^1.10.211231"
dependency-check = "^0.6.0"
mlflow = "^1.24.0"
numpy = "^1.22.3"
tqdm = "^4.64.0"
pandas = "^1.4.2"
# FIXME: Our current model significantly changes the prediction behaviour when using newer tensorflow (/ protobuf)
# versions, which is introduced by pyinfra updates using newer protobuf versions, see RED-9948.
tensorflow = "2.9.0"
protobuf = "^3.20"
pytest = "^7.1.0"
funcy = "^2"
PyMuPDF = "^1.19.6"
fpdf = "^1.7.2"
coverage = "^6.3.2"
Pillow = "^9.1.0"
pdf2image = "^1.16.0"
frozendict = "^2.3.0"
fsspec = "^2022.11.0"
PyMonad = "^2.4.0"
pdfnetpython3 = "9.4.2"
loguru = "^0.7.0"
cyclonedx-bom = "^4.5.0"

[tool.poetry.group.dev.dependencies]
pytest = "^7.0.1"
pymonad = "^2.4.0"
pylint = "^2.17.4"
ipykernel = "^6.23.2"

[tool.pytest.ini_options]
testpaths = ["test"]
addopts = "--ignore=data"
filterwarnings = ["ignore:.*:DeprecationWarning"]

[[tool.poetry.source]]
name = "PyPI"
priority = "primary"

[[tool.poetry.source]]
name = "gitlab-research"
url = "https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi/simple"
priority = "explicit"

[[tool.poetry.source]]
name = "gitlab-red"
url = "https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi/simple"
priority = "explicit"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
@@ -1,21 +0,0 @@
Flask==2.0.2
requests==2.27.1
iteration-utilities==0.11.0
dvc==2.9.3
dvc[ssh]
frozendict==2.3.0
waitress==2.0.0
envyaml~=1.8.210417
dependency-check==0.6.*
envyaml~=1.8.210417
mlflow~=1.20.2
numpy~=1.19.3
PDFNetPython3~=9.1.0
tqdm~=4.62.2
pandas~=1.3.1
mlflow~=1.20.2
tensorflow~=2.5.0
PDFNetPython3~=9.1.0
Pillow~=8.3.2
PyYAML~=5.4.1
scikit_learn~=0.24.2
46 scripts/debug/debug.py (Normal file)
@@ -0,0 +1,46 @@
"""Script to debug RED-9948. The predictions unexpectedly changed for some images, and we need to understand why."""

import json
import random
from pathlib import Path

import numpy as np
import tensorflow as tf
from kn_utils.logging import logger

from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline


def process_pdf(pipeline, pdf_path, page_range=None):
    with open(pdf_path, "rb") as f:
        logger.info(f"Processing {pdf_path}")
        predictions = list(pipeline(f.read(), page_range=page_range))

    return predictions


def ensure_seeds():
    seed = 42
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)


def debug_info():
    devices = tf.config.list_physical_devices()
    print("Available devices:", devices)


if __name__ == "__main__":
    # For in-container debugging, copy the file and adjust the path.
    debug_file_path = Path(__file__).parents[2] / "test" / "data" / "RED-9948" / "SYNGENTA_EFSA_sanitisation_GFL_v2"
    ensure_seeds()
    debug_info()

    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
    predictions = process_pdf(pipeline, debug_file_path)
    # This is the image that has the wrong prediction mentioned in RED-9948. The prediction should be inconclusive,
    # and the allPassed flag should be false.
    predictions = [x for x in predictions if x["representation"] == "FA30F080F0C031CE17E8CF237"]
    print(json.dumps(predictions, indent=2))
30 scripts/devenvsetup.sh (Normal file)
@@ -0,0 +1,30 @@
#!/bin/bash
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3

# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created

# cd $latest_dir

pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version

pip install --upgrade pip
pip install poetry

poetry config installer.max-workers 10
# research package registry
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
# redactmanager package registry
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}

poetry env use $(pyenv which python)
poetry install --with=dev
poetry update

source .venv/bin/activate
6 scripts/docker_build_run.sh (Normal file)
@@ -0,0 +1,6 @@
docker build --platform linux/amd64 -t image-classification-service:$(poetry version -s)-dev \
    -f Dockerfile \
    --build-arg GITLAB_USER=$GITLAB_USER \
    --build-arg GITLAB_ACCESS_TOKEN=$GITLAB_ACCESS_TOKEN \
    . && \
docker run -it --rm image-classification-service:$(poetry version -s)-dev
3 scripts/docker_tag_push.sh (Normal file)
@@ -0,0 +1,3 @@
docker tag image-classification-service:$(poetry version -s)-dev $NEXUS_REGISTRY/red/image-classification-service:$(poetry version -s)-dev

docker push $NEXUS_REGISTRY/red/image-classification-service:$(poetry version -s)-dev
6 scripts/k8s_startup_probe.py (Normal file)
@@ -0,0 +1,6 @@
from pyinfra.k8s_probes import startup
from loguru import logger

if __name__ == "__main__":
    logger.debug("running health check")
    startup.run_checks()
58 scripts/keras_MnWE.py (Normal file)
@@ -0,0 +1,58 @@
import multiprocessing

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers


def process(predict_fn_wrapper):
    # We observed memory doesn't get properly deallocated unless we do this:
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(
        target=predict_fn_wrapper,
        args=(return_dict,),
    )
    p.start()
    p.join()
    try:
        return dict(return_dict)["result"]
    except KeyError:
        pass


def make_model():
    inputs = keras.Input(shape=(784,))
    dense = layers.Dense(64, activation="relu")
    x = dense(inputs)
    outputs = layers.Dense(10)(x)
    model = keras.Model(inputs=inputs, outputs=outputs, name="mnist_model")
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.RMSprop(),
        metrics=["accuracy"],
    )
    return model


def make_predict_fn():
    # Keras bug: doesn't work in outer scope
    model = make_model()

    def predict(*args):
        # service_estimator = make_model()
        return model.predict(np.random.random(size=(1, 784)))

    return predict


def make_predict_fn_wrapper(predict_fn):
    def predict_fn_wrapper(return_dict):
        return_dict["result"] = predict_fn()

    return predict_fn_wrapper


if __name__ == "__main__":
    predict_fn = make_predict_fn()
    print(process(make_predict_fn_wrapper(predict_fn)))
@@ -6,7 +6,7 @@ import requests

 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--pdf_path", required=True)
+    parser.add_argument("pdf_path")
     args = parser.parse_args()

     return args
58 scripts/run_pipeline.py (Normal file)
@@ -0,0 +1,58 @@
import argparse
import json
import os
from glob import glob

from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline
from image_prediction.utils import get_logger
from image_prediction.utils.pdf_annotation import annotate_pdf

logger = get_logger()

logger.setLevel("DEBUG")


def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("input", help="pdf file or directory")
    parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False)
    parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int)

    args = parser.parse_args()

    return args


def process_pdf(pipeline, pdf_path, page_range=None):
    with open(pdf_path, "rb") as f:
        logger.info(f"Processing {pdf_path}")
        predictions = list(pipeline(f.read(), page_range=page_range))

    annotate_pdf(
        pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf")))
    )

    return predictions


def main(args):
    pipeline = load_pipeline(
        verbose=CONFIG.service.verbose,
        batch_size=CONFIG.service.batch_size,
        tolerance=CONFIG.service.image_stiching_tolerance,
    )

    if os.path.isfile(args.input):
        pdf_paths = [args.input]
    else:
        pdf_paths = glob(os.path.join(args.input, "*.pdf"))
    page_range = range(*args.page_interval) if args.page_interval else None

    for pdf_path in pdf_paths:
        predictions = process_pdf(pipeline, pdf_path, page_range=page_range)
        if args.print:
            print(pdf_path)
            print(json.dumps(predictions, indent=2))


if __name__ == "__main__":
    args = parse_args()
    main(args)
15 scripts/run_tests.sh (Executable file)
@@ -0,0 +1,15 @@
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001

pip install dvc
pip install 'dvc[ssh]'
echo "Pulling dvc data"
dvc pull

docker build -f Dockerfile_tests -t image-prediction-tests .

rnd=$(date +"%s")
name=image-prediction-tests-${rnd}

echo "running tests container"

docker run --rm --name $name -v $PWD:$PWD -w $PWD -v /var/run/docker.sock:/var/run/docker.sock image-prediction-tests
13 setup.py
@@ -1,13 +0,0 @@
#!/usr/bin/env python

from distutils.core import setup

setup(
    name="image_prediction",
    version="0.1.0",
    description="",
    author="",
    author_email="",
    url="",
    packages=["image_prediction"],
)
@@ -1,15 +0,0 @@
#!/bin/bash
set -e

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
dvc pull

git submodule update --init --recursive

docker build -f Dockerfile_base -t image-prediction-base .
docker build -f Dockerfile -t image-prediction .
@@ -1,4 +0,0 @@
sonar.exclusions=bamboo-specs/**, **/test_data/**
sonar.c.file.suffixes=-
sonar.cpp.file.suffixes=-
sonar.objc.file.suffixes=-
13
src/image_prediction/__init__.py
Normal file
13
src/image_prediction/__init__.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# log config
|
||||||
|
LOG_FORMAT = "%(asctime)s [%(levelname)s] - [%(filename)s -> %(funcName)s() -> %(lineno)s] : %(message)s"
|
||||||
|
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
||||||
|
stream_handler = logging.StreamHandler(sys.stdout)
|
||||||
|
stream_handler_format = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)
|
||||||
|
stream_handler.setFormatter(stream_handler_format)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.propagate = False
|
||||||
|
logger.addHandler(stream_handler)
|
||||||
35
src/image_prediction/classifier/classifier.py
Normal file
35
src/image_prediction/classifier/classifier.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from typing import List, Union, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL.Image import Image
|
||||||
|
from funcy import rcompose
|
||||||
|
|
||||||
|
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
|
||||||
|
from image_prediction.label_mapper.mapper import LabelMapper
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class Classifier:
|
||||||
|
def __init__(self, estimator_adapter: EstimatorAdapter, label_mapper: LabelMapper):
|
||||||
|
"""Abstraction layer over different estimator backends (e.g. keras or scikit-learn). For each backend to be used
|
||||||
|
an EstimatorAdapter must be implemented.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
estimator_adapter: adapter for a given estimator backend
|
||||||
|
"""
|
||||||
|
self.__estimator_adapter = estimator_adapter
|
||||||
|
self.__label_mapper = label_mapper
|
||||||
|
self.__pipe = rcompose(self.__estimator_adapter, self.__label_mapper)
|
||||||
|
|
||||||
|
def predict(self, batch: Union[np.array, Tuple[Image]]) -> List[str]:
|
||||||
|
|
||||||
|
if isinstance(batch, np.ndarray) and batch.shape[0] == 0:
|
||||||
|
return []
|
||||||
|
|
||||||
|
return self.__pipe(batch)
|
||||||
|
|
||||||
|
def __call__(self, batch: np.array) -> List[str]:
|
||||||
|
logger.debug("Classifier.predict")
|
||||||
|
return self.predict(batch)
|
||||||
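A minimal assembly sketch for this class, assuming a hypothetical `dummy_estimator` callable in place of a real keras model and reusing the repository's `EstimatorAdapter` and `ProbabilityMapper`:

import numpy as np

from image_prediction.classifier.classifier import Classifier
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
from image_prediction.label_mapper.mappers.probability import ProbabilityMapper

def dummy_estimator(batch):
    # Stand-in for a keras model: returns one probability row per input image.
    return np.tile([0.7, 0.2, 0.1], (len(batch), 1))

classifier = Classifier(EstimatorAdapter(dummy_estimator), ProbabilityMapper(["logo", "photo", "signature"]))
# The adapter runs the estimator; the mapper turns each probability row into a label/probabilities dict.
print(list(classifier(np.zeros((2, 224, 224, 3)))))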
32
src/image_prediction/classifier/image_classifier.py
Normal file
32
src/image_prediction/classifier/image_classifier.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
from itertools import chain
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from PIL.Image import Image
|
||||||
|
from funcy import rcompose, chunks
|
||||||
|
|
||||||
|
from image_prediction.classifier.classifier import Classifier
|
||||||
|
from image_prediction.estimator.preprocessor.preprocessor import Preprocessor
|
||||||
|
from image_prediction.estimator.preprocessor.preprocessors.identity import IdentityPreprocessor
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ImageClassifier:
|
||||||
|
"""Combines a classifier with a preprocessing pipeline: Receives images, chunks into batches, converts to tensors,
|
||||||
|
applies transformations and finally sends to internal classifier.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, classifier: Classifier, preprocessor: Preprocessor = None):
|
||||||
|
self.estimator = classifier
|
||||||
|
self.preprocessor = preprocessor if preprocessor else IdentityPreprocessor()
|
||||||
|
self.pipe = rcompose(self.preprocessor, self.estimator)
|
||||||
|
|
||||||
|
def predict(self, images: Iterable[Image], batch_size=16):
|
||||||
|
batches = chunks(batch_size, images)
|
||||||
|
predictions = chain.from_iterable(map(self.pipe, batches))
|
||||||
|
return predictions
|
||||||
|
|
||||||
|
def __call__(self, images: Iterable[Image], batch_size=16):
|
||||||
|
logger.debug("ImageClassifier.predict")
|
||||||
|
yield from self.predict(images, batch_size=batch_size)
|
||||||
16
src/image_prediction/compositor/compositor.py
Normal file
16
src/image_prediction/compositor/compositor.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from funcy import rcompose
|
||||||
|
|
||||||
|
from image_prediction.transformer.transformer import Transformer
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerCompositor(Transformer):
|
||||||
|
def __init__(self, formatter: Transformer, *formatters: Transformer):
|
||||||
|
formatters = (formatter, *formatters)
|
||||||
|
self.pipe = rcompose(*formatters)
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
logger.debug("TransformerCompositor.transform")
|
||||||
|
return self.pipe(obj)
|
||||||
7
src/image_prediction/config.py
Normal file
7
src/image_prediction/config.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pyinfra.config.loader import load_settings
|
||||||
|
|
||||||
|
from image_prediction.locations import PROJECT_ROOT_DIR
|
||||||
|
|
||||||
|
CONFIG = load_settings(root_path=PROJECT_ROOT_DIR, settings_path="config")
|
||||||
43
src/image_prediction/default_objects.py
Normal file
43
src/image_prediction/default_objects.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
from funcy import juxt
|
||||||
|
|
||||||
|
from image_prediction.classifier.classifier import Classifier
|
||||||
|
from image_prediction.classifier.image_classifier import ImageClassifier
|
||||||
|
from image_prediction.compositor.compositor import TransformerCompositor
|
||||||
|
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
|
||||||
|
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
|
||||||
|
from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
|
||||||
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
|
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
|
||||||
|
from image_prediction.label_mapper.mappers.probability import ProbabilityMapper
|
||||||
|
from image_prediction.model_loader.loader import ModelLoader
|
||||||
|
from image_prediction.model_loader.loaders.mlflow import MlflowConnector
|
||||||
|
from image_prediction.redai_adapter.mlflow import MlflowModelReader
|
||||||
|
from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
|
||||||
|
from image_prediction.transformer.transformers.response import ResponseTransformer
|
||||||
|
|
||||||
|
|
||||||
|
def get_mlflow_model_loader(mlruns_dir):
|
||||||
|
model_loader = ModelLoader(MlflowConnector(MlflowModelReader(mlruns_dir)))
|
||||||
|
return model_loader
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_classifier(model_loader, model_identifier):
|
||||||
|
model, classes = juxt(model_loader.load_model, model_loader.load_classes)(model_identifier)
|
||||||
|
return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes)))
|
||||||
|
|
||||||
|
|
||||||
|
def get_extractor(**kwargs):
|
||||||
|
image_extractor = ParsablePDFImageExtractor(**kwargs)
|
||||||
|
|
||||||
|
return image_extractor
|
||||||
|
|
||||||
|
|
||||||
|
def get_formatter():
|
||||||
|
formatter = TransformerCompositor(
|
||||||
|
PDFNetCoordinateTransformer(), EnumFormatter(), ResponseTransformer(), Snake2CamelCaseKeyFormatter()
|
||||||
|
)
|
||||||
|
return formatter
|
||||||
|
|
||||||
|
|
||||||
|
def get_encoder():
|
||||||
|
return HashEncoder()
|
||||||
13
src/image_prediction/encoder/encoder.py
Normal file
13
src/image_prediction/encoder/encoder.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
import abc
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
|
|
||||||
|
class Encoder(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def encode(self, images: Iterable[Image]):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, images: Iterable[Image], batch_size=16):
|
||||||
|
yield from self.encode(images)
|
||||||
0
src/image_prediction/encoder/encoders/__init__.py
Normal file
0
src/image_prediction/encoder/encoders/__init__.py
Normal file
26
src/image_prediction/encoder/encoders/hash_encoder.py
Normal file
26
src/image_prediction/encoder/encoders/hash_encoder.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from image_prediction.encoder.encoder import Encoder
|
||||||
|
|
||||||
|
|
||||||
|
class HashEncoder(Encoder):
|
||||||
|
def encode(self, images: Iterable[Image.Image]):
|
||||||
|
yield from map(hash_image, images)
|
||||||
|
|
||||||
|
def __call__(self, images: Iterable[Image.Image], batch_size=16):
|
||||||
|
yield from self.encode(images)
|
||||||
|
|
||||||
|
|
||||||
|
def hash_image(image: Image.Image) -> str:
|
||||||
|
"""See: https://stackoverflow.com/a/49692185/3578468"""
|
||||||
|
image = image.resize((10, 10), Image.ANTIALIAS)
|
||||||
|
image = image.convert("L")
|
||||||
|
pixel_data = list(image.getdata())
|
||||||
|
avg_pixel = sum(pixel_data) / len(pixel_data)
|
||||||
|
bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
|
||||||
|
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
|
||||||
|
# Note: For each 4 leading zeros, the hex representation will be shorter by one character.
|
||||||
|
# To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
|
||||||
|
return hex_representation.zfill(25)
|
||||||
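A small usage sketch of `hash_image`, assuming the Pillow version pinned in pyproject.toml (where `Image.ANTIALIAS` still exists); the input image here is generated rather than extracted from a PDF:

from PIL import Image

from image_prediction.encoder.encoders.hash_encoder import hash_image

# A flat grey image: every pixel equals the average, so every bit of the 100-bit fingerprint is 1.
image = Image.new("RGB", (64, 64), color=(128, 128, 128))
representation = hash_image(image)
# The hash is always padded to 25 hex characters, so visually identical images compare equal as strings.
print(len(representation), representation)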
0
src/image_prediction/estimator/__init__.py
Normal file
0
src/image_prediction/estimator/__init__.py
Normal file
0
src/image_prediction/estimator/adapter/__init__.py
Normal file
0
src/image_prediction/estimator/adapter/__init__.py
Normal file
15
src/image_prediction/estimator/adapter/adapter.py
Normal file
15
src/image_prediction/estimator/adapter/adapter.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class EstimatorAdapter:
|
||||||
|
def __init__(self, estimator):
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def predict(self, batch):
|
||||||
|
return self.estimator(batch)
|
||||||
|
|
||||||
|
def __call__(self, batch):
|
||||||
|
logger.debug("EstimatorAdapter.predict")
|
||||||
|
return self.predict(batch)
|
||||||
10
src/image_prediction/estimator/preprocessor/preprocessor.py
Normal file
10
src/image_prediction/estimator/preprocessor/preprocessor.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
|
||||||
|
class Preprocessor(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def preprocess(self, batch):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, batch):
|
||||||
|
return self.preprocess(batch)
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
from image_prediction.estimator.preprocessor.preprocessor import Preprocessor
|
||||||
|
from image_prediction.estimator.preprocessor.utils import images_to_batch_tensor
|
||||||
|
|
||||||
|
|
||||||
|
class BasicPreprocessor(Preprocessor):
|
||||||
|
"""Converts images to tensors"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def preprocess(images):
|
||||||
|
return images_to_batch_tensor(images)
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
from image_prediction.estimator.preprocessor.preprocessor import Preprocessor
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityPreprocessor(Preprocessor):
|
||||||
|
@staticmethod
|
||||||
|
def preprocess(images):
|
||||||
|
return images
|
||||||
|
|
||||||
|
def __call__(self, images):
|
||||||
|
return self.preprocess(images)
|
||||||
10
src/image_prediction/estimator/preprocessor/utils.py
Normal file
10
src/image_prediction/estimator/preprocessor/utils.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import numpy as np
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
|
|
||||||
|
def image_to_normalized_tensor(image: Image) -> np.ndarray:
|
||||||
|
return np.array(image) / 255
|
||||||
|
|
||||||
|
|
||||||
|
def images_to_batch_tensor(images) -> np.ndarray:
|
||||||
|
return np.array(list(map(image_to_normalized_tensor, images)))
|
||||||
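A short sketch of `images_to_batch_tensor`; it assumes all images in the batch share the same dimensions so the stacked array is regular:

import numpy as np
from PIL import Image

from image_prediction.estimator.preprocessor.utils import images_to_batch_tensor

# Two same-sized images become one (2, 32, 32, 3) batch with values scaled to [0, 1].
images = [Image.new("RGB", (32, 32), color=(255, 0, 0)), Image.new("RGB", (32, 32), color=(0, 0, 255))]
batch = images_to_batch_tensor(images)
print(batch.shape, batch.max())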
42
src/image_prediction/exceptions.py
Normal file
42
src/image_prediction/exceptions.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
class UnknownEstimatorAdapter(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownImageExtractor(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownModelLoader(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownDatabaseType(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownLabelFormat(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnexpectedLabelFormat(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class IncorrectInstantiation(RuntimeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class IntentionalTestException(RuntimeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidBox(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ParsingError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class BadXref(ValueError):
|
||||||
|
pass
|
||||||
13
src/image_prediction/extraction.py
Normal file
13
src/image_prediction/extraction.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
|
||||||
|
|
||||||
|
|
||||||
|
def extract_images_from_pdf(pdf, extractor=None):
|
||||||
|
|
||||||
|
if not extractor:
|
||||||
|
extractor = ParsablePDFImageExtractor()
|
||||||
|
|
||||||
|
try:
|
||||||
|
images_extracted, metadata_extracted = zip(*extractor(pdf))
|
||||||
|
return images_extracted, metadata_extracted
|
||||||
|
except ValueError:
|
||||||
|
return [], []
|
||||||
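A minimal usage sketch of `extract_images_from_pdf`; the path `example.pdf` is hypothetical and stands for any parsable PDF:

from pathlib import Path

from image_prediction.extraction import extract_images_from_pdf

pdf_bytes = Path("example.pdf").read_bytes()
images, metadata = extract_images_from_pdf(pdf_bytes)
# Each image is a PIL Image; each metadatum carries page and bounding-box info keyed by the Info enum.
for image, metadatum in zip(images, metadata):
    print(image.size, metadatum)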
60
src/image_prediction/flask.py
Normal file
60
src/image_prediction/flask.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
from prometheus_client import generate_latest, CollectorRegistry, Summary
|
||||||
|
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
from image_prediction.utils.process_wrapping import wrap_in_process
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def make_prediction_server(predict_fn: Callable):
|
||||||
|
app = Flask(__name__)
|
||||||
|
registry = CollectorRegistry(auto_describe=True)
|
||||||
|
metric = Summary(
    "redactmanager_imageClassification_seconds", "Time spent on image-service classification.", registry=registry
)
|
||||||
|
|
||||||
|
@app.route("/ready", methods=["GET"])
|
||||||
|
def ready():
|
||||||
|
resp = jsonify("OK")
|
||||||
|
resp.status_code = 200
|
||||||
|
return resp
|
||||||
|
|
||||||
|
@app.route("/health", methods=["GET"])
|
||||||
|
def healthy():
|
||||||
|
resp = jsonify("OK")
|
||||||
|
resp.status_code = 200
|
||||||
|
return resp
|
||||||
|
|
||||||
|
def __failure():
|
||||||
|
response = jsonify("Analysis failed")
|
||||||
|
response.status_code = 500
|
||||||
|
return response
|
||||||
|
|
||||||
|
@app.route("/predict", methods=["POST"])
|
||||||
|
@app.route("/", methods=["POST"])
|
||||||
|
@metric.time()
|
||||||
|
def predict():
|
||||||
|
|
||||||
|
# Tensorflow does not free RAM. Workaround: Run prediction function (which instantiates a model) in sub-process.
|
||||||
|
# See: https://stackoverflow.com/questions/39758094/clearing-tensorflow-gpu-memory-after-model-execution
|
||||||
|
predict_fn_wrapped = wrap_in_process(predict_fn)
|
||||||
|
|
||||||
|
logger.info("Analysing...")
|
||||||
|
predictions = predict_fn_wrapped(request.data)
|
||||||
|
|
||||||
|
if predictions is not None:
|
||||||
|
response = jsonify(predictions)
|
||||||
|
logger.info("Analysis completed.")
|
||||||
|
return response
|
||||||
|
else:
|
||||||
|
logger.error("Analysis failed.")
|
||||||
|
return __failure()
|
||||||
|
|
||||||
|
@app.route("/prometheus", methods=["GET"])
|
||||||
|
def prometheus():
|
||||||
|
return generate_latest(registry=registry)
|
||||||
|
|
||||||
|
return app
|
||||||
0
src/image_prediction/formatter/__init__.py
Normal file
0
src/image_prediction/formatter/__init__.py
Normal file
15
src/image_prediction/formatter/formatter.py
Normal file
15
src/image_prediction/formatter/formatter.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
from image_prediction.transformer.transformer import Transformer
|
||||||
|
|
||||||
|
|
||||||
|
class Formatter(Transformer):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def format(self, obj):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def __call__(self, obj):
|
||||||
|
return self.format(obj)
|
||||||
11
src/image_prediction/formatter/formatters/camel_case.py
Normal file
11
src/image_prediction/formatter/formatters/camel_case.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from image_prediction.formatter.formatters.key_formatter import KeyFormatter
|
||||||
|
|
||||||
|
|
||||||
|
class Snake2CamelCaseKeyFormatter(KeyFormatter):
|
||||||
|
def format_key(self, key):
|
||||||
|
|
||||||
|
if isinstance(key, str):
|
||||||
|
head, *tail = key.split("_")
|
||||||
|
return head + "".join(map(str.title, tail))
|
||||||
|
else:
|
||||||
|
return key
|
||||||
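A short usage sketch of this key formatter; keys are rewritten recursively while values are left untouched (the payload below is invented):

from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter

formatter = Snake2CamelCaseKeyFormatter()
# Keys are converted inside nested dicts and lists as well.
payload = {"page_width": 595, "filters": [{"all_passed": True, "image_to_page_quotient": 0.4}]}
print(formatter(payload))
# {'pageWidth': 595, 'filters': [{'allPassed': True, 'imageToPageQuotient': 0.4}]}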
23
src/image_prediction/formatter/formatters/enum.py
Normal file
23
src/image_prediction/formatter/formatters/enum.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from image_prediction.formatter.formatters.key_formatter import KeyFormatter
|
||||||
|
|
||||||
|
|
||||||
|
class EnumFormatter(KeyFormatter):
|
||||||
|
def format_key(self, key):
|
||||||
|
return key.value if isinstance(key, Enum) else key
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class ReverseEnumFormatter(KeyFormatter):
|
||||||
|
def __init__(self, enum):
|
||||||
|
self.enum = enum
|
||||||
|
self.reverse_enum = {e.value: e for e in enum}
|
||||||
|
|
||||||
|
def format_key(self, key):
|
||||||
|
return self.reverse_enum.get(key, key)
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
raise NotImplementedError
|
||||||
6
src/image_prediction/formatter/formatters/identity.py
Normal file
6
src/image_prediction/formatter/formatters/identity.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from image_prediction.formatter.formatter import Formatter
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityFormatter(Formatter):
|
||||||
|
def format(self, obj):
|
||||||
|
return obj
|
||||||
28
src/image_prediction/formatter/formatters/key_formatter.py
Normal file
28
src/image_prediction/formatter/formatters/key_formatter.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
import abc
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from image_prediction.formatter.formatter import Formatter
|
||||||
|
|
||||||
|
|
||||||
|
class KeyFormatter(Formatter):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def format_key(self, key):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __format(self, data):
|
||||||
|
|
||||||
|
# If we wanted to do this properly, we would need handlers for all expected types and dispatch based
|
||||||
|
# on a type comparison. This is too much engineering for the limited use-case of this class though.
|
||||||
|
if isinstance(data, Iterable) and not isinstance(data, dict) and not isinstance(data, str):
|
||||||
|
f = map(self.__format, data)
|
||||||
|
return type(data)(f) if not isinstance(data, map) else f
|
||||||
|
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
return data
|
||||||
|
|
||||||
|
keys_formatted = list(map(self.format_key, data))
|
||||||
|
|
||||||
|
return dict(zip(keys_formatted, map(self.__format, data.values())))
|
||||||
|
|
||||||
|
def format(self, data):
|
||||||
|
return self.__format(data)
|
||||||
0
src/image_prediction/image_extractor/__init__.py
Normal file
0
src/image_prediction/image_extractor/__init__.py
Normal file
19
src/image_prediction/image_extractor/extractor.py
Normal file
19
src/image_prediction/image_extractor/extractor.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import abc
|
||||||
|
from collections import namedtuple
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
ImageMetadataPair = namedtuple("ImageMetadataPair", ["image", "metadata"])
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ImageExtractor(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def extract(self, obj) -> Iterable[ImageMetadataPair]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, obj, **kwargs):
|
||||||
|
logger.debug("ImageExtractor.extract")
|
||||||
|
return self.extract(obj, **kwargs)
|
||||||
7
src/image_prediction/image_extractor/extractors/mock.py
Normal file
7
src/image_prediction/image_extractor/extractors/mock.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
|
|
||||||
|
|
||||||
|
class ImageExtractorMock(ImageExtractor):
|
||||||
|
def extract(self, image_container):
|
||||||
|
for i, image in enumerate(image_container):
|
||||||
|
yield ImageMetadataPair(image, {"image_id": i})
|
||||||
300
src/image_prediction/image_extractor/extractors/parsable.py
Normal file
300
src/image_prediction/image_extractor/extractors/parsable.py
Normal file
@ -0,0 +1,300 @@
|
|||||||
|
import atexit
|
||||||
|
import json
|
||||||
|
import traceback
|
||||||
|
from _operator import itemgetter
|
||||||
|
from functools import partial, lru_cache
|
||||||
|
from itertools import chain, starmap, filterfalse, tee
|
||||||
|
from operator import itemgetter, truth
|
||||||
|
from typing import Iterable, Iterator, List, Union
|
||||||
|
|
||||||
|
import fitz
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
from funcy import merge, pluck, compose, rcompose, remove, keep
|
||||||
|
from scipy.stats import gmean
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
|
from image_prediction.exceptions import InvalidBox
|
||||||
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
|
from image_prediction.info import Info
|
||||||
|
from image_prediction.stitching.stitching import stitch_pairs
|
||||||
|
from image_prediction.stitching.utils import validate_box
|
||||||
|
from image_prediction.transformer.transformers.response import compute_geometric_quotient
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ParsablePDFImageExtractor(ImageExtractor):
|
||||||
|
def __init__(self, verbose=False, tolerance=0):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Args:
|
||||||
|
verbose: Whether to show progressbar
|
||||||
|
tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
|
||||||
|
together
|
||||||
|
"""
|
||||||
|
self.doc: fitz.Document = None
|
||||||
|
self.verbose = verbose
|
||||||
|
self.tolerance = tolerance
|
||||||
|
|
||||||
|
def extract(self, pdf: bytes, page_range: range = None):
|
||||||
|
self.doc = fitz.Document(stream=pdf)
|
||||||
|
|
||||||
|
pages = extract_pages(self.doc, page_range) if page_range else self.doc
|
||||||
|
|
||||||
|
image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))
|
||||||
|
|
||||||
|
yield from image_metadata_pairs
|
||||||
|
|
||||||
|
def __process_images_on_page(self, page: fitz.Page):
|
||||||
|
metadata = extract_valid_metadata(self.doc, page)
|
||||||
|
images = get_images_on_page(self.doc, metadata)
|
||||||
|
|
||||||
|
clear_caches()
|
||||||
|
|
||||||
|
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
||||||
|
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
|
||||||
|
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
||||||
|
# again for the formatting step.
|
||||||
|
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
||||||
|
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||||
|
|
||||||
|
yield from image_metadata_pairs
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
|
||||||
|
def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
|
||||||
|
"""See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
|
||||||
|
filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
|
||||||
|
corrupt and is dropped.
|
||||||
|
TODO: find cleaner solution
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
image.resize((100, 100)).convert("RGB")
|
||||||
|
return ImageMetadataPair(image, metadata)
|
||||||
|
except Exception:
|
||||||
|
metadata = json.dumps(EnumFormatter()(metadata), indent=2)
|
||||||
|
logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
|
||||||
|
"""See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
|
||||||
|
heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
|
||||||
|
|
||||||
|
The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
|
||||||
|
of the width and height of the page. If the ratio is below the threshold, the image is dropped.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
|
||||||
|
tolerance = CONFIG.filters.is_scanned_page.tolerance
|
||||||
|
width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
|
||||||
|
height_ratio = (
|
||||||
|
image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
|
||||||
|
)
|
||||||
|
return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
|
||||||
|
|
||||||
|
def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
|
||||||
|
min_ratio = CONFIG.filters.image_to_page_quotient.min
|
||||||
|
metadatum = image_metadata_pair.metadata
|
||||||
|
image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
|
||||||
|
page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
|
||||||
|
ratio = image_gmean / page_gmean
|
||||||
|
return ratio >= min_ratio
|
||||||
|
|
||||||
|
pairs, pairs_copy = tee(image_metadata_pairs)
|
||||||
|
|
||||||
|
if any(map(image_is_a_scanned_page, pairs_copy)):
|
||||||
|
logger.debug("Scanned page detected, filtering out small images ...")
|
||||||
|
return filter(image_fits_geometric_mean_ratio, pairs)
|
||||||
|
else:
|
||||||
|
return pairs
|
||||||
|
|
||||||
|
image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
|
||||||
|
|
||||||
|
return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages(doc, page_range):
|
||||||
|
page_range = range(page_range.start + 1, page_range.stop + 1)
|
||||||
|
pages = map(doc.load_page, page_range)
|
||||||
|
|
||||||
|
yield from pages
|
||||||
|
|
||||||
|
|
||||||
|
def get_images_on_page(doc, metadata):
|
||||||
|
xrefs = pluck(Info.XREF, metadata)
|
||||||
|
images = map(partial(xref_to_image, doc), xrefs)
|
||||||
|
|
||||||
|
yield from images
|
||||||
|
|
||||||
|
|
||||||
|
def extract_valid_metadata(doc: fitz.Document, page: fitz.Page):
|
||||||
|
metadata = get_metadata_for_images_on_page(page)
|
||||||
|
metadata = filter_valid_metadata(metadata)
|
||||||
|
metadata = add_alpha_channel_info(doc, metadata)
|
||||||
|
|
||||||
|
return list(metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def get_metadata_for_images_on_page(page: fitz.Page):
|
||||||
|
metadata = map(get_image_metadata, get_image_infos(page))
|
||||||
|
metadata = add_page_metadata(page, metadata)
|
||||||
|
|
||||||
|
yield from metadata
|
||||||
|
|
||||||
|
|
||||||
|
def filter_valid_metadata(metadata):
|
||||||
|
yield from compose(
|
||||||
|
# TODO: Disabled for now, since the backend currently needs the metadata and the hash of every image, even
# scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images
# and giving the user the ability to reclassify false positives with a separate call.
# filter_out_page_sized_images,
|
||||||
|
filter_out_tiny_images,
|
||||||
|
filter_out_invalid_metadata,
|
||||||
|
)(metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_out_invalid_metadata(metadata):
|
||||||
|
def __validate_box(box):
|
||||||
|
try:
|
||||||
|
return validate_box(box)
|
||||||
|
except InvalidBox as err:
|
||||||
|
logger.debug(f"Dropping invalid metadatum, reason: {err}")
|
||||||
|
|
||||||
|
yield from keep(__validate_box, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_out_page_sized_images(metadata):
|
||||||
|
yield from remove(breaches_image_to_page_quotient, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_out_tiny_images(metadata):
|
||||||
|
yield from filterfalse(tiny, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def get_image_infos(page: fitz.Page) -> List[dict]:
|
||||||
|
return page.get_image_info(xrefs=True)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def xref_to_image(doc, xref) -> Union[Image.Image, None]:
|
||||||
|
# NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
|
||||||
|
try:
|
||||||
|
pixmap = fitz.Pixmap(doc, xref)
|
||||||
|
array = convert_pixmap_to_array(pixmap)
|
||||||
|
return Image.fromarray(array)
|
||||||
|
except ValueError:
|
||||||
|
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def convert_pixmap_to_array(pixmap: fitz.Pixmap):
|
||||||
|
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||||
|
array = _normalize_channels(array)
|
||||||
|
return array
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_channels(array: np.ndarray):
|
||||||
|
if array.shape[-1] == 1:
|
||||||
|
array = array[:, :, 0]
|
||||||
|
elif array.shape[-1] == 4:
|
||||||
|
array = array[..., :3]
|
||||||
|
elif array.shape[-1] != 3:
|
||||||
|
logger.warning(f"Unexpected image format: {array.shape}.")
|
||||||
|
raise ValueError(f"Unexpected image format: {array.shape}.")
|
||||||
|
|
||||||
|
return array
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_metadata(image_info):
|
||||||
|
xref, coords = itemgetter("xref", "bbox")(image_info)
|
||||||
|
x1, y1, x2, y2 = map(rounder, coords)
|
||||||
|
|
||||||
|
width = abs(x2 - x1)
|
||||||
|
height = abs(y2 - y1)
|
||||||
|
|
||||||
|
return {
|
||||||
|
Info.WIDTH: width,
|
||||||
|
Info.HEIGHT: height,
|
||||||
|
Info.X1: x1,
|
||||||
|
Info.X2: x2,
|
||||||
|
Info.Y1: y1,
|
||||||
|
Info.Y2: y2,
|
||||||
|
Info.XREF: xref,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def add_page_metadata(page, metadata):
|
||||||
|
yield from map(partial(merge, get_page_metadata(page)), metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def add_alpha_channel_info(doc, metadata):
|
||||||
|
def add_alpha_value_to_metadatum(metadatum):
|
||||||
|
alpha = metadatum_to_alpha_value(metadatum)
|
||||||
|
return {**metadatum, Info.ALPHA: alpha}
|
||||||
|
|
||||||
|
xref_to_alpha = partial(has_alpha_channel, doc)
|
||||||
|
metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))
|
||||||
|
|
||||||
|
yield from map(add_alpha_value_to_metadatum, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def load_image_handle_from_xref(doc, xref):
|
||||||
|
try:
|
||||||
|
return doc.extract_image(xref)
|
||||||
|
except ValueError:
|
||||||
|
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
rounder = rcompose(round, int)
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_metadata(page):
|
||||||
|
page_width, page_height = map(rounder, page.mediabox_size)
|
||||||
|
|
||||||
|
return {
|
||||||
|
Info.PAGE_WIDTH: page_width,
|
||||||
|
Info.PAGE_HEIGHT: page_height,
|
||||||
|
Info.PAGE_IDX: page.number,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def has_alpha_channel(doc, xref):
|
||||||
|
maybe_image = load_image_handle_from_xref(doc, xref)
|
||||||
|
maybe_smask = maybe_image["smask"] if maybe_image else None
|
||||||
|
|
||||||
|
if maybe_smask:
|
||||||
|
return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
return bool(fitz.Pixmap(doc, xref).alpha)
|
||||||
|
except ValueError:
|
||||||
|
logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def tiny(metadata):
|
||||||
|
return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4
|
||||||
|
|
||||||
|
|
||||||
|
def clear_caches():
|
||||||
|
get_image_infos.cache_clear()
|
||||||
|
load_image_handle_from_xref.cache_clear()
|
||||||
|
xref_to_image.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
|
atexit.register(clear_caches)
|
||||||
|
|
||||||
|
|
||||||
|
def breaches_image_to_page_quotient(metadatum):
|
||||||
|
page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
|
||||||
|
Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
|
||||||
|
)(metadatum)
|
||||||
|
geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
|
||||||
|
quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
|
||||||
|
return quotient_breached
|
||||||
15
src/image_prediction/info.py
Normal file
15
src/image_prediction/info.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class Info(Enum):
|
||||||
|
PAGE_WIDTH = "page_width"
|
||||||
|
PAGE_HEIGHT = "page_height"
|
||||||
|
PAGE_IDX = "page_idx"
|
||||||
|
WIDTH = "width"
|
||||||
|
HEIGHT = "height"
|
||||||
|
X1 = "x1"
|
||||||
|
X2 = "x2"
|
||||||
|
Y1 = "y1"
|
||||||
|
Y2 = "y2"
|
||||||
|
ALPHA = "alpha"
|
||||||
|
XREF = "xref"
|
||||||
0
src/image_prediction/label_mapper/__init__.py
Normal file
0
src/image_prediction/label_mapper/__init__.py
Normal file
10
src/image_prediction/label_mapper/mapper.py
Normal file
10
src/image_prediction/label_mapper/mapper.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
|
||||||
|
class LabelMapper(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def map_labels(self, items):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, items):
|
||||||
|
return self.map_labels(items)
|
||||||
20
src/image_prediction/label_mapper/mappers/numeric.py
Normal file
20
src/image_prediction/label_mapper/mappers/numeric.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from typing import Mapping, Iterable
|
||||||
|
|
||||||
|
from image_prediction.exceptions import UnexpectedLabelFormat
|
||||||
|
from image_prediction.label_mapper.mapper import LabelMapper
|
||||||
|
|
||||||
|
|
||||||
|
class IndexMapper(LabelMapper):
|
||||||
|
def __init__(self, labels: Mapping[int, str]):
|
||||||
|
self.__labels = labels
|
||||||
|
|
||||||
|
def __validate_index_label_format(self, index_label: int) -> None:
|
||||||
|
if not 0 <= index_label < len(self.__labels):
|
||||||
|
raise UnexpectedLabelFormat(f"Received index label '{index_label}' that has no associated string label.")
|
||||||
|
|
||||||
|
def __map_label(self, index_label: int) -> str:
|
||||||
|
self.__validate_index_label_format(index_label)
|
||||||
|
return self.__labels[index_label]
|
||||||
|
|
||||||
|
def map_labels(self, index_labels: Iterable[int]) -> Iterable[str]:
|
||||||
|
return map(self.__map_label, index_labels)
|
||||||
39
src/image_prediction/label_mapper/mappers/probability.py
Normal file
39
src/image_prediction/label_mapper/mappers/probability.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
from enum import Enum
|
||||||
|
from operator import itemgetter
|
||||||
|
from typing import Mapping, Iterable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from funcy import rcompose, rpartial
|
||||||
|
|
||||||
|
from image_prediction.exceptions import UnexpectedLabelFormat
|
||||||
|
from image_prediction.label_mapper.mapper import LabelMapper
|
||||||
|
|
||||||
|
|
||||||
|
class ProbabilityMapperKeys(Enum):
|
||||||
|
LABEL = "label"
|
||||||
|
PROBABILITIES = "probabilities"
|
||||||
|
|
||||||
|
|
||||||
|
class ProbabilityMapper(LabelMapper):
|
||||||
|
def __init__(self, labels: Mapping[int, str]):
|
||||||
|
self.__labels = labels
|
||||||
|
# String conversion in the middle due to floating point precision issues.
|
||||||
|
# See: https://stackoverflow.com/questions/56820/round-doesnt-seem-to-be-rounding-properly
|
||||||
|
self.__rounder = rcompose(rpartial(round, 4), str, float)
|
||||||
|
|
||||||
|
def __validate_array_label_format(self, probabilities: np.ndarray) -> None:
|
||||||
|
if not len(probabilities) == len(self.__labels):
|
||||||
|
raise UnexpectedLabelFormat(
    f"Received {len(probabilities)} probabilities for {len(self.__labels)} labels."
)
|
||||||
|
|
||||||
|
def __map_array(self, probabilities: np.ndarray) -> dict:
|
||||||
|
self.__validate_array_label_format(probabilities)
|
||||||
|
cls2prob = dict(
|
||||||
|
sorted(zip(self.__labels, list(map(self.__rounder, probabilities))), key=itemgetter(1), reverse=True)
|
||||||
|
)
|
||||||
|
most_likely = [*cls2prob][0]
|
||||||
|
return {ProbabilityMapperKeys.LABEL: most_likely, ProbabilityMapperKeys.PROBABILITIES: cls2prob}
|
||||||
|
|
||||||
|
def map_labels(self, probabilities: Iterable[np.ndarray]) -> Iterable[dict]:
|
||||||
|
return map(self.__map_array, probabilities)
|
||||||
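A minimal usage sketch of `ProbabilityMapper`; the probability rows are invented and must have exactly one entry per label:

import numpy as np

from image_prediction.label_mapper.mappers.probability import ProbabilityMapper

mapper = ProbabilityMapper(["logo", "photo", "signature"])
rows = [np.array([0.1, 0.2, 0.7]), np.array([0.5, 0.3, 0.2])]
for result in mapper(rows):
    # Keys are ProbabilityMapperKeys enum members; EnumFormatter later turns them into plain strings.
    print(result)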
18
src/image_prediction/locations.py
Normal file
18
src/image_prediction/locations.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
"""Defines constant paths relative to the module root path."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# FIXME: move these paths to config, only depending on 'ROOT_PATH' environment variable.
|
||||||
|
MODULE_DIR = Path(__file__).resolve().parents[0]
|
||||||
|
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
|
||||||
|
PROJECT_ROOT_DIR = PACKAGE_ROOT_DIR.parents[0]
|
||||||
|
|
||||||
|
CONFIG_FILE = PROJECT_ROOT_DIR / "config" / "settings.toml"
|
||||||
|
BANNER_FILE = PROJECT_ROOT_DIR / "banner.txt"
|
||||||
|
|
||||||
|
DATA_DIR = PROJECT_ROOT_DIR / "data"
|
||||||
|
MLRUNS_DIR = str(DATA_DIR / "mlruns")
|
||||||
|
|
||||||
|
TEST_DIR = PROJECT_ROOT_DIR / "test"
|
||||||
|
TEST_DATA_DIR = TEST_DIR / "data"
|
||||||
|
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
|
||||||
0
src/image_prediction/model_loader/__init__.py
Normal file
0
src/image_prediction/model_loader/__init__.py
Normal file
7
src/image_prediction/model_loader/database/connector.py
Normal file
7
src/image_prediction/model_loader/database/connector.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
|
||||||
|
class DatabaseConnector(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def get_object(self, identifier):
|
||||||
|
raise NotImplementedError
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
from image_prediction.model_loader.database.connector import DatabaseConnector
|
||||||
|
|
||||||
|
|
||||||
|
class DatabaseConnectorMock(DatabaseConnector):
|
||||||
|
def __init__(self, store: dict):
|
||||||
|
self.store = store
|
||||||
|
|
||||||
|
def get_object(self, identifier):
|
||||||
|
return self.store[identifier]
|
||||||
18
src/image_prediction/model_loader/loader.py
Normal file
18
src/image_prediction/model_loader/loader.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
from image_prediction.model_loader.database.connector import DatabaseConnector
|
||||||
|
|
||||||
|
|
||||||
|
class ModelLoader:
|
||||||
|
def __init__(self, database_connector: DatabaseConnector):
|
||||||
|
self.database_connector = database_connector
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def __get_object(self, identifier):
|
||||||
|
return self.database_connector.get_object(identifier)
|
||||||
|
|
||||||
|
def load_model(self, identifier):
|
||||||
|
return self.__get_object(identifier)["model"]
|
||||||
|
|
||||||
|
def load_classes(self, identifier):
|
||||||
|
return self.__get_object(identifier)["classes"]
|
||||||
10
src/image_prediction/model_loader/loaders/mlflow.py
Normal file
10
src/image_prediction/model_loader/loaders/mlflow.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
from image_prediction.model_loader.database.connector import DatabaseConnector
|
||||||
|
from image_prediction.redai_adapter.mlflow import MlflowModelReader
|
||||||
|
|
||||||
|
|
||||||
|
class MlflowConnector(DatabaseConnector):
|
||||||
|
def __init__(self, mlflow_reader: MlflowModelReader):
|
||||||
|
self.mlflow_reader = mlflow_reader
|
||||||
|
|
||||||
|
def get_object(self, run_id):
|
||||||
|
return self.mlflow_reader[run_id]
|
||||||
105 src/image_prediction/pipeline.py Normal file
@@ -0,0 +1,105 @@
import os
from functools import lru_cache, partial
from itertools import chain, tee
from typing import Iterable, Any

from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from kn_utils.logging import logger
from tqdm import tqdm

from image_prediction.config import CONFIG
from image_prediction.default_objects import (
    get_formatter,
    get_mlflow_model_loader,
    get_image_classifier,
    get_extractor,
    get_encoder,
)
from image_prediction.locations import MLRUNS_DIR
from image_prediction.utils.generic import lift, starlift

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


@lru_cache(maxsize=None)
def load_pipeline(**kwargs):
    logger.info(f"Loading pipeline with kwargs: {kwargs}")
    model_loader = get_mlflow_model_loader(MLRUNS_DIR)
    model_identifier = CONFIG.service.mlflow_run_id

    pipeline = Pipeline(model_loader, model_identifier, **kwargs)

    return pipeline


def parallel(*fs):
    return lambda *args: (f(a) for f, a in zip(fs, args))


def star(f):
    return lambda x: f(*x)


class Pipeline:
    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs):
        self.verbose = verbose

        extract = get_extractor(**kwargs)
        classifier = get_image_classifier(model_loader, model_identifier)
        reformat = get_formatter()
        represent = get_encoder()

        split = compose(star(parallel(*map(lift, (first, first, second)))), rpartial(tee, 3))
        classify = compose(chain.from_iterable, lift(classifier), partial(chunks, batch_size))
        pairwise_apply = compose(star, parallel)
        join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))

        #                        />--classify--\
        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
        #                        \>--identity--/

        self.pipe = rcompose(
            extract,  # ... image-metadata pairs as a stream
            split,  # ... into an image stream and a metadata stream
            pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
            join,  # ... the streams by zipping
            reformat,  # ... the items
            filter_duplicates,  # ... filter out duplicate images
        )

    def __call__(self, pdf: bytes, page_range: range = None):
        yield from tqdm(
            self.pipe(pdf, page_range=page_range),
            desc="Processing images from document",
            unit=" images",
            disable=not self.verbose,
        )


def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
    """Filter out duplicate images based on `position` (image coordinates) and page number, preferring the one
    with `allPassed` set to True.

    See RED-10765 (RM-241), "Removed redactions reappear", for why this is necessary.
    """
    keep = dict()
    for image_meta in metadata:
        key: tuple[int, int, int, int, int] = (
            image_meta["position"]["x1"],
            image_meta["position"]["x2"],
            image_meta["position"]["y1"],
            image_meta["position"]["y2"],
            image_meta["position"]["pageNumber"],
        )
        if key in keep:
            logger.warning(
                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
            )
            if image_meta["filters"]["allPassed"]:
                logger.warning("Keeping the current image since its allPassed flag is set to True")
                keep[key] = image_meta
            else:
                logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
        else:
            keep[key] = image_meta

    yield from keep.values()
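A small usage sketch of filter_duplicates (editor's illustration, not part of this commit); the field names follow the keys the function reads, and the "id" field is only there to tell the two entries apart.

# Illustration only: filter_duplicates keeps one entry per (x1, x2, y1, y2, pageNumber),
# preferring the duplicate whose filters["allPassed"] is True.
from image_prediction.pipeline import filter_duplicates

position = {"x1": 10, "x2": 110, "y1": 20, "y2": 220, "pageNumber": 3}
items = [
    {"position": position, "filters": {"allPassed": False}, "id": "first"},
    {"position": position, "filters": {"allPassed": True}, "id": "second"},  # same spot, passes filters
]

kept = list(filter_duplicates(items))
assert len(kept) == 1 and kept[0]["id"] == "second"  # a warning is logged for the duplicate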
0 src/image_prediction/redai_adapter/__init__.py Normal file
Some files were not shown because too many files have changed in this diff.