Compare commits
3 Commits
master ... release/3.

| Author | SHA1 | Date |
|---|---|---|
|  | 9292c1f6c6 |  |
|  | 788af1df62 |  |
|  | 097479bc38 |  |
@@ -1,8 +1,6 @@
 [core]
-    remote = azure_remote
+    remote = vector
     autostage = true
 ['remote "vector"']
     url = ssh://vector.iqser.com/research/image-prediction/
     port = 22
-['remote "azure_remote"']
-    url = azure://image-classification-dvc/
.gitignore (vendored, 5 changes)
@@ -1,8 +1,7 @@
 .vscode/
 *.h5
-*venv
+/venv/
 .idea/
-src/data
 
 !.gitignore
 *.project
@@ -173,4 +172,4 @@ fabric.properties
 # https://plugins.jetbrains.com/plugin/12206-codestream
 .idea/codestream.xml
 
 # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
@@ -1,51 +0,0 @@
include:
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/dvc.gitlab-ci.yml"
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"

variables:
  NEXUS_PROJECT_DIR: red
  IMAGENAME: "${CI_PROJECT_NAME}"
  INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough, try setting this to fastest or fast.
  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.

stages:
  - data
  - setup
  - tests
  - sonarqube
  - versioning
  - build
  - integration-tests
  - release

docker-build:
  extends: .docker-build
  needs:
    - job: dvc-pull
      artifacts: true
    - !reference [.needs-versioning, needs] # leave this line as is

###################
# INTEGRATION TESTS
trigger-integration-tests:
  extends: .integration-tests
  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
  # needs:
  #   - job: docker-build::model_name
  #     artifacts: true
  rules:
    - when: never

#########
# RELEASE
release:
  extends: .release
  needs:
    - !reference [.needs-versioning, needs] # leave this line as is
.gitmodules (vendored, new file, 6 lines)
@@ -0,0 +1,6 @@
[submodule "incl/pyinfra"]
    path = incl/pyinfra
    url = ssh://git@git.iqser.com:2222/rr/pyinfra.git
[submodule "incl/pdf2image"]
    path = incl/pdf2image
    url = ssh://git@git.iqser.com:2222/rr/pdf2image.git
@@ -1 +0,0 @@
-3.10
Dockerfile (84 changes)
@@ -1,73 +1,27 @@
-FROM python:3.10-slim AS builder
+FROM image-prediction-base
 
-ARG GITLAB_USER
-ARG GITLAB_ACCESS_TOKEN
+WORKDIR /app/service
 
-ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
-ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
-ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
-ARG POETRY_SOURCE_REF_RED=gitlab-red
-ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
-ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
-ARG VERSION=dev
+COPY src src
+COPY incl/pyinfra incl/pyinfra
+COPY incl/pdf2image incl/pdf2image
+COPY data data
+COPY image_prediction image_prediction
+COPY setup.py setup.py
+COPY requirements.txt requirements.txt
+COPY config.yaml config.yaml
+COPY banner.txt banner.txt
 
-LABEL maintainer="Research <research@knecon.com>"
-LABEL version="${VERSION}"
-
-WORKDIR /app
-
-###########
-# ENV SETUP
-ENV PYTHONDONTWRITEBYTECODE=true
-ENV PYTHONUNBUFFERED=true
-ENV POETRY_HOME=/opt/poetry
-ENV PATH="$POETRY_HOME/bin:$PATH"
-
-RUN apt-get update && \
-    apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -sSL https://install.python-poetry.org | python3 -
-RUN poetry --version
-
-COPY pyproject.toml poetry.lock ./
-
-RUN poetry config virtualenvs.create true && \
-    poetry config virtualenvs.in-project true && \
-    poetry config installer.max-workers 10 && \
-    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry install --without=dev -vv --no-interaction --no-root
-
-###############
-# WORKING IMAGE
-FROM python:3.10-slim
-
-WORKDIR /app
-
-# COPY SOURCE CODE FROM BUILDER IMAGE
-COPY --from=builder /app /app
-# COPY BILL OF MATERIALS (BOM)
-COPY bom.json /bom.json
-
-ENV PATH="/app/.venv/bin:$PATH"
-
-###################
-# COPY SOURCE CODE
-COPY ./src ./src
-COPY ./config ./config
-COPY ./data ./data
-COPY banner.txt ./
+# Install dependencies differing from base image.
+RUN python3 -m pip install -r requirements.txt
+RUN python3 -m pip install -r incl/pyinfra/requirements.txt
+RUN python3 -m pip install -r incl/pdf2image/requirements.txt
+
+RUN python3 -m pip install -e .
+RUN python3 -m pip install -e incl/pyinfra
+RUN python3 -m pip install -e incl/pdf2image
 
 EXPOSE 5000
 EXPOSE 8080
 
-CMD [ "python", "src/serve.py"]
+CMD ["python3", "src/serve.py"]
Dockerfile_base (new file, 25 lines)
@@ -0,0 +1,25 @@
FROM python:3.8 as builder1

# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"

# Upgrade pip.
RUN python -m pip install --upgrade pip

# Make a directory for the service files and copy the service repo into the container.
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt

# Install dependencies.
RUN python3 -m pip install -r requirements.txt

# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.8

WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"

WORKDIR /app/service
@@ -1,40 +1,28 @@
-FROM python:3.10
+ARG BASE_ROOT="nexus.iqser.com:5001/red/"
+ARG VERSION_TAG="dev"
 
-ARG USERNAME
-ARG TOKEN
-ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
-ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
-ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
-ARG POETRY_SOURCE_REF_RED=gitlab-red
-ARG VERSION=dev
+FROM ${BASE_ROOT}image-prediction:${VERSION_TAG}
 
-LABEL maintainer="Research <research@knecon.com>"
-LABEL version="${VERSION}"
+WORKDIR /app/service
 
-WORKDIR /app
+COPY src src
+COPY incl/pyinfra incl/pyinfra
+COPY incl/pdf2image incl/pdf2image
+COPY data data
+COPY image_prediction image_prediction
+COPY setup.py setup.py
+COPY requirements.txt requirements.txt
+COPY config.yaml config.yaml
+COPY banner.txt banner.txt
 
-ENV PYTHONUNBUFFERED=true
-ENV POETRY_HOME=/opt/poetry
-ENV PATH="$POETRY_HOME/bin:$PATH"
+# Install module & dependencies
+RUN python3 -m pip install -r requirements.txt
+RUN python3 -m pip install -r incl/pyinfra/requirements.txt
+RUN python3 -m pip install -r incl/pdf2image/requirements.txt
 
-RUN curl -sSL https://install.python-poetry.org | python3 -
-
-COPY ./data ./data
-COPY ./test ./test
-COPY ./config ./config
-COPY ./src ./src
-COPY pyproject.toml poetry.lock banner.txt config.yaml ./
-
-RUN poetry config virtualenvs.create false && \
-    poetry config installer.max-workers 10 && \
-    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
-    poetry install --without=dev -vv --no-interaction --no-root
-
-EXPOSE 5000
-EXPOSE 8080
+RUN python3 -m pip install -e .
+RUN python3 -m pip install -e incl/pyinfra
+RUN python3 -m pip install -e incl/pdf2image
 
 RUN apt update --yes
 RUN apt install vim --yes
@@ -2,11 +2,8 @@
 
 Build base image
 ```bash
-docker build -t image-classification-image --progress=plain --no-cache \
-    -f Dockerfile \
-    --build-arg USERNAME=$GITLAB_USER \
-    --build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
-    .
+docker build -f Dockerfile_base -t image-prediction-base .
+docker build -f Dockerfile -t image-prediction .
 ```
 
 ### Usage
|||||||
40
bamboo-specs/pom.xml
Normal file
40
bamboo-specs/pom.xml
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<groupId>com.atlassian.bamboo</groupId>
|
||||||
|
<artifactId>bamboo-specs-parent</artifactId>
|
||||||
|
<version>7.1.2</version>
|
||||||
|
<relativePath/>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<artifactId>bamboo-specs</artifactId>
|
||||||
|
<version>1.0.0-SNAPSHOT</version>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<sonar.skip>true</sonar.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.atlassian.bamboo</groupId>
|
||||||
|
<artifactId>bamboo-specs-api</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.atlassian.bamboo</groupId>
|
||||||
|
<artifactId>bamboo-specs</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<!-- Test dependencies -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<!-- run 'mvn test' to perform offline validation of the plan -->
|
||||||
|
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
|
||||||
|
</project>
|
||||||
bamboo-specs/src/main/java/buildjob/PlanSpec.java (new file, 180 lines)
@@ -0,0 +1,180 @@
package buildjob;

import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;

import java.time.LocalTime;

import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;

/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "image-prediction";
    private static final String SERVICE_NAME_BASE = "image-prediction-base";

    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "").replaceAll("_", "");

    /**
     * Run main to publish plan on Bamboo
     */
    public static void main(final String[] args) throws Exception {
        // By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");

        Plan plan = new PlanSpec().createDockerBuildPlan();
        bambooServer.publish(plan);
        PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);

        Plan secPlan = new PlanSpec().createSecBuild();
        bambooServer.publish(secPlan);
        PlanPermissions secPlanPermission = new PlanSpec().createPlanPermission(secPlan.getIdentifier());
        bambooServer.publish(secPlanPermission);
    }

    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
            .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .loggedInUserPermissions(PermissionType.VIEW)
            .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }

    private Project project() {
        return new Project()
            .name("RED")
            .key(new BambooKey("RED"));
    }

    public Plan createDockerBuildPlan() {
        return new Plan(
            project(),
            SERVICE_NAME, new BambooKey(SERVICE_KEY))
            .description("Docker build for image-prediction.")
            .stages(
                new Stage("Build Stage")
                    .jobs(
                        new Job("Build Job", new BambooKey("BUILD"))
                            .tasks(
                                new CleanWorkingDirectoryTask()
                                    .description("Clean working directory.")
                                    .enabled(true),
                                new VcsCheckoutTask()
                                    .description("Checkout default repository.")
                                    .checkoutItems(new CheckoutItem().defaultRepository()),
                                new ScriptTask()
                                    .description("Set config and keys.")
                                    .location(Location.FILE)
                                    .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                                new ScriptTask()
                                    .description("Build Docker container.")
                                    .location(Location.FILE)
                                    .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
                                    .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE),
                                new InjectVariablesTask()
                                    .description("Inject git tag.")
                                    .path("git.tag")
                                    .namespace("g")
                                    .scope(InjectVariablesScope.LOCAL),
                                new VcsTagTask()
                                    .description("${bamboo.g.gitTag}")
                                    .tagName("${bamboo.g.gitTag}")
                                    .defaultRepository())
                            .dockerConfiguration(
                                new DockerConfiguration()
                                    .image("nexus.iqser.com:5001/infra/release_build:4.5.0")
                                    .volume("/var/run/docker.sock", "/var/run/docker.sock")),
                        new Job("Licence Job", new BambooKey("LICENCE"))
                            .enabled(false)
                            .tasks(
                                new VcsCheckoutTask()
                                    .description("Checkout default repository.")
                                    .checkoutItems(new CheckoutItem().defaultRepository()),
                                new ScriptTask()
                                    .description("Build licence.")
                                    .location(Location.FILE)
                                    .fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
                            .dockerConfiguration(
                                new DockerConfiguration()
                                    .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                                    .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                                    .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            .linkedRepositories("RR / redai_image")
            .triggers(
                new BitbucketServerTrigger())
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranch()
                    .delete(
                        new BranchCleanup()
                            .whenInactiveInRepositoryAfterDays(14))
                    .notificationForCommitters());
    }

    public Plan createSecBuild() {
        return new Plan(project(), SERVICE_NAME + "-Sec", new BambooKey(SERVICE_KEY + "SEC")).description("Security Analysis Plan")
            .stages(new Stage("Default Stage").jobs(
                new Job("Sonar Job", new BambooKey("SONAR"))
                    .tasks(
                        new CleanWorkingDirectoryTask()
                            .description("Clean working directory.")
                            .enabled(true),
                        new VcsCheckoutTask()
                            .description("Checkout default repository.")
                            .checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask()
                            .description("Set config and keys.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                        new ScriptTask()
                            .description("Run Sonarqube scan.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
                            .argument(SERVICE_NAME))
                    .dockerConfiguration(
                        new DockerConfiguration()
                            .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                            .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            .triggers(
                new ScheduledTrigger()
                    .scheduleOnceDaily(LocalTime.of(23, 00)))
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranchMatching("release.*")
                    .notificationForCommitters());
    }
}
bamboo-specs/src/main/resources/scripts/create-licence.sh (new executable file, 19 lines)
@@ -0,0 +1,19 @@
#!/bin/bash
set -e

if [[ "${bamboo_version_tag}" != "dev" ]]
then
  ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
    -f ${bamboo_build_working_directory}/pom.xml \
    versions:set \
    -DnewVersion=${bamboo_version_tag}

  ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
    -f ${bamboo_build_working_directory}/pom.xml \
    -B clean deploy \
    -e -DdeployAtEnd=true \
    -Dmaven.wagon.http.ssl.insecure=true \
    -Dmaven.wagon.http.ssl.allowall=true \
    -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
    -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi
bamboo-specs/src/main/resources/scripts/docker-build.sh (new executable file, 60 lines)
@@ -0,0 +1,60 @@
#!/bin/bash
set -e

SERVICE_NAME=$1
SERVICE_NAME_BASE=$2
if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
  branchVersion=$(cat version.yaml | grep -Eo "version: .*" | sed -s 's|version: \(.*\)\..*\..*|\1|g')
  latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
  newVersion="$(semver $latestVersion -p -i minor)"
  echo "new release on master with version $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
  # branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
  # latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
  # newVersion="$(semver $latestVersion -p -i patch)"
  # FIXME: obviously not the best solution
  newVersion="1.16.1"
  echo "new release on $bamboo_planRepository_branchName with version $newVersion"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
  newVersion="${bamboo_version_tag}"
  echo "new special version bild with $newVersion"
else
  newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
  echo "gitTag=${newVersion}" > git.tag
  dev_tag="dev"
  echo "dev build with tag $dev_tag"
  python3 -m venv build_venv
  source build_venv/bin/activate
  python3 -m pip install --upgrade pip

  pip install dvc
  pip install 'dvc[ssh]'
  dvc pull

  echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
  echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
  docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
  docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag .
  docker push nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag
  exit 0
fi

echo "gitTag=${newVersion}" > git.tag

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
echo "Pulling dvc data"
dvc pull

echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}
bamboo-specs/src/main/resources/scripts/key-prepare.sh (new executable file, 8 lines)
@@ -0,0 +1,8 @@
#!/bin/bash
set -e

mkdir -p ~/.ssh
echo "${bamboo_agent_ssh}" | base64 -d >> ~/.ssh/id_rsa
echo "host vector.iqser.com" > ~/.ssh/config
echo " user bamboo-agent" >> ~/.ssh/config
chmod 600 ~/.ssh/config ~/.ssh/id_rsa
bamboo-specs/src/main/resources/scripts/sonar-scan.sh (new executable file, 57 lines)
@@ -0,0 +1,57 @@
#!/bin/bash
set -e

export JAVA_HOME=/usr/bin/sonar-scanner/jre

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install dependency-check
python3 -m pip install coverage

echo "coverage report generation"

bash run_tests.sh

if [ ! -f reports/coverage.xml ]
then
  exit 1
fi

SERVICE_NAME=$1

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
  --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
  --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"

if [[ -z "${bamboo_repository_pr_key}" ]]
then
  echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
  /usr/bin/sonar-scanner/bin/sonar-scanner \
    -Dsonar.projectKey=RED_$SERVICE_NAME \
    -Dsonar.sources=image_prediction \
    -Dsonar.host.url=https://sonarqube.iqser.com \
    -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
    -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
    -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
    -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
    -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
    -Dsonar.python.coverage.reportPaths=reports/coverage.xml

else
  echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
  /usr/bin/sonar-scanner/bin/sonar-scanner \
    -Dsonar.projectKey=RED_$SERVICE_NAME \
    -Dsonar.sources=image_prediction \
    -Dsonar.host.url=https://sonarqube.iqser.com \
    -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
    -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
    -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
    -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
    -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
    -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
    -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
    -Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi
bamboo-specs/src/test/java/buildjob/PlanSpecTest.java (new file, 21 lines)
@@ -0,0 +1,21 @@
package buildjob;


import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;

public class PlanSpecTest {
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        Plan plan = new PlanSpec().createDockerBuildPlan();
        EntityPropertiesBuilders.build(plan);
    }

    @Test
    public void checkYourSecPlanOffline() throws PropertiesValidationException {
        Plan secPlan = new PlanSpec().createSecBuild();
        EntityPropertiesBuilders.build(secPlan);
    }
}
config.yaml (new file, 26 lines)
@@ -0,0 +1,26 @@
webserver:
  host: $SERVER_HOST|"127.0.0.1" # webserver address
  port: $SERVER_PORT|5000 # webserver port

service:
  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
  verbose: $VERBOSE|True # Service prints document processing progress to stdout
  batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from


# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
filters:

  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible

  image_width_to_height_quotient: # Image width to height ratio
    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible

  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
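The `$VAR|default` entries above follow the substitution syntax of EnvYAML, which the new `image_prediction/config.py` (further down in this diff) uses to load this file: a value is taken from the named environment variable when it is set and falls back to the literal after the pipe otherwise. A minimal sketch of how an override behaves, assuming EnvYAML's standard loading behaviour; the surrounding script is illustrative only:

```python
import os
from envyaml import EnvYAML

# BATCH_SIZE is declared in config.yaml as "$BATCH_SIZE|16"; exporting it
# before loading should override the default of 16.
os.environ["BATCH_SIZE"] = "32"

config = EnvYAML("config.yaml")            # resolves $VAR|default entries on load
print(config["service"]["batch_size"])     # expected: 32 (16 when BATCH_SIZE is unset)
```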
@@ -1,68 +0,0 @@
[asyncio]
max_concurrent_tasks = 10

[dynamic_tenant_queues]
enabled = true

[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"

[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Excpects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"

[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"

[webserver]
host = "0.0.0.0"
port = 8080

[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divider of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"

tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds

service_request_queue_prefix = "image_request_queue"
service_request_exchange_name = "image_request_exchange"
service_response_exchange_name = "image_response_exchange"
service_dlq_name = "image_dlq"

[storage]
backend = "s3"

[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"

[storage.azure]
container = "redaction"
connection_string = ""

[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

[kubernetes]
pod_name = "test_pod"
@@ -1,42 +0,0 @@
[logging]
level = "INFO"

[service]
# Print document processing progress to stdout
verbose = false
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"

# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5

# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75

[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0

# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10

# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4

[filters.overrides.logo.image_to_page_quotient]
min = 0.06
image_prediction/config.py (new file, 46 lines)
@@ -0,0 +1,46 @@
"""Implements a config object with dot-indexing syntax."""


from envyaml import EnvYAML

from image_prediction.locations import CONFIG_FILE


def _get_item_and_maybe_make_dotindexable(container, item):
    ret = container[item]
    return DotIndexable(ret) if isinstance(ret, dict) else ret


class DotIndexable:
    def __init__(self, x):
        self.x = x

    def get(self, item, default=None):
        try:
            return _get_item_and_maybe_make_dotindexable(self.x, item)
        except KeyError:
            return default

    def __getattr__(self, item):
        return _get_item_and_maybe_make_dotindexable(self.x, item)

    def __repr__(self):
        return self.x.__repr__()

    def __getitem__(self, item):
        return self.__getattr__(item)


class Config:
    def __init__(self, config_path):
        self.__config = EnvYAML(config_path)

    def __getattr__(self, item):
        if item in self.__config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)

    def __getitem__(self, item):
        return self.__getattr__(item)


CONFIG = Config(CONFIG_FILE)
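A quick usage sketch of the dot-indexing wrapper above; the keys are taken from config.yaml earlier in this diff, and the snippet itself is illustrative rather than part of the change set:

```python
from image_prediction.config import CONFIG

# Nested keys can be read via attribute access, item access, or .get() with a default.
batch_size = CONFIG.service.batch_size
run_id = CONFIG["service"]["mlflow_run_id"]
min_confidence = CONFIG.filters.get("min_confidence", 0.5)
```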
@@ -1,3 +1,5 @@
+from typing import Iterable
+
 from funcy import juxt
 
 from image_prediction.classifier.classifier import Classifier
@@ -5,6 +7,7 @@ from image_prediction.classifier.image_classifier import ImageClassifier
 from image_prediction.compositor.compositor import TransformerCompositor
 from image_prediction.encoder.encoders.hash_encoder import HashEncoder
 from image_prediction.estimator.adapter.adapter import EstimatorAdapter
+from image_prediction.formatter.formatter import format_image_plus
 from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
@@ -14,6 +17,7 @@ from image_prediction.model_loader.loaders.mlflow import MlflowConnector
 from image_prediction.redai_adapter.mlflow import MlflowModelReader
 from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
 from image_prediction.transformer.transformers.response import ResponseTransformer
+from pdf2img.extraction import extract_images_via_metadata
 
 
 def get_mlflow_model_loader(mlruns_dir):
@@ -26,10 +30,17 @@ def get_image_classifier(model_loader, model_identifier):
     return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes)))
 
 
-def get_extractor(**kwargs):
+def get_dispatched_extract(**kwargs):
     image_extractor = ParsablePDFImageExtractor(**kwargs)
 
-    return image_extractor
+    def extract(pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
+        if metadata_per_image:
+            image_pluses = extract_images_via_metadata(pdf, metadata_per_image)
+            yield from map(format_image_plus, image_pluses)
+        else:
+            yield from image_extractor.extract(pdf, page_range)
+
+    return extract
 
 
 def get_formatter():
@@ -13,7 +13,7 @@ class HashEncoder(Encoder):
         yield from self.encode(images)
 
 
-def hash_image(image: Image.Image) -> str:
+def hash_image(image: Image.Image):
     """See: https://stackoverflow.com/a/49692185/3578468"""
     image = image.resize((10, 10), Image.ANTIALIAS)
     image = image.convert("L")
@@ -21,6 +21,4 @@ def hash_image(image: Image.Image) -> str:
     avg_pixel = sum(pixel_data) / len(pixel_data)
     bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
     hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
-    # Note: For each 4 leading zeros, the hex representation will be shorter by one character.
-    # To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
-    return hex_representation.zfill(25)
+    return hex_representation
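For context on the padding this hunk removes: `hash_image` builds a 100-bit average hash from a 10 x 10 grayscale thumbnail, and 100 bits fit in at most 25 hex digits, which is why the old code padded with `zfill(25)` so that hashes with leading zero bits kept a fixed length. A small illustrative sketch of that effect (not part of the change set):

```python
# A 100-bit string whose leading bits are zero yields a shorter hex string.
bits = "0" * 7 + "1" * 93
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
print(len(hex_representation))            # 24 -> length varies with leading zeros
print(len(hex_representation.zfill(25)))  # 25 -> fixed length, as in the removed code
```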
@@ -36,7 +36,3 @@ class InvalidBox(Exception):
 
 class ParsingError(Exception):
     pass
-
-
-class BadXref(ValueError):
-    pass
image_prediction/formatter/formatter.py (new file, 35 lines)
@@ -0,0 +1,35 @@
import abc

from image_prediction.image_extractor.extractor import ImageMetadataPair
from image_prediction.info import Info

from image_prediction.transformer.transformer import Transformer
from pdf2img.default_objects.image import ImagePlus


class Formatter(Transformer):
    @abc.abstractmethod
    def format(self, obj):
        raise NotImplementedError

    def transform(self, obj):
        raise NotImplementedError()

    def __call__(self, obj):
        return self.format(obj)


def format_image_plus(image: ImagePlus) -> ImageMetadataPair:
    enum_metadata = {
        Info.PAGE_WIDTH: image.info.pageInfo.width,
        Info.PAGE_HEIGHT: image.info.pageInfo.height,
        Info.PAGE_IDX: image.info.pageInfo.number,
        Info.ALPHA: image.info.alpha,
        Info.WIDTH: image.info.boundingBox.width,
        Info.HEIGHT: image.info.boundingBox.height,
        Info.X1: image.info.boundingBox.x0,
        Info.X2: image.info.boundingBox.x1,
        Info.Y1: image.info.boundingBox.y0,
        Info.Y2: image.info.boundingBox.y1,
    }
    return ImageMetadataPair(image.aspil(), enum_metadata)
image_prediction/image_extractor/extractors/parsable.py (new file, 208 lines)
@@ -0,0 +1,208 @@
import atexit
import io
import json
import traceback
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter, truth
from typing import List, Iterable, Iterator

import fitz
from PIL import Image
from funcy import rcompose, merge, pluck, curry, compose

from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
from image_prediction.utils import get_logger
from image_prediction.utils.generic import lift

logger = get_logger()


class ParsablePDFImageExtractor(ImageExtractor):
    def __init__(self, verbose=False, tolerance=0):
        """
        Args:
            verbose: Whether to show progressbar
            tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
                together
        """
        self.doc: fitz.fitz.Document = None
        self.verbose = verbose
        self.tolerance = tolerance

    def extract(self, pdf: bytes, page_range: range = None):
        self.doc = fitz.Document(stream=pdf)

        pages = extract_pages(self.doc, page_range) if page_range else self.doc

        image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))

        yield from image_metadata_pairs

    def __process_images_on_page(self, page: fitz.fitz.Page):
        images = get_images_on_page(self.doc, page)
        metadata = get_metadata_for_images_on_page(self.doc, page)
        clear_caches()

        image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
        # TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
        #  validation here. Invalid images can then be split into a different stream and joined with the intact images
        #  again for the formatting step.
        image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
        image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)

        yield from image_metadata_pairs

    @staticmethod
    def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
        def validate(image: Image.Image, metadata: dict):
            try:
                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
                image.resize((100, 100)).convert("RGB")
                return ImageMetadataPair(image, metadata)
            except (OSError, Exception) as err:
                metadata = json.dumps(EnumFormatter()(metadata), indent=2)
                logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
                return None

        return filter(truth, starmap(validate, image_metadata_pairs))


def extract_pages(doc, page_range):
    page_range = range(page_range.start + 1, page_range.stop + 1)
    pages = map(doc.load_page, page_range)

    yield from pages


@lru_cache(maxsize=None)
def get_images_on_page(doc, page: fitz.Page):
    image_infos = get_image_infos(page)
    xrefs = map(itemgetter("xref"), image_infos)
    images = map(partial(xref_to_image, doc), xrefs)

    yield from images


def get_metadata_for_images_on_page(doc, page: fitz.Page):

    metadata = map(get_image_metadata, get_image_infos(page))
    metadata = validate_coords_and_passthrough(metadata)

    metadata = filter_out_tiny_images(metadata)
    metadata = validate_size_and_passthrough(metadata)

    metadata = add_page_metadata(page, metadata)

    metadata = add_alpha_channel_info(doc, page, metadata)

    yield from metadata


@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    return page.get_image_info(xrefs=True)


@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
    maybe_image = load_image_handle_from_xref(doc, xref)
    return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None


def get_image_metadata(image_info):

    x1, y1, x2, y2 = map(rounder, image_info["bbox"])

    width = abs(x2 - x1)
    height = abs(y2 - y1)

    return {
        Info.WIDTH: width,
        Info.HEIGHT: height,
        Info.X1: x1,
        Info.X2: x2,
        Info.Y1: y1,
        Info.Y2: y2,
    }


def validate_coords_and_passthrough(metadata):
    yield from map(validate_box_coords, metadata)


def filter_out_tiny_images(metadata):
    yield from filterfalse(tiny, metadata)


def validate_size_and_passthrough(metadata):
    yield from map(validate_box_size, metadata)


def add_page_metadata(page, metadata):
    yield from map(partial(merge, get_page_metadata(page)), metadata)


def add_alpha_channel_info(doc, page, metadata):

    page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
    xref_to_alpha = partial(has_alpha_channel, doc)
    page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
    alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
    page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)

    metadata = starmap(merge, zip(page_to_alpha_mapping_per_image(page), metadata))

    yield from metadata


@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
    return doc.extract_image(xref)


rounder = rcompose(round, int)


def get_page_metadata(page):
    page_width, page_height = map(rounder, page.mediabox_size)

    return {
        Info.PAGE_WIDTH: page_width,
        Info.PAGE_HEIGHT: page_height,
        Info.PAGE_IDX: page.number,
    }


def has_alpha_channel(doc, xref):

    maybe_image = load_image_handle_from_xref(doc, xref)
    maybe_smask = maybe_image["smask"] if maybe_image else None

    if maybe_smask:
        return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
    else:
        try:
            return bool(fitz.Pixmap(doc, xref).alpha)
        except ValueError:
            logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
            return False


def tiny(metadata):
    return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4


def clear_caches():
    get_image_infos.cache_clear()
    load_image_handle_from_xref.cache_clear()
    get_images_on_page.cache_clear()
    xref_to_image.cache_clear()


atexit.register(clear_caches)
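A minimal usage sketch of the extractor above, assuming `ImageMetadataPair` unpacks as an (image, metadata) pair, which its construction from `zip(images, metadata)` suggests; the input file name is a placeholder:

```python
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
from image_prediction.info import Info

extractor = ParsablePDFImageExtractor(verbose=False, tolerance=0)

with open("sample.pdf", "rb") as fh:  # placeholder input document
    pdf_bytes = fh.read()

for image, metadata in extractor.extract(pdf_bytes, page_range=range(0, 3)):
    # Each yielded pair holds a PIL image and an Info-keyed metadata dict.
    print(metadata[Info.PAGE_IDX], metadata[Info.WIDTH], metadata[Info.HEIGHT])
```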
@@ -12,4 +12,3 @@ class Info(Enum):
     Y1 = "y1"
     Y2 = "y2"
     ALPHA = "alpha"
-    XREF = "xref"
image_prediction/locations.py (new file, 17 lines)
@@ -0,0 +1,17 @@
"""Defines constant paths relative to the module root path."""

from pathlib import Path

MODULE_DIR = Path(__file__).resolve().parents[0]

PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]

CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"

BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"

DATA_DIR = PACKAGE_ROOT_DIR / "data"

MLRUNS_DIR = str(DATA_DIR / "mlruns")

TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data"
@ -1,10 +1,9 @@
 import os
-from functools import lru_cache, partial
+from functools import partial
 from itertools import chain, tee
-from typing import Iterable, Any
+from typing import Iterable
 
 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
-from kn_utils.logging import logger
 from tqdm import tqdm
 
 from image_prediction.config import CONFIG
@ -12,8 +11,8 @@ from image_prediction.default_objects import (
     get_formatter,
     get_mlflow_model_loader,
     get_image_classifier,
-    get_extractor,
     get_encoder,
+    get_dispatched_extract,
 )
 from image_prediction.locations import MLRUNS_DIR
 from image_prediction.utils.generic import lift, starlift
@ -21,9 +20,7 @@ from image_prediction.utils.generic import lift, starlift
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
 
-@lru_cache(maxsize=None)
 def load_pipeline(**kwargs):
-    logger.info(f"Loading pipeline with kwargs: {kwargs}")
     model_loader = get_mlflow_model_loader(MLRUNS_DIR)
     model_identifier = CONFIG.service.mlflow_run_id
 
@ -41,10 +38,10 @@ def star(f):
 
 
 class Pipeline:
-    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs):
+    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
         self.verbose = verbose
 
-        extract = get_extractor(**kwargs)
+        extract = get_dispatched_extract(**kwargs)
         classifier = get_image_classifier(model_loader, model_identifier)
         reformat = get_formatter()
         represent = get_encoder()
@ -55,7 +52,7 @@ class Pipeline:
         join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
 
         # />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
+        # --extract-->--split--+->--encode---->+--join-->reformat
         # \>--identity--/
 
         self.pipe = rcompose(
@ -64,42 +61,12 @@ class Pipeline:
             pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
             join, # ... the streams by zipping
             reformat, # ... the items
-            filter_duplicates, # ... filter out duplicate images
         )
 
-    def __call__(self, pdf: bytes, page_range: range = None):
+    def __call__(self, pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
         yield from tqdm(
-            self.pipe(pdf, page_range=page_range),
+            self.pipe(pdf, page_range=page_range, metadata_per_image=metadata_per_image),
            desc="Processing images from document",
            unit=" images",
            disable=not self.verbose,
        )
-
-
-def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
-    """Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
-    `allPassed` set to True.
-    See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
-    """
-    keep = dict()
-    for image_meta in metadata:
-        key: tuple[int, int, int, int, int] = (
-            image_meta["position"]["x1"],
-            image_meta["position"]["x2"],
-            image_meta["position"]["y1"],
-            image_meta["position"]["y2"],
-            image_meta["position"]["pageNumber"],
-        )
-        if key in keep:
-            logger.warning(
-                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
-            )
-            if image_meta["filters"]["allPassed"]:
-                logger.warning("Setting the image with allPassed flag set to True")
-                keep[key] = image_meta
-            else:
-                logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
-        else:
-            keep[key] = image_meta
-
-    yield from keep.values()
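For context, a sketch of how the reworked __call__ signature could be driven; the importing module path, the file name, and the metadata values are assumptions, not taken from the diff. metadata_per_image is simply forwarded to the extraction stage at the head of the pipeline.

# Illustrative only; module path, file name and metadata content are assumptions.
from image_prediction.pipeline import load_pipeline

pipeline = load_pipeline()

with open("document.pdf", "rb") as fh:
    pdf_bytes = fh.read()

per_image = [{"source": "embedded"}]  # hypothetical extra metadata, one dict per image
for image_info in pipeline(pdf_bytes, page_range=range(0, 3), metadata_per_image=per_image):
    print(image_info["classification"]["label"])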
153 image_prediction/transformer/transformers/response.py Normal file
@ -0,0 +1,153 @@
+import json
+import math
+import os
+from functools import lru_cache
+from operator import itemgetter
+
+from funcy import first
+
+from image_prediction.config import CONFIG
+from image_prediction.exceptions import ParsingError
+from image_prediction.transformer.transformer import Transformer
+from image_prediction.utils import get_logger
+
+logger = get_logger()
+
+
+class ResponseTransformer(Transformer):
+    def transform(self, data):
+        logger.debug("ResponseTransformer.transform")
+        return build_image_info(data)
+
+
+def build_image_info(data: dict) -> dict:
+    def compute_geometric_quotient():
+        page_area_sqrt = math.sqrt(abs(page_width * page_height))
+        image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
+        return image_area_sqrt / page_area_sqrt
+
+    page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
+        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
+    )(data)
+
+    classification = data["classification"]
+    label = classification["label"]
+    representation = data["representation"]
+
+    geometric_quotient = round(compute_geometric_quotient(), 4)
+
+    min_image_to_page_quotient_breached = bool(
+        geometric_quotient < get_class_specific_min_image_to_page_quotient(label)
+    )
+    max_image_to_page_quotient_breached = bool(
+        geometric_quotient > get_class_specific_max_image_to_page_quotient(label)
+    )
+
+    min_image_width_to_height_quotient_breached = bool(
+        width / height < get_class_specific_min_image_width_to_height_quotient(label)
+    )
+    max_image_width_to_height_quotient_breached = bool(
+        width / height > get_class_specific_max_image_width_to_height_quotient(label)
+    )
+
+    min_confidence_breached = bool(
+        max(classification["probabilities"].values()) < get_class_specific_min_classification_confidence(label)
+    )
+
+    image_info = {
+        "classification": classification,
+        "representation": representation,
+        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": data["page_idx"] + 1},
+        "geometry": {"width": width, "height": height},
+        "alpha": alpha,
+        "filters": {
+            "geometry": {
+                "imageSize": {
+                    "quotient": geometric_quotient,
+                    "tooLarge": max_image_to_page_quotient_breached,
+                    "tooSmall": min_image_to_page_quotient_breached,
+                },
+                "imageFormat": {
+                    "quotient": round(width / height, 4),
+                    "tooTall": min_image_width_to_height_quotient_breached,
+                    "tooWide": max_image_width_to_height_quotient_breached,
+                },
+            },
+            "probability": {"unconfident": min_confidence_breached},
+            "allPassed": not any(
+                [
+                    max_image_to_page_quotient_breached,
+                    min_image_to_page_quotient_breached,
+                    min_image_width_to_height_quotient_breached,
+                    max_image_width_to_height_quotient_breached,
+                    min_confidence_breached,
+                ]
+            ),
+        },
+    }
+
+    return image_info
+
+
+def get_class_specific_min_image_to_page_quotient(label, table=None):
+    return get_class_specific_value(
+        "REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table
+    )
+
+
+def get_class_specific_max_image_to_page_quotient(label, table=None):
+    return get_class_specific_value(
+        "REL_IMAGE_SIZE", label, "max", CONFIG.filters.image_to_page_quotient.max, table=table
+    )
+
+
+def get_class_specific_min_image_width_to_height_quotient(label, table=None):
+    return get_class_specific_value(
+        "IMAGE_FORMAT", label, "min", CONFIG.filters.image_width_to_height_quotient.min, table=table
+    )
+
+
+def get_class_specific_max_image_width_to_height_quotient(label, table=None):
+    return get_class_specific_value(
+        "IMAGE_FORMAT", label, "max", CONFIG.filters.image_width_to_height_quotient.max, table=table
+    )
+
+
+def get_class_specific_min_classification_confidence(label, table=None):
+    return get_class_specific_value("CONFIDENCE", label, "min", CONFIG.filters.min_confidence, table=table)
+
+
+def get_class_specific_value(prefix, label, bound, fallback_value, table=None):
+    def fallback():
+        return fallback_value
+
+    def success():
+        threshold_map = parse_env_var(prefix, table=table) or {}
+        value = threshold_map.get(label, {}).get(bound)
+        if value:
+            logger.debug(f"Using class '{label}' specific {bound} {prefix.lower().replace('_', '-')} value.")
+        return value
+
+    assert bound in ["min", "max"]
+
+    return success() or fallback()
+
+
+@lru_cache(maxsize=None)
+def parse_env_var(prefix, table=None):
+    table = table or os.environ
+    head = first(filter(lambda s: s == prefix, table))
+    if head:
+        try:
+            return parse_env_var_value(table[head])
+        except ParsingError as err:
+            logger.warning(err)
+    else:
+        return None
+
+
+def parse_env_var_value(env_var_value):
+    try:
+        return json.loads(env_var_value)
+    except Exception as err:
+        raise ParsingError(f"Failed to parse {env_var_value}") from err
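The threshold helpers above fall back to CONFIG values unless an environment variable named after the prefix (REL_IMAGE_SIZE, IMAGE_FORMAT, CONFIDENCE) carries a JSON mapping from label to bounds. A minimal sketch of such an override, illustrative only; the label "graphic" and the numbers are made up:

# Illustrative only; the label "graphic" and the threshold values are made up.
import json
import os

os.environ["REL_IMAGE_SIZE"] = json.dumps({"graphic": {"min": 0.05, "max": 0.9}})

# With the variable set, get_class_specific_min_image_to_page_quotient("graphic")
# resolves to 0.05; any other label still falls back to the CONFIG value.
# Note: parse_env_var is wrapped in lru_cache, so the variable must be set
# before the first lookup for a given prefix.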
15 image_prediction/utils/generic.py Normal file
@ -0,0 +1,15 @@
+from itertools import starmap
+
+from funcy import iterate, first, curry, map
+
+
+def until(cond, func, *args, **kwargs):
+    return first(filter(cond, iterate(func, *args, **kwargs)))
+
+
+def lift(fn):
+    return curry(map)(fn)
+
+
+def starlift(fn):
+    return curry(starmap)(fn)
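Illustrative usage of the helpers above (not part of the diff; the commented results assume the funcy semantics of iterate and curry):

# Illustrative only; relies on the helpers defined in generic.py above.
from image_prediction.utils.generic import lift, starlift, until

double_all = lift(lambda x: x * 2)
print(list(double_all([1, 2, 3])))        # [2, 4, 6]

add_pairs = starlift(lambda a, b: a + b)
print(list(add_pairs([(1, 2), (3, 4)])))  # [3, 7]

# until() keeps applying func to its own result and returns the first value
# that satisfies cond: 3 -> 6 -> 12 -> ... -> 192.
print(until(lambda x: x > 100, lambda x: x * 2, 3))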
27 image_prediction/utils/logger.py Normal file
@ -0,0 +1,27 @@
+import logging
+
+from image_prediction.config import CONFIG
+
+
+def make_logger_getter():
+    logger = logging.getLogger("imclf")
+    logger.propagate = False
+
+    handler = logging.StreamHandler()
+    handler.setLevel(CONFIG.service.logging_level)
+
+    log_format = "%(asctime)s %(levelname)-8s %(message)s"
+    formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
+
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    logger.setLevel(CONFIG.service.logging_level)
+
+    def get_logger():
+        return logger
+
+    return get_logger
+
+
+get_logger = make_logger_getter()
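make_logger_getter builds one configured "imclf" logger and returns a closure over it, so every caller shares the same handler and level, and propagate = False keeps records from also reaching the root logger. A minimal, illustrative usage sketch:

# Illustrative only; uses the get_logger exported by logger.py above.
from image_prediction.utils.logger import get_logger

logger = get_logger()
logger.info("image prediction service started")  # formatted by the shared StreamHandler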
@ -56,8 +56,7 @@ def annotate_image(doc, image_info):
 
 def init():
     PDFNet.Initialize(
-        # "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
-        "Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
+        "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
     )
 
 
Some files were not shown because too many files have changed in this diff.