refactoring

alpha channel querying improved
renaming
2022-04-14 12:20:05 +02:00 · 2022-04-13 17:31:33 +02:00 · 2022-04-13 13:36:45 +02:00 · 2022-04-13 13:17:23 +02:00 · 2022-04-13 13:15:05 +02:00 · 2022-04-13 13:12:19 +02:00
167 changed files with 1682 additions and 43084 deletions
--- a/.dvc/config
+++ b/.dvc/config
@ -1,8 +1,6 @@
 [core]
-    remote = azure_remote
+    remote = vector
    autostage = true
 ['remote "vector"']
    url = ssh://vector.iqser.com/research/image-prediction/
    port = 22
-['remote "azure_remote"']
-    url = azure://image-classification-dvc/
--- a/.gitignore
+++ b/.gitignore
@ -1,8 +1,7 @@
 .vscode/
 *.h5
-*venv
+/venv/
 .idea/
-src/data

 !.gitignore
 *.project
@ -34,7 +33,6 @@ src/data
 **/dependencies-and-licenses-overview.txt

 .coverage
-.coverage\.*\.*


 *__pycache__
@ -48,6 +46,7 @@ src/data
 *misc

 /coverage_html_report/
+.coverage\.*

 # Created by https://www.toptal.com/developers/gitignore/api/linux,pycharm
 # Edit at https://www.toptal.com/developers/gitignore?templates=linux,pycharm
@ -174,3 +173,5 @@ fabric.properties
 .idea/codestream.xml

 # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
+/image_prediction/data/mlruns/
+#/data/mlruns/
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -1,51 +0,0 @@
-include:
-  - project: "Gitlab/gitlab"
-    ref: main
-    file: "/ci-templates/research/dvc.gitlab-ci.yml"
-  - project: "Gitlab/gitlab"
-    ref: main
-    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
-
-variables:
-  NEXUS_PROJECT_DIR: red
-  IMAGENAME: "${CI_PROJECT_NAME}"
-  INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
-  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
-  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
-  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
-  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.
-
-stages:
-  - data
-  - setup
-  - tests
-  - sonarqube
-  - versioning
-  - build
-  - integration-tests
-  - release
-
-docker-build:
-  extends: .docker-build
-  needs:
-    - job: dvc-pull
-      artifacts: true
-    - !reference [.needs-versioning, needs] # leave this line as is
-  
-###################
-# INTEGRATION TESTS
-trigger-integration-tests:
-  extends: .integration-tests
-  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
-  # needs:
-  #   - job: docker-build::model_name
-  #     artifacts: true
-  rules:
-    - when: never
-
-#########
-# RELEASE
-release:
-  extends: .release
-  needs:
-    - !reference [.needs-versioning, needs] # leave this line as is
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "incl/redai_image"]
+	path = incl/redai_image
+	url = ssh://git@git.iqser.com:2222/rr/redai_image.git
--- a/.python-version
+++ b/.python-version
@ -1 +0,0 @@
-3.10
--- a/80
+++ b/80
@ -1,73 +1,25 @@
-FROM python:3.10-slim AS builder
+ARG BASE_ROOT="nexus.iqser.com:5001/red/"
+ARG VERSION_TAG="latest"

-ARG GITLAB_USER
-ARG GITLAB_ACCESS_TOKEN
+FROM ${BASE_ROOT}image-prediction-base:${VERSION_TAG}

-ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
-ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
+WORKDIR /app/service

-ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
-ARG POETRY_SOURCE_REF_RED=gitlab-red
+COPY src src
+COPY data data
+COPY image_prediction image_prediction
+COPY incl/redai_image/redai incl/redai_image/redai
+COPY setup.py setup.py
+COPY requirements.txt requirements.txt
+COPY config.yaml config.yaml

-ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
-ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
+# Install dependencies differing from base image.
+RUN python3 -m pip install -r requirements.txt

-ARG VERSION=dev
-
-LABEL maintainer="Research <research@knecon.com>"
-LABEL version="${VERSION}"
-
-WORKDIR /app
-
-###########
-# ENV SETUP
-ENV PYTHONDONTWRITEBYTECODE=true
-ENV PYTHONUNBUFFERED=true
-ENV POETRY_HOME=/opt/poetry
-ENV PATH="$POETRY_HOME/bin:$PATH"
-
-RUN apt-get update && \
-    apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -sSL https://install.python-poetry.org | python3 -
-RUN poetry --version
-
-COPY pyproject.toml poetry.lock ./
-
-RUN poetry config virtualenvs.create true && \
-    poetry config virtualenvs.in-project true && \
-    poetry config installer.max-workers 10 && \
-    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry install --without=dev -vv --no-interaction --no-root
-
-###############
-# WORKING IMAGE
-FROM python:3.10-slim
-
-WORKDIR /app
-
-# COPY SOURCE CODE FROM BUILDER IMAGE
-COPY --from=builder /app /app
-# COPY BILL OF MATERIALS (BOM)
-COPY bom.json /bom.json
-
-ENV PATH="/app/.venv/bin:$PATH"
-
-###################
-# COPY SOURCE CODE
-COPY ./src ./src
-COPY ./config ./config
-COPY ./data ./data
-COPY banner.txt ./
+RUN python3 -m pip install -e .
+RUN python3 -m pip install -e incl/redai_image/redai

 EXPOSE 5000
 EXPOSE 8080

-CMD [ "python", "src/serve.py"]
+CMD ["python3", "src/serve.py"]
--- a/25
+++ b/25
@ -0,0 +1,25 @@
+FROM python:3.8 AS builder1
+
+# Use a virtual environment.
+RUN python -m venv /app/venv
+ENV PATH="/app/venv/bin:$PATH"
+
+# Upgrade pip.
+RUN python -m pip install --upgrade pip
+
+# Create the service directory and copy only the requirements file into it.
+WORKDIR /app/service
+COPY ./requirements.txt ./requirements.txt
+
+# Install dependencies.
+RUN python3 -m pip install -r requirements.txt
+
+# Start a fresh stage and copy over only /app (the virtual environment and the service directory),
+# leaving temporary files produced during setup behind to reduce the final image's size.
+FROM python:3.8
+
+WORKDIR /app/
+COPY --from=builder1 /app .
+ENV PATH="/app/venv/bin:$PATH"
+
+WORKDIR /app/service
--- a/43
+++ b/43
@ -1,43 +0,0 @@
-FROM python:3.10
-
-ARG USERNAME
-ARG TOKEN
-ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
-ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
-ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
-ARG POETRY_SOURCE_REF_RED=gitlab-red
-ARG VERSION=dev
-
-LABEL maintainer="Research <research@knecon.com>"
-LABEL version="${VERSION}"
-
-WORKDIR /app
-
-ENV PYTHONUNBUFFERED=true
-ENV POETRY_HOME=/opt/poetry
-ENV PATH="$POETRY_HOME/bin:$PATH"
-
-RUN curl -sSL https://install.python-poetry.org | python3 -
-
-COPY ./data ./data
-COPY ./test ./test
-COPY ./config ./config
-COPY ./src ./src
-COPY pyproject.toml poetry.lock banner.txt config.yaml./
-
-RUN poetry config virtualenvs.create false && \
-    poetry config installer.max-workers 10 && \
-    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
-    poetry install --without=dev -vv --no-interaction --no-root
-
-EXPOSE 5000
-EXPOSE 8080
-
-RUN apt update --yes
-RUN apt install vim --yes
-RUN apt install poppler-utils --yes
-
-CMD coverage run -m pytest test/ --tb=native -q -s -vvv -x && coverage combine && coverage report -m && coverage xml
--- a/README.md
+++ b/README.md
@ -1,143 +1,25 @@
-### Setup
+### Building

 Build base image
 ```bash
-docker build -t image-classification-image --progress=plain --no-cache \
-    -f Dockerfile \
-    --build-arg USERNAME=$GITLAB_USER \
-    --build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
-    .
+setup/docker.sh
+```
+
+Build head image
+```bash
+docker build -f Dockerfile -t image-prediction . --build-arg BASE_ROOT=""
 ```

 ### Usage

-#### Without Docker
-
-
-```bash
-py scripts/run_pipeline.py /path/to/a/pdf
-```
-
-#### With Docker
-
 Shell 1

 ```bash
-docker run --rm --net=host image-prediction
+docker run --rm --net=host image-prediction
 ```

 Shell 2

 ```bash
-python scripts/pyinfra_mock.py /path/to/a/pdf
+python scripts/pyinfra_mock.py --pdf_path /path/to/a/pdf
 ```
-
-### Tests
-
-Run for example this command to execute all tests and get a coverage report:
-
-```bash
-coverage run -m pytest test --tb=native -q -s -vvv -x && coverage combine && coverage report -m
-```
-
-After having built the service container as specified above, you can also run tests in a container as follows:
-
-```bash
-./run_tests.sh
-```
-
-### Message Body Formats
-
-
-#### Request Format
-
-The request messages need to provide the fields `"dossierId"` and `"fileId"`. A request should look like this:
-
-```json
-{
-    "dossierId": "<string identifier>",
-    "fileId": "<string identifier>"
-}
-```
-
-Any additional keys are ignored.
-
-
-#### Response Format
-
-Response bodies contain information about the identified class of the image, the confidence of the classification, the
-position and size of the image as well as the results of additional convenience filters which can be configured through
-environment variables. A response body looks like this:
-
-```json
-{
-  "dossierId": "debug",
-  "fileId": "13ffa9851740c8d20c4c7d1706d72f2a",
-  "data": [...]
-}
-```
-
-An image metadata record (entry in `"data"` field of a response body) looks like this:
-
-```json
-{
-  "classification": {
-    "label": "logo",
-    "probabilities": {
-      "logo": 1.0,
-      "signature": 1.1599173226749333e-17,
-      "other": 2.994595513398207e-23,
-      "formula": 4.352109377281029e-31
-    }
-  },
-  "position": {
-    "x1": 475.95,
-    "x2": 533.4,
-    "y1": 796.47,
-    "y2": 827.62,
-    "pageNumber": 6
-  },
-  "geometry": {
-    "width": 57.44999999999999,
-    "height": 31.149999999999977
-  },
-  "alpha": false,
-  "filters": {
-    "geometry": {
-      "imageSize": {
-        "quotient": 0.05975350599135938,
-        "tooLarge": false,
-        "tooSmall": false
-      },
-      "imageFormat": {
-        "quotient": 1.8443017656500813,
-        "tooTall": false,
-        "tooWide": false
-      }
-    },
-    "probability": {
-      "unconfident": false
-    },
-    "allPassed": true
-  }
-}
-```
-
-
-## Configuration
-
-A configuration file is located under `config.yaml`. All relevant variables can be configured via
-exporting environment variables.
-
-| __Environment Variable__           | Default                            | Description                                                                            |
-|------------------------------------|------------------------------------|----------------------------------------------------------------------------------------|
-| __LOGGING_LEVEL_ROOT__             | "INFO"                             | Logging level for log file messages                                                    |
-| __VERBOSE__                        | *true*                             | Service prints document processing progress to stdout                                  |
-| __BATCH_SIZE__                     | 16                                 | Number of images in memory simultaneously per service instance                         |
-| __RUN_ID__                         | "fabfb1f192c745369b88cab34471aba7" | The ID of the mlflow run to load the image classifier from                             |
-| __MIN_REL_IMAGE_SIZE__             | 0.05                               | Minimally permissible image size to page size ratio                                    |
-| __MAX_REL_IMAGE_SIZE__             | 0.75                               | Maximally permissible image size to page size ratio                                    |
-| __MIN_IMAGE_FORMAT__               | 0.1                                | Minimally permissible image width to height ratio                                      |
-| __MAX_IMAGE_FORMAT__               | 10                                 | Maximally permissible image width to height ratio                                      |
-
-See also: https://git.iqser.com/projects/RED/repos/helm/browse/redaction/templates/image-service-v2
--- a/src/image_prediction/classifier/init.py
+++ b/src/image_prediction/classifier/init.py
--- a/bamboo-specs/pom.xml
+++ b/bamboo-specs/pom.xml
@ -0,0 +1,40 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>com.atlassian.bamboo</groupId>
+    <artifactId>bamboo-specs-parent</artifactId>
+    <version>7.1.2</version>
+    <relativePath/>
+  </parent>
+
+  <artifactId>bamboo-specs</artifactId>
+  <version>1.0.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+
+  <properties>
+    <sonar.skip>true</sonar.skip>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.atlassian.bamboo</groupId>
+      <artifactId>bamboo-specs-api</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.atlassian.bamboo</groupId>
+      <artifactId>bamboo-specs</artifactId>
+    </dependency>
+
+    <!-- Test dependencies -->
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <!-- run 'mvn test' to perform offline validation of the plan -->
+  <!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
+</project>
--- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java
+++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java
@ -0,0 +1,182 @@
+package buildjob;
+
+import com.atlassian.bamboo.specs.api.BambooSpec;
+import com.atlassian.bamboo.specs.api.builders.BambooKey;
+import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
+import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
+import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
+import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
+import com.atlassian.bamboo.specs.api.builders.plan.Job;
+import com.atlassian.bamboo.specs.api.builders.plan.Plan;
+import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
+import com.atlassian.bamboo.specs.api.builders.plan.Stage;
+import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
+import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
+import com.atlassian.bamboo.specs.api.builders.project.Project;
+import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
+import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
+import com.atlassian.bamboo.specs.builders.task.ScriptTask;
+import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
+import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
+import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
+import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
+import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
+import com.atlassian.bamboo.specs.api.builders.Variable;
+import com.atlassian.bamboo.specs.util.BambooServer;
+import com.atlassian.bamboo.specs.builders.task.ScriptTask;
+import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
+
+/**
+ * Plan configuration for Bamboo.
+ * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
+ */
+@BambooSpec
+public class PlanSpec {
+
+    private static final String SERVICE_NAME = "image-prediction";
+    private static final String SERVICE_NAME_BASE = "image-prediction-base";
+
+    // Bamboo plan key derived from the service name: upper-cased, with '-' and '_' removed.
+    // Locale.ROOT keeps the result independent of the JVM's default locale, since
+    // String.toUpperCase() without an explicit locale is locale-sensitive.
+    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase(java.util.Locale.ROOT).replaceAll("[-_]", "");
+
+    // Values shared by several jobs of the plan.
+    private static final String REDAI_IMAGE_REPOSITORY = "RR / redai_image";
+    private static final String RELEASE_BUILD_IMAGE = "nexus.iqser.com:5001/infra/release_build:4.2.0";
+    private static final String DOCKER_SOCKET = "/var/run/docker.sock";
+
+    /**
+     * Run main to publish plan on Bamboo
+     */
+    public static void main(final String[] args) throws Exception {
+        //By default credentials are read from the '.credentials' file.
+        BambooServer bambooServer = new BambooServer("http://localhost:8085");
+
+        PlanSpec planSpec = new PlanSpec();
+        Plan plan = planSpec.createDockerBuildPlan();
+        bambooServer.publish(plan);
+        PlanPermissions planPermission = planSpec.createPlanPermission(plan.getIdentifier());
+        bambooServer.publish(planPermission);
+    }
+
+    /**
+     * Builds the permission set published alongside the plan.
+     *
+     * @param planIdentifier identifier of the plan the permissions apply to
+     * @return plan permissions for the "atlbamboo" user, the "research", "Development" and "QA" groups,
+     *         logged-in users and anonymous users
+     */
+    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
+        Permissions permission = new Permissions()
+                .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
+                .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
+                .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
+                .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
+                .loggedInUserPermissions(PermissionType.VIEW)
+                .anonymousUserPermissionView();
+        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
+    }
+
+    /** Returns the Bamboo project ("RED") the plan belongs to. */
+    private Project project() {
+        return new Project()
+                .name("RED")
+                .key(new BambooKey("RED"));
+    }
+
+    /**
+     * Creates the plan: a build stage (Docker build and Sonar scan jobs) followed by a licence
+     * stage (git tag job and a disabled licence job).
+     *
+     * @return the fully configured plan
+     */
+    public Plan createDockerBuildPlan() {
+        return new Plan(
+                project(),
+                SERVICE_NAME, new BambooKey(SERVICE_KEY))
+                .description("Docker build for image-prediction.")
+                // .variables()
+                .stages(buildStage(), licenceStage())
+                .linkedRepositories("RR / " + SERVICE_NAME)
+                .linkedRepositories(REDAI_IMAGE_REPOSITORY)
+                .triggers(new BitbucketServerTrigger())
+                .planBranchManagement(new PlanBranchManagement()
+                        .createForVcsBranch()
+                        .delete(new BranchCleanup()
+                                .whenInactiveInRepositoryAfterDays(14))
+                        .notificationForCommitters());
+    }
+
+    /** Stage containing the Docker build job and the Sonar scan job. */
+    private Stage buildStage() {
+        return new Stage("Build Stage")
+                .jobs(
+                        new Job("Build Job", new BambooKey("BUILD"))
+                                .tasks(
+                                        cleanWorkingDirectoryTask(),
+                                        checkoutDefaultRepositoryTask(),
+                                        checkoutRedaiImageTask("Checkout redai_image research repository."),
+                                        sshSetupTask(),
+                                        fileScriptTask("Build Docker container.",
+                                                "bamboo-specs/src/main/resources/scripts/docker-build.sh")
+                                                .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
+                                .dockerConfiguration(releaseBuildDocker()),
+                        new Job("Sonar Job", new BambooKey("SONAR"))
+                                .tasks(
+                                        cleanWorkingDirectoryTask(),
+                                        checkoutDefaultRepositoryTask(),
+                                        checkoutRedaiImageTask("Checkout redai_image repository."),
+                                        sshSetupTask(),
+                                        fileScriptTask("Run Sonarqube scan.",
+                                                "bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
+                                                .argument(SERVICE_NAME))
+                                .dockerConfiguration(releaseBuildDocker()));
+    }
+
+    /** Stage containing the git tag job and the (disabled) licence job. */
+    private Stage licenceStage() {
+        return new Stage("Licence Stage")
+                .jobs(
+                        new Job("Git Tag Job", new BambooKey("GITTAG"))
+                                .tasks(
+                                        checkoutDefaultRepositoryTask(),
+                                        fileScriptTask("Build git tag.",
+                                                "bamboo-specs/src/main/resources/scripts/git-tag.sh"),
+                                        // Reads the "gitTag" value that git-tag.sh wrote to git.tag.
+                                        new InjectVariablesTask()
+                                                .description("Inject git tag.")
+                                                .path("git.tag")
+                                                .namespace("g")
+                                                .scope(InjectVariablesScope.LOCAL),
+                                        new VcsTagTask()
+                                                .description("${bamboo.g.gitTag}")
+                                                .tagName("${bamboo.g.gitTag}")
+                                                .defaultRepository())
+                                .dockerConfiguration(
+                                        new DockerConfiguration()
+                                                .image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
+                        new Job("Licence Job", new BambooKey("LICENCE"))
+                                .enabled(false)
+                                .tasks(
+                                        checkoutDefaultRepositoryTask(),
+                                        fileScriptTask("Build licence.",
+                                                "bamboo-specs/src/main/resources/scripts/create-licence.sh"))
+                                .dockerConfiguration(
+                                        new DockerConfiguration()
+                                                .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
+                                                .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
+                                                .volume(DOCKER_SOCKET, DOCKER_SOCKET)));
+    }
+
+    /** Task that cleans the working directory before a job starts. */
+    private static CleanWorkingDirectoryTask cleanWorkingDirectoryTask() {
+        return new CleanWorkingDirectoryTask()
+                .description("Clean working directory.")
+                .enabled(true);
+    }
+
+    /** Task that checks out the plan's default repository. */
+    private static VcsCheckoutTask checkoutDefaultRepositoryTask() {
+        return new VcsCheckoutTask()
+                .description("Checkout default repository.")
+                .checkoutItems(new CheckoutItem().defaultRepository());
+    }
+
+    /**
+     * Task that checks out the redai_image repository into the "redai_image" directory.
+     *
+     * @param description task description shown in Bamboo
+     */
+    private static VcsCheckoutTask checkoutRedaiImageTask(String description) {
+        // NOTE(review): the checkout path is "redai_image"; confirm this matches the location the
+        // Docker build expects the redai_image sources in.
+        return new VcsCheckoutTask()
+                .description(description)
+                .checkoutItems(new CheckoutItem().repository(REDAI_IMAGE_REPOSITORY).path("redai_image"));
+    }
+
+    /** Inline script task that writes the agent's SSH key and the SSH config for vector.iqser.com. */
+    private static ScriptTask sshSetupTask() {
+        // NOTE(review): the key is appended (>>) to ~/.ssh/id_rsa; if the home directory is ever
+        // reused between builds this would concatenate keys - confirm that '>' is not intended.
+        return new ScriptTask()
+                .description("Set config and keys.")
+                .inlineBody("mkdir -p ~/.ssh\n" +
+                            "echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
+                            "echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
+                            "echo \"    user bamboo-agent\" >> ~/.ssh/config\n" +
+                            "chmod 600 ~/.ssh/config ~/.ssh/id_rsa");
+    }
+
+    /**
+     * Script task that runs a script file from the repository.
+     *
+     * @param description task description shown in Bamboo
+     * @param scriptPath  path of the script passed to {@code fileFromPath}
+     */
+    private static ScriptTask fileScriptTask(String description, String scriptPath) {
+        return new ScriptTask()
+                .description(description)
+                .location(Location.FILE)
+                .fileFromPath(scriptPath);
+    }
+
+    /** Docker configuration using the release build image with the Docker socket mounted. */
+    private static DockerConfiguration releaseBuildDocker() {
+        return new DockerConfiguration()
+                .image(RELEASE_BUILD_IMAGE)
+                .volume(DOCKER_SOCKET, DOCKER_SOCKET);
+    }
+
+}
--- a/bamboo-specs/src/main/resources/scripts/create-licence.sh
+++ b/bamboo-specs/src/main/resources/scripts/create-licence.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+set -e
+
+if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
+then
+    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
+                    -f ${bamboo_build_working_directory}/pom.xml \
+                    versions:set  \
+                    -DnewVersion=${bamboo_version_tag}
+
+    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
+                    -f ${bamboo_build_working_directory}/pom.xml \
+                    -B clean deploy \
+                    -e -DdeployAtEnd=true \
+                    -Dmaven.wagon.http.ssl.insecure=true \
+                    -Dmaven.wagon.http.ssl.allowall=true \
+                    -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
+                    -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
+fi
--- a/bamboo-specs/src/main/resources/scripts/docker-build.sh
+++ b/bamboo-specs/src/main/resources/scripts/docker-build.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+set -e
+
+SERVICE_NAME=$1
+SERVICE_NAME_BASE=$2
+
+python3 -m venv build_venv
+source build_venv/bin/activate
+python3 -m pip install --upgrade pip
+
+pip install dvc
+pip install 'dvc[ssh]'
+dvc pull
+
+echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
+docker build -f Dockerfile_base  -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
+docker build -f Dockerfile  -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
+echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
+docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
--- a/bamboo-specs/src/main/resources/scripts/git-tag.sh
+++ b/bamboo-specs/src/main/resources/scripts/git-tag.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+set -e
+
+if [[ "${bamboo_version_tag}" = "dev" ]]
+then
+    echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
+else
+    echo "gitTag=${bamboo_version_tag}" > git.tag
+fi
--- a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
+++ b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
@ -0,0 +1,51 @@
+#!/bin/bash
+set -e
+
+export JAVA_HOME=/usr/bin/sonar-scanner/jre
+
+python3 -m venv build_venv
+source build_venv/bin/activate
+python3 -m pip install --upgrade pip
+
+echo "dev setup for unit test and coverage 💖"
+
+pip install -e .
+pip install -r requirements.txt
+
+SERVICE_NAME=$1
+
+echo "dependency-check:aggregate"
+mkdir -p reports
+dependency-check --enableExperimental -f JSON -f HTML -f XML \
+  --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
+  --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
+
+if [[ -z "${bamboo_repository_pr_key}" ]]
+then
+    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
+    /usr/bin/sonar-scanner/bin/sonar-scanner \
+      -Dsonar.projectKey=RED_$SERVICE_NAME \
+      -Dsonar.sources=image_prediction \
+      -Dsonar.host.url=https://sonarqube.iqser.com \
+      -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
+      -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
+      -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
+      -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
+      -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
+      -Dsonar.python.coverage.reportPaths=reports/coverage.xml
+ 
+else
+    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
+    /usr/bin/sonar-scanner/bin/sonar-scanner \
+      -Dsonar.projectKey=RED_$SERVICE_NAME \
+      -Dsonar.sources=image_prediction \
+      -Dsonar.host.url=https://sonarqube.iqser.com \
+      -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
+      -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
+      -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
+      -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
+      -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
+      -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
+      -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
+      -Dsonar.python.coverage.reportPaths=reports/coverage.xml
+fi
--- a/bamboo-specs/src/test/java/buildjob/PlanSpecTest.java
+++ b/bamboo-specs/src/test/java/buildjob/PlanSpecTest.java
@ -0,0 +1,16 @@
+package buildjob;
+
+
+import com.atlassian.bamboo.specs.api.builders.plan.Plan;
+import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
+import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
+import org.junit.Test;
+
+public class PlanSpecTest {
+    /**
+     * Builds the entity properties of the plan created by {@link PlanSpec#createDockerBuildPlan()}
+     * without contacting a Bamboo server.
+     *
+     * @throws PropertiesValidationException declared by the test; propagates if thrown while building
+     */
+    @Test
+    public void checkYourPlanOffline() throws PropertiesValidationException {
+        final PlanSpec planSpec = new PlanSpec();
+        final Plan dockerBuildPlan = planSpec.createDockerBuildPlan();
+
+        EntityPropertiesBuilders.build(dockerBuildPlan);
+    }
+}
--- a/bom.json
+++ b/bom.json
--- a/config.yaml
+++ b/config.yaml
@ -0,0 +1,28 @@
+webserver:
+  host: $SERVER_HOST|"127.0.0.1"  # webserver address
+  port: $SERVER_PORT|5000  # webserver port
+  mode: $SERVER_MODE|production  # webserver mode: {development, production}
+
+service:
+  logging_level: INFO  # Logging level for service logger
+  progressbar: True  # Whether a progress bar over the pages of a document is displayed while processing
+  batch_size: $BATCH_SIZE|32  # Number of images in memory simultaneously
+  verbose: $VERBOSE|True  # Service prints document processing progress to stdout
+  run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7  # The ID of the mlflow run to load the service_estimator from
+
+
+# These variables control filters that are applied to images, image metadata or service_estimator predictions. The
+# filter results are reported in the service responses. For convenience, the response to a request contains a
+# "filters.allPassed" field, which is set to false if any filter's returned value does not meet its required
+# value.
+filters:
+
+  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
+    min: $MIN_REL_IMAGE_SIZE|0.05  # Minimum permissible
+    max: $MAX_REL_IMAGE_SIZE|0.75  # Maximum permissible
+
+  image_width_to_height_quotient:  # Image width to height ratio
+    min: $MIN_IMAGE_FORMAT|0.1  # Minimum permissible
+    max: $MAX_IMAGE_FORMAT|10  # Maximum permissible
+
+  min_confidence: $MIN_CONFIDENCE|0.5  # Minimum permissible prediction confidence
--- a/config/pyinfra.toml
+++ b/config/pyinfra.toml
@ -1,68 +0,0 @@
-
-[asyncio]
-max_concurrent_tasks = 10
-
-[dynamic_tenant_queues]
-enabled = true
-
-[metrics.prometheus]
-enabled = true
-prefix = "redactmanager_image_service"
-
-[tracing]
-enabled = true
-# possible values "opentelemetry" | "azure_monitor" (Excpects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
-type = "azure_monitor" 
-
-[tracing.opentelemetry]
-endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
-service_name = "redactmanager_image_service"
-exporter = "otlp"
-
-[webserver]
-host = "0.0.0.0"
-port = 8080
-
-[rabbitmq]
-host = "localhost"
-port = 5672
-username = ""
-password = ""
-heartbeat = 60
-# Has to be a divider of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
-# This is also the minimum time the service needs to process a message
-connection_sleep = 5
-input_queue = "request_queue"
-output_queue = "response_queue"
-dead_letter_queue = "dead_letter_queue"
-
-tenant_event_queue_suffix = "_tenant_event_queue"
-tenant_event_dlq_suffix = "_tenant_events_dlq"
-tenant_exchange_name = "tenants-exchange"
-queue_expiration_time = 300000  # 5 minutes in milliseconds
-
-service_request_queue_prefix = "image_request_queue"
-service_request_exchange_name = "image_request_exchange"
-service_response_exchange_name = "image_response_exchange"
-service_dlq_name = "image_dlq"
-
-[storage]
-backend = "s3"
-
-[storage.s3]
-bucket = "redaction"
-endpoint = "http://127.0.0.1:9000"
-key = ""
-secret = ""
-region = "eu-central-1"
-
-[storage.azure]
-container = "redaction"
-connection_string = ""
-
-[storage.tenant_server]
-public_key = ""
-endpoint =  "http://tenant-user-management:8081/internal-api/tenants"
-
-[kubernetes]
-pod_name = "test_pod"
--- a/config/settings.toml
+++ b/config/settings.toml
@ -1,42 +0,0 @@
-[logging]
-level = "INFO"
-
-[service]
-# Print document processing progress to stdout
-verbose = false
-batch_size = 6
-image_stiching_tolerance = 1  # in pixels
-mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
-
-# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
-# The filter result values are reported in the service responses. For convenience the response to a request contains a
-# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
-# specified required value.
-[filters.confidence]
-# Minimum permissible prediction confidence
-min = 0.5
-
-# Image size to page size ratio (ratio of geometric means of areas)
-[filters.image_to_page_quotient]
-min = 0.05
-max = 0.75
-
-[filters.is_scanned_page]
-# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
-# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
-# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
-tolerance = 0
-
-# Image width to height ratio
-[filters.image_width_to_height_quotient]
-min = 0.1
-max = 10
-
-# put class specific filters here ['signature', 'formula', 'logo']
-[filters.overrides.signature.image_to_page_quotient]
-max = 0.4
-
-[filters.overrides.logo.image_to_page_quotient]
-min = 0.06
-
-
--- a/data/mlruns.dvc
+++ b/data/mlruns.dvc
@ -1,5 +1,5 @@
 outs:
- md5: ad061d607f615afc149643f62dbf37cc.dir
-  size: 166952700
+- md5: 4219c52caf5f87f5a94f1ae00c60fb91.dir
+  size: 166952679
  nfiles: 179
  path: mlruns
--- a/src/image_prediction/compositor/init.py
+++ b/src/image_prediction/compositor/init.py
--- a/src/image_prediction/encoder/init.py
+++ b/src/image_prediction/encoder/init.py
--- a/src/image_prediction/classifier/classifier.py
+++ b/src/image_prediction/classifier/classifier.py
@ -24,11 +24,10 @@ class Classifier:
        self.__pipe = rcompose(self.__estimator_adapter, self.__label_mapper)

    def predict(self, batch: Union[np.array, Tuple[Image]]) -> List[str]:
-
-        if isinstance(batch, np.ndarray) and batch.shape[0] == 0:
+        if not isinstance(batch, tuple) and batch.shape[0] == 0:
            return []

-        return self.__pipe(batch)
+        return list(self.__pipe(batch))

    def __call__(self, batch: np.array) -> List[str]:
        logger.debug("Classifier.predict")
--- a/src/image_prediction/classifier/image_classifier.py
+++ b/src/image_prediction/classifier/image_classifier.py
--- a/src/image_prediction/encoder/encoders/init.py
+++ b/src/image_prediction/encoder/encoders/init.py
--- a/src/image_prediction/compositor/compositor.py
+++ b/src/image_prediction/compositor/compositor.py
--- a/image_prediction/config.py
+++ b/image_prediction/config.py
@ -0,0 +1,40 @@
+"""Implements a config object with dot-indexing syntax."""
+
+
+from envyaml import EnvYAML
+
+from image_prediction.locations import CONFIG_FILE
+
+
+def _get_item_and_maybe_make_dotindexable(container, item):
+    """Return ``container[item]``, wrapped in a ``DotIndexable`` when the value is a dict.
+
+    Raises:
+        KeyError: propagated from ``container[item]`` when the key is missing (for dict-like containers).
+    """
+    ret = container[item]
+    return DotIndexable(ret) if isinstance(ret, dict) else ret
+
+
+class DotIndexable:
+    """Wraps a dict so that its keys can be read as attributes (``cfg.a.b``) or by subscript (``cfg["a"]["b"]``).
+
+    Nested dicts are wrapped on access, so dot-indexing works at every depth.
+    """
+
+    def __init__(self, x):
+        self.x = x
+
+    def __getattr__(self, item):
+        # __getattr__ only runs when normal lookup failed. Read the wrapped dict through __dict__ instead of
+        # `self.x`: on an instance that has no `x` yet (e.g. one created by copy or pickle) `self.x` would re-enter
+        # __getattr__ and recurse without end.
+        try:
+            wrapped = self.__dict__["x"]
+        except KeyError:
+            raise AttributeError(item) from None
+
+        # Attribute access has to signal a missing name with AttributeError; a KeyError would break hasattr() and
+        # getattr() with a default.
+        try:
+            return _get_item_and_maybe_make_dotindexable(wrapped, item)
+        except KeyError:
+            raise AttributeError(item) from None
+
+    def __repr__(self):
+        return self.x.__repr__()
+
+    def __getitem__(self, item):
+        # Subscript access keeps raising KeyError for a missing key.
+        return _get_item_and_maybe_make_dotindexable(self.x, item)
+
+
+class Config:
+    """Read-only view on a configuration file with attribute and subscript access to its top-level keys.
+
+    Dict values are returned wrapped in ``DotIndexable`` so nested keys can be dot-indexed as well. Unknown
+    top-level keys yield ``None`` rather than raising.
+    """
+
+    def __init__(self, config_path):
+        self.__config = EnvYAML(config_path)
+
+    def __getattr__(self, item):
+        # Only reached when normal attribute lookup failed. The storage attribute is name-mangled to
+        # `_Config__config`; fetch it via __dict__ so that an instance without it (failed __init__, copy, unpickle)
+        # raises AttributeError instead of recursing into __getattr__ forever.
+        config = self.__dict__.get("_Config__config")
+        if config is None:
+            raise AttributeError(item)
+
+        if item in config:
+            return _get_item_and_maybe_make_dotindexable(config, item)
+
+        # Unknown keys have always resolved to None (previously by falling off the end); keep that explicit.
+        return None
+
+    def __getitem__(self, item):
+        return self.__getattr__(item)
+
+
+CONFIG = Config(CONFIG_FILE)
--- a/src/image_prediction/default_objects.py
+++ b/src/image_prediction/default_objects.py
@ -3,17 +3,17 @@ from funcy import juxt
 from image_prediction.classifier.classifier import Classifier
 from image_prediction.classifier.image_classifier import ImageClassifier
 from image_prediction.compositor.compositor import TransformerCompositor
-from image_prediction.encoder.encoders.hash_encoder import HashEncoder
 from image_prediction.estimator.adapter.adapter import EstimatorAdapter
+from image_prediction.extractor_classifier.extractor_classifier import ExtractorClassifier
 from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
 from image_prediction.formatter.formatters.enum import EnumFormatter
+from image_prediction.transformer.transformers.response import ResponseTransformer
 from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
 from image_prediction.label_mapper.mappers.probability import ProbabilityMapper
 from image_prediction.model_loader.loader import ModelLoader
 from image_prediction.model_loader.loaders.mlflow import MlflowConnector
 from image_prediction.redai_adapter.mlflow import MlflowModelReader
 from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
-from image_prediction.transformer.transformers.response import ResponseTransformer


 def get_mlflow_model_loader(mlruns_dir):
@ -32,12 +32,16 @@ def get_extractor(**kwargs):
    return image_extractor


+def get_extractor_classifier(model_loader, model_identifier, **kwargs):
+    """Assemble an ``ExtractorClassifier`` from the default extractor and the default image classifier.
+
+    ``kwargs`` are forwarded to ``get_extractor``; ``model_loader`` and ``model_identifier`` are forwarded to
+    ``get_image_classifier``.
+    """
+    image_extractor = get_extractor(**kwargs)
+    image_classifier = get_image_classifier(model_loader, model_identifier)
+
+    return ExtractorClassifier(image_extractor, image_classifier)
+
+
 def get_formatter():
+    # Chains PDFNetCoordinateTransformer, EnumFormatter, ResponseTransformer and Snake2CamelCaseKeyFormatter, in that
+    # argument order, inside a single TransformerCompositor.
    formatter = TransformerCompositor(
        PDFNetCoordinateTransformer(), EnumFormatter(), ResponseTransformer(), Snake2CamelCaseKeyFormatter()
    )
    return formatter
-
-
-def get_encoder():
-    return HashEncoder()
--- a/src/image_prediction/estimator/init.py
+++ b/src/image_prediction/estimator/init.py
--- a/src/image_prediction/estimator/adapter/init.py
+++ b/src/image_prediction/estimator/adapter/init.py
--- a/src/image_prediction/estimator/adapter/adapter.py
+++ b/src/image_prediction/estimator/adapter/adapter.py
--- a/src/image_prediction/estimator/adapter/adapters/init.py
+++ b/src/image_prediction/estimator/adapter/adapters/init.py
--- a/src/image_prediction/estimator/preprocessor/init.py
+++ b/src/image_prediction/estimator/preprocessor/init.py
--- a/src/image_prediction/estimator/preprocessor/preprocessor.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessor.py
--- a/src/image_prediction/estimator/preprocessor/preprocessors/init.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessors/init.py
--- a/src/image_prediction/estimator/preprocessor/preprocessors/basic.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessors/basic.py
--- a/src/image_prediction/estimator/preprocessor/preprocessors/identity.py
+++ b/src/image_prediction/estimator/preprocessor/preprocessors/identity.py
--- a/src/image_prediction/estimator/preprocessor/utils.py
+++ b/src/image_prediction/estimator/preprocessor/utils.py
--- a/src/image_prediction/exceptions.py
+++ b/src/image_prediction/exceptions.py
@ -32,11 +32,3 @@ class IntentionalTestException(RuntimeError):

 class InvalidBox(Exception):
    pass
-
-
-class ParsingError(Exception):
-    pass
-
-
-class BadXref(ValueError):
-    pass
--- a/src/image_prediction/extraction.py
+++ b/src/image_prediction/extraction.py
--- a/image_prediction/extractor_classifier/init.py
+++ b/image_prediction/extractor_classifier/init.py
--- a/image_prediction/extractor_classifier/extractor_classifier.py
+++ b/image_prediction/extractor_classifier/extractor_classifier.py
@ -0,0 +1,32 @@
+from itertools import chain
+from typing import Iterable
+
+from funcy import chunks
+
+from image_prediction.classifier.image_classifier import ImageClassifier
+from image_prediction.image_extractor.extractor import ImageExtractor
+
+
+class ExtractorClassifier:
+    """Orchestrates the pairing of classifications and image metadata.
+
+    It extracts (image, metadata) pairs from an object, classifies the images batch-wise and ties each
+    classification to the metadata of its image. The result is an iterable of dictionaries, where each dictionary
+    has a field 'classification' for the prediction plus one field per metadata entry -- metadata could be void.
+    """
+
+    def __init__(self, image_extractor: ImageExtractor, image_classifier: ImageClassifier, batch_size: int = 16):
+        """
+        Args:
+            image_extractor: Callable that yields (image, metadata) pairs for an object
+            image_classifier: Callable that maps a tuple of images to one prediction per image
+            batch_size: Number of images handed to the classifier per call; defaults to the previously hard-coded 16
+
+        Raises:
+            ValueError: If batch_size is smaller than 1
+        """
+        if batch_size < 1:
+            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
+
+        self.classifier = image_classifier
+        self.extractor = image_extractor
+        self.batch_size = batch_size
+
+    def __process_batch(self, batch):
+        # Transpose the batch of pairs into a tuple of images and a tuple of metadata dicts.
+        images, metadata = zip(*batch)
+
+        predictions = self.classifier(images)
+        # Lazy: the response dicts are only built when the caller iterates.
+        responses = ({"classification": prd, **mdt} for prd, mdt in zip(predictions, metadata))
+        return responses
+
+    def __call__(self, obj, **kwargs) -> Iterable[dict]:
+        """Extracts images from `obj` (kwargs are forwarded to the extractor) and returns the lazily evaluated
+        responses for all batches, chained into one iterable."""
+        image_metadata_pairs = self.extractor(obj, **kwargs)
+        batches = chunks(self.batch_size, image_metadata_pairs)
+        predictions = chain.from_iterable(map(self.__process_batch, batches))
+        return predictions
--- a/src/image_prediction/flask.py
+++ b/src/image_prediction/flask.py
@ -1,20 +1,39 @@
+import multiprocessing
+import traceback
 from typing import Callable

 from flask import Flask, request, jsonify
-from prometheus_client import generate_latest, CollectorRegistry, Summary

 from image_prediction.utils import get_logger
-from image_prediction.utils.process_wrapping import wrap_in_process

 logger = get_logger()


+def run_in_process(func):
+    """Run the zero-argument callable ``func`` in a child process and block until that process has exited."""
+    worker = multiprocessing.Process(target=func)
+    worker.start()
+    worker.join()
+
+
+def wrap_in_process(func_to_wrap):
+    """Return a function that runs ``func_to_wrap`` in a child process and hands back its result.
+
+    The result travels through a ``multiprocessing.Manager`` dict. If ``func_to_wrap`` raises, the traceback is
+    logged in the child and the wrapper returns ``None``.
+
+    NOTE(review): the process target is a closure, which presumably requires the "fork" start method -- confirm
+    before running on a platform that defaults to "spawn".
+    """
+
+    def build_function_and_run_in_process(*args, **kwargs):
+        def func():
+            try:
+                return_dict["result"] = func_to_wrap(*args, **kwargs)
+            except Exception:
+                # Best effort: report the failure and leave "result" unset, which the caller sees as None.
+                # `Exception` (not a bare except) so SystemExit/KeyboardInterrupt are not swallowed.
+                logger.error(traceback.format_exc())
+
+        # The manager starts a server process of its own; the context manager shuts it down deterministically
+        # instead of leaving one behind per call until garbage collection.
+        with multiprocessing.Manager() as manager:
+            return_dict = manager.dict()
+            run_in_process(func)
+            # Read the result while the manager is still alive; the proxy is unusable after shutdown.
+            return return_dict.get("result", None)
+
+    return build_function_and_run_in_process
+
+
 def make_prediction_server(predict_fn: Callable):
    app = Flask(__name__)
-    registry = CollectorRegistry(auto_describe=True)
-    metric = Summary(
-        f"redactmanager_imageClassification_seconds", f"Time spent on image-service classification.", registry=registry
-    )

    @app.route("/ready", methods=["GET"])
    def ready():
@ -34,8 +53,6 @@ def make_prediction_server(predict_fn: Callable):
        return response

    @app.route("/predict", methods=["POST"])
-    @app.route("/", methods=["POST"])
-    @metric.time()
    def predict():

        # Tensorflow does not free RAM. Workaround: Run prediction function (which instantiates a model) in sub-process.
@ -45,7 +62,7 @@ def make_prediction_server(predict_fn: Callable):
        logger.info("Analysing...")
        predictions = predict_fn_wrapped(request.data)

-        if predictions is not None:
+        if predictions:
            response = jsonify(predictions)
            logger.info("Analysis completed.")
            return response
@ -53,8 +70,4 @@ def make_prediction_server(predict_fn: Callable):
            logger.error("Analysis failed.")
            return __failure()

-    @app.route("/prometheus", methods=["GET"])
-    def prometheus():
-        return generate_latest(registry=registry)
-
    return app
--- a/src/image_prediction/formatter/formatters/init.py
+++ b/src/image_prediction/formatter/formatters/init.py
--- a/src/image_prediction/formatter/formatter.py
+++ b/src/image_prediction/formatter/formatter.py
--- a/image_prediction/formatter/formatters/init.py
+++ b/image_prediction/formatter/formatters/init.py
--- a/src/image_prediction/formatter/formatters/camel_case.py
+++ b/src/image_prediction/formatter/formatters/camel_case.py
--- a/src/image_prediction/formatter/formatters/enum.py
+++ b/src/image_prediction/formatter/formatters/enum.py
--- a/src/image_prediction/formatter/formatters/identity.py
+++ b/src/image_prediction/formatter/formatters/identity.py
--- a/src/image_prediction/formatter/formatters/key_formatter.py
+++ b/src/image_prediction/formatter/formatters/key_formatter.py
--- a/src/image_prediction/image_extractor/extractors/init.py
+++ b/src/image_prediction/image_extractor/extractors/init.py
--- a/src/image_prediction/image_extractor/extractor.py
+++ b/src/image_prediction/image_extractor/extractor.py
--- a/image_prediction/image_extractor/extractors/init.py
+++ b/image_prediction/image_extractor/extractors/init.py
--- a/src/image_prediction/image_extractor/extractors/mock.py
+++ b/src/image_prediction/image_extractor/extractors/mock.py
--- a/image_prediction/image_extractor/extractors/parsable.py
+++ b/image_prediction/image_extractor/extractors/parsable.py
@ -0,0 +1,181 @@
+import atexit
+import io
+from functools import partial, lru_cache
+from itertools import chain, starmap, filterfalse, repeat
+from operator import itemgetter
+from typing import List
+
+import fitz
+from PIL import Image
+from funcy import rcompose, merge, zipdict
+from tqdm import tqdm
+
+from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
+from image_prediction.info import Info
+from image_prediction.stitching.stitching import stitch_pairs
+from image_prediction.stitching.utils import validate_box_coords, validate_box_size
+
+
+class ParsablePDFImageExtractor(ImageExtractor):
+    """Extracts (image, metadata) pairs from a PDF given as bytes, page by page, and stitches the pairs of each page
+    via `stitch_pairs`."""
+
+    def __init__(self, verbose=False, tolerance=0):
+        """
+
+        Args:
+            verbose: Whether to show a progress bar
+            tolerance: The tolerance in pixels that is passed on to `stitch_pairs` -- presumably the maximum distance
+                between images up to which they are still stitched together; confirm in the stitching module
+        """
+        self.doc: fitz.fitz.Document = None
+        self.verbose = verbose
+        self.tolerance = tolerance
+
+    def extract(self, pdf: bytes, page_range: range = None):
+        """Yields the image-metadata pairs of all processed pages.
+
+        Args:
+            pdf: The PDF document as bytes
+            page_range: Pages to process. A falsy value -- None, but also an empty range -- selects every page.
+        """
+        self.doc = fitz.Document(stream=pdf)
+
+        pages = extract_pages(self.doc, page_range) if page_range else self.doc
+
+        # Pages are mapped lazily, so a page is only processed once the consumer reaches it.
+        image_metadata_pairs = chain.from_iterable(
+            map(
+                self.__process_images_on_page,
+                tqdm(pages, desc="Extracting", disable=not self.verbose, total=len(page_range) if page_range else None),
+            )
+        )
+
+        yield from image_metadata_pairs
+
+    def __process_images_on_page(self, page: fitz.fitz.Page):
+        # Both values are lazy (a map object and a generator); nothing has been loaded when clear_caches() runs, so
+        # that call drops the entries left over from the previously processed page.
+        images = get_images_on_page(self.doc, page)
+        metadata = get_metadata_for_images_on_page(self.doc, page)
+        clear_caches()
+
+        # filter(all, ...) drops every pair whose image or metadata is falsy, e.g. an image that could not be loaded.
+        # NOTE(review): `images` has one entry per image info, while `metadata` has already passed
+        # filter_out_tiny_images; zip pairs by position, so after a dropped tiny image the remaining images may be
+        # paired with the metadata of a different image -- confirm.
+        image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
+        image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
+
+        yield from image_metadata_pairs
+
+
+def extract_pages(doc, page_range):
+    """Lazily load the pages of ``doc`` selected by ``page_range``.
+
+    Start and stop are both shifted up by one before loading, and any step of ``page_range`` is ignored.
+    NOTE(review): the shift looks like a conversion between page numbering schemes -- confirm it matches the
+    indexing that ``doc.load_page`` expects.
+    """
+    shifted_range = range(page_range.start + 1, page_range.stop + 1)
+
+    return map(doc.load_page, shifted_range)
+
+
+@lru_cache(maxsize=None)
+def get_images_on_page(doc, page: fitz.Page):
+    """Returns a lazy iterator over the images referenced on `page`, one per image info and in image info order.
+
+    Entries are whatever `xref_to_image` returns, which is None for an xref that yields no image data.
+
+    NOTE(review): the return value is a one-shot `map` object and the function is memoised, so a second call with the
+    same arguments before `clear_caches()` returns the very same, possibly already consumed, iterator.
+    """
+    image_infos = get_image_infos(page)
+    xrefs = map(itemgetter("xref"), image_infos)
+    images = map(partial(xref_to_image, doc), xrefs)
+
+    return images
+
+
+def get_metadata_for_images_on_page(doc, page: fitz.Page):
+    """Yields one metadata dict per image on `page` that survives the tiny-image filter.
+
+    This is a generator function: none of the steps below run before the result is iterated. The steps are applied
+    in the order written -- coordinate validation, removal of tiny images, size validation, merging in the page
+    metadata and finally merging in the alpha channel flag.
+    """
+
+    metadata = map(get_image_metadata, get_image_infos(page))
+    metadata = validate_coords_and_passthrough(metadata)
+
+    # From here on the stream can be shorter than the list of image infos.
+    metadata = filter_out_tiny_images(metadata)
+    metadata = validate_size_and_passthrough(metadata)
+
+    metadata = add_page_metadata(page, metadata)
+
+    metadata = add_alpha_channel_info(doc, page, metadata)
+
+    yield from metadata
+
+
+@lru_cache(maxsize=None)
+def get_image_infos(page: fitz.Page) -> List[dict]:
+    """Returns `page.get_image_info(xrefs=True)`, memoised per page until `clear_caches()` is called.
+
+    The other helpers in this module read the "xref" and "bbox" entries of each returned dict.
+    """
+    return page.get_image_info(xrefs=True)
+
+
+@lru_cache(maxsize=None)
+def xref_to_image(doc, xref) -> Image:
+    """Open the image stored under ``xref`` as a PIL image; returns None when no image handle could be loaded."""
+    handle = load_image_handle_from_xref(doc, xref)
+
+    if not handle:
+        return None
+
+    return Image.open(io.BytesIO(handle["image"]))
+
+
+def get_image_metadata(image_info):
+    """Derive the rounded box coordinates and the box width and height from the "bbox" entry of ``image_info``."""
+    coord_a, coord_b, coord_c, coord_d = (rounder(value) for value in image_info["bbox"])
+
+    return {
+        Info.WIDTH: abs(coord_c - coord_a),
+        Info.HEIGHT: abs(coord_d - coord_b),
+        Info.X1: coord_a,
+        Info.X2: coord_c,
+        Info.Y1: coord_b,
+        Info.Y2: coord_d,
+    }
+
+
+def validate_coords_and_passthrough(metadata):
+    """Lazily run ``validate_box_coords`` on every metadata entry and yield whatever it returns."""
+    for entry in metadata:
+        yield validate_box_coords(entry)
+
+
+def filter_out_tiny_images(metadata):
+    """Lazily drop every metadata entry for which ``tiny`` is truthy."""
+    return (entry for entry in metadata if not tiny(entry))
+
+
+def validate_size_and_passthrough(metadata):
+    """Lazily run ``validate_box_size`` on every metadata entry and yield whatever it returns."""
+    for entry in metadata:
+        yield validate_box_size(entry)
+
+
+def add_page_metadata(page, metadata):
+    """Lazily merge the metadata of ``page`` (computed once, up front) with every image metadata entry."""
+    page_metadata = get_page_metadata(page)
+
+    return (merge(page_metadata, entry) for entry in metadata)
+
+
+def add_alpha_channel_info(doc, page, metadata):
+    """Lazily merges an `Info.ALPHA` flag into each metadata dict.
+
+    The flags are computed with `has_alpha_channel` for the xref of every image info on `page` and matched to the
+    metadata entries by position.
+
+    NOTE(review): `get_metadata_for_images_on_page` calls this after `filter_out_tiny_images`, so `metadata` can be
+    shorter than the list of image infos; in that case the flags following a dropped entry may be attributed to the
+    wrong image -- confirm.
+    """
+    xrefs = map(itemgetter("xref"), get_image_infos(page))
+    alpha = map(partial(has_alpha_channel, doc), xrefs)
+    alpha = ({Info.ALPHA: a} for a in alpha)
+    # alpha = map(dict, zip(repeat(Info.ALPHA), alpha))
+    # merge(alpha, metadata): both dicts are combined into a new one per image.
+    metadata = starmap(merge, zip(alpha, metadata))
+
+    return metadata
+
+
+@lru_cache(maxsize=None)
+def load_image_handle_from_xref(doc, xref):
+    """Returns `doc.extract_image(xref)`, memoised per (doc, xref) until `clear_caches()` is called.
+
+    Callers in this module read the "image" and "smask" entries of the result and treat a falsy result as
+    "no image".
+    """
+    return doc.extract_image(xref)
+
+
+rounder = rcompose(round, int)
+
+
+def get_page_metadata(page):
+    """Collect the rounded mediabox width and height of ``page`` together with its page number."""
+    rounded_width, rounded_height = (rounder(extent) for extent in page.mediabox_size)
+
+    return {
+        Info.PAGE_WIDTH: rounded_width,
+        Info.PAGE_HEIGHT: rounded_height,
+        Info.PAGE_IDX: page.number,
+    }
+
+
+def has_alpha_channel(doc, xref):
+    """Returns whether the image stored under `xref` carries transparency information.
+
+    If the image handle names a soft mask ("smask"), the image counts as having an alpha channel when the mask can
+    be extracted or when the pixmap built from the mask reports alpha. Without a soft mask the alpha flag of the
+    pixmap built from `xref` itself decides.
+    """
+
+    maybe_image = load_image_handle_from_xref(doc, xref)
+    maybe_smask = maybe_image["smask"] if maybe_image else None
+
+    if maybe_smask:
+        # `or` short-circuits: the pixmap for the mask is only constructed when extraction alone does not already
+        # settle the answer. The previous any([...]) built both operands eagerly.
+        return doc.extract_image(maybe_smask) is not None or bool(fitz.Pixmap(doc, maybe_smask).alpha)
+
+    return bool(fitz.Pixmap(doc, xref).alpha)
+
+
+def tiny(metadata):
+    """Tell whether the box described by ``metadata`` covers an area of at most 4 (width times height)."""
+    area = metadata[Info.WIDTH] * metadata[Info.HEIGHT]
+    return area <= 4
+
+
+def clear_caches():
+    """Empty the ``lru_cache`` of every memoised helper in this module."""
+    memoised_helpers = (get_image_infos, load_image_handle_from_xref, get_images_on_page, xref_to_image)
+
+    for helper in memoised_helpers:
+        helper.cache_clear()
+
+
+atexit.register(clear_caches)
--- a/src/image_prediction/info.py
+++ b/src/image_prediction/info.py
@ -12,4 +12,3 @@ class Info(Enum):
    Y1 = "y1"
    Y2 = "y2"
    ALPHA = "alpha"
-    XREF = "xref"
--- a/src/image_prediction/label_mapper/mappers/init.py
+++ b/src/image_prediction/label_mapper/mappers/init.py
--- a/src/image_prediction/label_mapper/mapper.py
+++ b/src/image_prediction/label_mapper/mapper.py
--- a/image_prediction/label_mapper/mappers/init.py
+++ b/image_prediction/label_mapper/mappers/init.py
--- a/src/image_prediction/label_mapper/mappers/numeric.py
+++ b/src/image_prediction/label_mapper/mappers/numeric.py
--- a/src/image_prediction/label_mapper/mappers/probability.py
+++ b/src/image_prediction/label_mapper/mappers/probability.py
--- a/image_prediction/locations.py
+++ b/image_prediction/locations.py
@ -0,0 +1,17 @@
+"""Defines constant paths relative to the module root path."""
+
+from pathlib import Path
+
+# Directory that contains this file.
+MODULE_DIR = Path(__file__).resolve().parents[0]
+
+# Parent of the module directory; every path below is derived from it.
+PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
+
+CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
+
+BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
+
+DATA_DIR = PACKAGE_ROOT_DIR / "data"
+
+# Unlike the other constants this one is a plain string, not a Path.
+MLRUNS_DIR = str(DATA_DIR / "mlruns")
+
+TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data"
--- a/src/image_prediction/model_loader/database/init.py
+++ b/src/image_prediction/model_loader/database/init.py
--- a/src/image_prediction/model_loader/database/connectors/init.py
+++ b/src/image_prediction/model_loader/database/connectors/init.py
--- a/src/image_prediction/model_loader/database/connector.py
+++ b/src/image_prediction/model_loader/database/connector.py
--- a/image_prediction/model_loader/database/connectors/init.py
+++ b/image_prediction/model_loader/database/connectors/init.py
--- a/src/image_prediction/model_loader/database/connectors/mock.py
+++ b/src/image_prediction/model_loader/database/connectors/mock.py
--- a/src/image_prediction/model_loader/loader.py
+++ b/src/image_prediction/model_loader/loader.py
--- a/image_prediction/model_loader/loaders/init.py
+++ b/image_prediction/model_loader/loaders/init.py
--- a/src/image_prediction/model_loader/loaders/mlflow.py
+++ b/src/image_prediction/model_loader/loaders/mlflow.py
--- a/image_prediction/pipeline.py
+++ b/image_prediction/pipeline.py
@ -0,0 +1,26 @@
+import os
+
+from funcy import rcompose
+
+from image_prediction.config import CONFIG
+from image_prediction.default_objects import get_extractor_classifier, get_formatter, get_mlflow_model_loader
+from image_prediction.locations import MLRUNS_DIR
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+
+def load_pipeline(**kwargs):
+    """Create a ``Pipeline`` from the mlflow model loader for ``MLRUNS_DIR`` and the run id set in the config.
+
+    ``kwargs`` are forwarded to the ``Pipeline`` constructor.
+    """
+    return Pipeline(get_mlflow_model_loader(MLRUNS_DIR), CONFIG.service.run_id, **kwargs)
+
+
+class Pipeline:
+    """Composition of the extractor-classifier and the formatter, applied in that order."""
+
+    def __init__(self, model_loader, model_identifier, **kwargs):
+        extractor_classifier = get_extractor_classifier(model_loader, model_identifier, **kwargs)
+        formatter = get_formatter()
+
+        self.pipe = rcompose(extractor_classifier, formatter)
+
+    def __call__(self, pdf: bytes, page_range: range = None):
+        """Yield the items the composed pipeline produces for ``pdf``; being a generator, nothing runs before the
+        result is iterated."""
+        results = self.pipe(pdf, page_range=page_range)
+        yield from results
--- a/image_prediction/redai_adapter/init.py
+++ b/image_prediction/redai_adapter/init.py
--- a/src/image_prediction/redai_adapter/efficient_net_wrapper.py
+++ b/src/image_prediction/redai_adapter/efficient_net_wrapper.py
--- a/src/image_prediction/redai_adapter/mlflow.py
+++ b/src/image_prediction/redai_adapter/mlflow.py
--- a/src/image_prediction/redai_adapter/model.py
+++ b/src/image_prediction/redai_adapter/model.py
--- a/src/image_prediction/redai_adapter/model_wrapper.py
+++ b/src/image_prediction/redai_adapter/model_wrapper.py
--- a/src/image_prediction/transformer/init.py
+++ b/src/image_prediction/transformer/init.py
--- a/src/image_prediction/stitching/grouping.py
+++ b/src/image_prediction/stitching/grouping.py
--- a/src/image_prediction/stitching/merging.py
+++ b/src/image_prediction/stitching/merging.py
@ -3,7 +3,7 @@ from functools import reduce
 from typing import Iterable, Callable, List

 from PIL import Image
-from funcy import juxt, first, rest, rcompose, rpartial, complement, ilen
+from funcy import juxt, first, rest, rcompose, rpartial

 from image_prediction.image_extractor.extractor import ImageMetadataPair
 from image_prediction.info import Info
@ -13,22 +13,8 @@ from image_prediction.stitching.utils import make_coord_getter, flatten_groups_o
 from image_prediction.utils.generic import until


-def make_merger_sentinel():
-    def no_new_mergers(pairs):
-        nonlocal number_of_pairs_so_far
-
-        number_of_pairs_now = len(pairs)
-
-        if number_of_pairs_now == number_of_pairs_so_far:
-            return True
-
-        else:
-            number_of_pairs_so_far = number_of_pairs_now
-            return False
-
-    number_of_pairs_so_far = -1
-
-    return no_new_mergers
+def no_new_merges(pairs1, pairs2):
+    """Tell whether both collections hold the same number of pairs, i.e. the step between them merged nothing."""
+    count_before, count_after = len(pairs1), len(pairs2)
+    return count_before == count_after


 def merge_along_both_axes(pairs: Iterable[ImageMetadataPair], tolerance=0) -> List[ImageMetadataPair]:
@ -86,8 +72,7 @@ def merge_group_horizontally(group: Iterable[ImageMetadataPair], tolerance=0):

 def merge_group(group: Iterable[ImageMetadataPair], direction, tolerance=0):
    reduce_group = make_merger_aggregator(direction, tolerance=tolerance)
-    no_new_mergers = make_merger_sentinel()
-    return until(no_new_mergers, reduce_group, group)
+    return until(no_new_merges, reduce_group, group)


 def make_merger_aggregator(axis, tolerance=0) -> Callable[[Iterable[ImageMetadataPair]], Iterable[ImageMetadataPair]]:
--- a/src/image_prediction/stitching/split_mapper.py
+++ b/src/image_prediction/stitching/split_mapper.py
--- a/src/image_prediction/stitching/stitching.py
+++ b/src/image_prediction/stitching/stitching.py
@ -3,13 +3,11 @@ from typing import Iterable, List
 from funcy import rpartial

 from image_prediction.image_extractor.extractor import ImageMetadataPair
-from image_prediction.stitching.merging import merge_along_both_axes, make_merger_sentinel
+from image_prediction.stitching.merging import merge_along_both_axes, no_new_merges
 from image_prediction.utils.generic import until


 def stitch_pairs(pairs: Iterable[ImageMetadataPair], tolerance=0) -> List[ImageMetadataPair]:
    """Given a collection of image-metadata pairs from the same pages, combines all pairs that constitute adjacent
    images."""
-    no_new_mergers = make_merger_sentinel()
-    merge = rpartial(merge_along_both_axes, tolerance)
-    return until(no_new_mergers, merge, pairs)
+    return until(no_new_merges, rpartial(merge_along_both_axes, tolerance), pairs)
--- a/src/image_prediction/stitching/utils.py
+++ b/src/image_prediction/stitching/utils.py
--- a/src/image_prediction/transformer/transformers/init.py
+++ b/src/image_prediction/transformer/transformers/init.py
--- a/src/image_prediction/transformer/transformer.py
+++ b/src/image_prediction/transformer/transformer.py
--- a/src/image_prediction/transformer/transformers/coordinate/init.py
+++ b/src/image_prediction/transformer/transformers/coordinate/init.py
--- a/image_prediction/transformer/transformers/coordinate/init.py
+++ b/image_prediction/transformer/transformers/coordinate/init.py
--- a/src/image_prediction/transformer/transformers/coordinate/coordinate_transformer.py
+++ b/src/image_prediction/transformer/transformers/coordinate/coordinate_transformer.py
--- a/src/image_prediction/transformer/transformers/coordinate/fitz.py
+++ b/src/image_prediction/transformer/transformers/coordinate/fitz.py
--- a/src/image_prediction/transformer/transformers/coordinate/fpdf.py
+++ b/src/image_prediction/transformer/transformers/coordinate/fpdf.py
--- a/src/image_prediction/transformer/transformers/coordinate/pdfnet.py
+++ b/src/image_prediction/transformer/transformers/coordinate/pdfnet.py
--- a/src/image_prediction/transformer/transformers/response.py
+++ b/src/image_prediction/transformer/transformers/response.py
@ -1,5 +1,4 @@
 import math
-from dynaconf import Dynaconf
 from operator import itemgetter

 from image_prediction.config import CONFIG
@ -16,45 +15,38 @@ class ResponseTransformer(Transformer):


 def build_image_info(data: dict) -> dict:
-    page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
-        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
+    def compute_geometric_quotient():
+        page_area_sqrt = math.sqrt(abs(page_width * page_height))
+        image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
+        return image_area_sqrt / page_area_sqrt
+
+    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
+        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
    )(data)

-    classification = data["classification"]
-    label = classification["label"]
-    representation = data["representation"]
-
-    geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
-
-    min_image_to_page_quotient_breached = bool(
-        geometric_quotient < get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "min")
-    )
-    max_image_to_page_quotient_breached = bool(
-        geometric_quotient > get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "max")
-    )
+    quotient = round(compute_geometric_quotient(), 4)

+    min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
+    max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
    min_image_width_to_height_quotient_breached = bool(
-        width / height < get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "min")
+        width / height < CONFIG.filters.image_width_to_height_quotient.min
    )
    max_image_width_to_height_quotient_breached = bool(
-        width / height > get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "max")
+        width / height > CONFIG.filters.image_width_to_height_quotient.max
    )

-    min_confidence_breached = bool(
-        max(classification["probabilities"].values())
-        < get_class_specific_filter_value(label, CONFIG, "confidence", "min")
-    )
+    classification = data["classification"]
+
+    min_confidence_breached = bool(max(classification["probabilities"].values()) < CONFIG.filters.min_confidence)

    image_info = {
        "classification": classification,
-        "representation": representation,
        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": data["page_idx"] + 1},
        "geometry": {"width": width, "height": height},
-        "alpha": alpha,
        "filters": {
            "geometry": {
                "imageSize": {
-                    "quotient": geometric_quotient,
+                    "quotient": quotient,
                    "tooLarge": max_image_to_page_quotient_breached,
                    "tooSmall": min_image_to_page_quotient_breached,
                },
@ -78,23 +70,3 @@ def build_image_info(data: dict) -> dict:
    }

    return image_info
-
-
-def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1):
-    page_area_sqrt = math.sqrt(abs(page_width * page_height))
-    image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
-    return image_area_sqrt / page_area_sqrt
-
-
-def get_class_specific_filter_value(label: str, settings: Dynaconf, filter_type: str, bound: str = None):
-    try:
-        value = (
-            settings.filters.overrides[label][filter_type][bound]
-            if bound
-            else settings.filters.overrides[label][filter_type]
-        )
-        logger.warning(f"Using {label=} specific {bound=} {filter_type=} {value=}.")
-    except KeyError:
-        value = settings.filters[filter_type][bound]
-
-    return value
--- a/src/image_prediction/utils.py
+++ b/src/image_prediction/utils.py
--- a/src/image_prediction/utils/init.py
+++ b/src/image_prediction/utils/init.py
--- a/src/image_prediction/utils/banner.py
+++ b/src/image_prediction/utils/banner.py
@ -4,7 +4,8 @@ from image_prediction.locations import BANNER_FILE


 def show_banner():
-    banner = load_banner()
+    with open(BANNER_FILE) as f:
+        banner = "\n" + "".join(f.readlines()) + "\n"

    logger = logging.getLogger(__name__)
    logger.propagate = False
@ -18,9 +19,3 @@ def show_banner():
    logger.addHandler(handler)

    logger.info(banner)
-
-
-def load_banner():
-    with open(BANNER_FILE) as f:
-        banner = "\n" + "".join(f.readlines()) + "\n"
-    return banner
--- a/image_prediction/utils/generic.py
+++ b/image_prediction/utils/generic.py
@ -0,0 +1,7 @@
+from funcy import iterate, chunks
+
+
+def until(cond, func, *args, **kwargs):
+    for a, b in chunks(2, iterate(func, *args, **kwargs)):
+        if cond(a, b):
+            return a
--- a/image_prediction/utils/logger.py
+++ b/image_prediction/utils/logger.py
@ -0,0 +1,29 @@
+import logging
+
+from image_prediction.config import CONFIG
+
+logging.basicConfig()
+
+
+def make_logger_getter():
+    logger = logging.getLogger("imclf")
+    logger.propagate = False
+
+    handler = logging.StreamHandler()
+    handler.setLevel(CONFIG.service.logging_level)
+
+    log_format = "%(asctime)s %(levelname)-8s %(message)s"
+    formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
+
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    logger.setLevel(CONFIG.service.logging_level)
+
+    def get_logger():
+        return logger
+
+    return get_logger
+
+
+get_logger = make_logger_getter()
--- a/src/image_prediction/utils/pdf_annotation.py
+++ b/src/image_prediction/utils/pdf_annotation.py
@ -56,8 +56,7 @@ def annotate_image(doc, image_info):

 def init():
    PDFNet.Initialize(
-        # "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
-        "Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
+        "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
    )


--- a/test/unit_tests/init.py
+++ b/test/unit_tests/init.py
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Matthias Bisping	03e7b00cfd	refactoring	2022-04-14 12:20:05 +02:00
Matthias Bisping	7aee00cb49	alpha channel querying improved	2022-04-13 17:31:33 +02:00
Matthias Bisping	2cc52c4630	renaming	2022-04-13 13:36:45 +02:00
Matthias Bisping	daa1da3a50	fix name	2022-04-13 13:17:23 +02:00
Matthias Bisping	6a7debde14	added exploration tests	2022-04-13 13:15:05 +02:00
Matthias Bisping	b4f279c549	test for until	2022-04-13 13:12:19 +02:00
Matthias Bisping	f5881f2229	formatting	2022-04-13 13:06:20 +02:00
Matthias Bisping	62bfedfea8	alpha channel test fix	2022-04-13 12:06:55 +02:00
Matthias Bisping	1d88876ab1	alpha channel info WIP	2022-04-12 18:44:04 +02:00
Matthias Bisping	bbafad5561	refactoring in preparationfor alpha channel info	2022-04-12 18:22:38 +02:00
Matthias Bisping	f17a232009	tests for box validation	2022-04-12 16:54:40 +02:00
Matthias Bisping	88a46ae7cd	adjustet expected output for actual pipeline test for change from pixel to pdf units (x2 - x1 etc.) for width and height fields of metadata records	2022-04-12 16:42:31 +02:00
Matthias Bisping	e82a81f5c8	refactoring	2022-04-12 16:34:00 +02:00
Matthias Bisping	35c5b15e32	tolerance forwarding through pipeline constructor; box validation; tiny box filtering	2022-04-12 16:29:20 +02:00
Matthias Bisping	698e647c6f	applied black	2022-04-12 15:06:18 +02:00
Matthias Bisping	d8f86d14a5	fuzzy stitching completed	2022-04-12 15:04:32 +02:00
Matthias Bisping	bb7c1be630	fuzzy stitching WIP: mostly works, but sometimes fails. run test_image_stitcher_with_gaps to debug	2022-04-11 19:20:47 +02:00
Matthias Bisping	79cd31850d	fuzzy stitching WIP: added tolerance to stitching; added fuzzification function; added tests for grouping and (fuzzy and exact)	2022-04-11 16:47:47 +02:00
Matthias Bisping	3d335783dc	topological sorting of definitions by caller hierarchy	2022-04-11 16:08:54 +02:00
Matthias Bisping	bb79f9dd55	applied black	2022-04-11 13:57:32 +02:00
Matthias Bisping	585cdf5c70	integrated stitching into parsable pdf extractor	2022-04-11 13:57:10 +02:00
Matthias Bisping	04cf0245ed	formatting	2022-04-11 13:38:09 +02:00
Matthias Bisping	3530ef72c5	docstring update	2022-04-11 13:37:46 +02:00
Matthias Bisping	d80af336eb	refactoring	2022-04-11 13:28:39 +02:00
Matthias Bisping	bcf6dc5c47	generalized split mapper	2022-04-11 13:03:02 +02:00
Matthias Bisping	f4c0547405	refactoring: replaced split mapper with dataclass	2022-04-11 12:16:42 +02:00
Matthias Bisping	1bea5fb9a8	refactoring	2022-04-11 10:29:13 +02:00
Matthias Bisping	57440f5106	refactoring	2022-04-11 09:53:32 +02:00
Matthias Bisping	710783a2f8	merging algorithm explanation adjusted	2022-04-11 09:28:00 +02:00
Matthias Bisping	887b8339a2	renaming	2022-04-08 14:17:05 +02:00
Matthias Bisping	43cb0fffed	refactoring	2022-04-08 14:13:03 +02:00
Matthias Bisping	6e7645e319	topological sorting of definitions by caller hierarchy	2022-04-08 14:04:48 +02:00
Matthias Bisping	3b18fc6158	refactoring	2022-04-08 13:56:57 +02:00
Matthias Bisping	1b10445f91	refactoring	2022-04-08 12:01:20 +02:00
Matthias Bisping	5967149c49	refactoring	2022-04-07 21:49:55 +02:00
Matthias Bisping	303970db51	refactoring	2022-04-07 21:44:04 +02:00
Matthias Bisping	51793d19e9	refactoring	2022-04-07 21:39:01 +02:00
Matthias Bisping	e276a5ec27	refactoring	2022-04-07 21:20:55 +02:00
Matthias Bisping	7e6fe7cf11	refactoring	2022-04-07 21:12:57 +02:00
Matthias Bisping	bb5db1b4ef	refactoring	2022-04-07 20:47:58 +02:00
Matthias Bisping	8ac9fcb19f	stitcher test passes	2022-04-07 19:40:26 +02:00
Matthias Bisping	160973e2be	refactoring	2022-04-07 19:05:13 +02:00
Matthias Bisping	803cc57155	refactoring	2022-04-07 18:48:12 +02:00
Matthias Bisping	50b4d239cb	group merging done	2022-04-07 18:05:15 +02:00
Matthias Bisping	9bb07f95fb	refactoring	2022-04-07 17:51:53 +02:00
Matthias Bisping	29028cc1a5	refactoring	2022-04-07 17:44:54 +02:00
Matthias Bisping	2fcb0bd149	refactoring	2022-04-07 17:28:25 +02:00
Matthias Bisping	3e882dc247	group merging wip	2022-04-07 17:18:09 +02:00
Matthias Bisping	2b1e7cbb08	added img-mdat-pair merging logic	2022-04-07 16:11:12 +02:00
Matthias Bisping	5e8b55ef10	added image concatenation; refactoring	2022-04-07 11:42:38 +02:00
Matthias Bisping	3266e0af58	refactoring; added metadata merging logic	2022-04-06 15:55:35 +02:00
Matthias Bisping	7e2696d5c5	stitching impl wip	2022-04-05 23:39:17 +02:00
Matthias Bisping	302613bf2b	refactoring eager eval because double iter later	2022-04-05 23:08:41 +02:00
Matthias Bisping	66fd103d1b	refactoring	2022-04-05 22:56:08 +02:00
Matthias Bisping	6e5d6912ed	refactoring	2022-04-05 22:53:26 +02:00
Matthias Bisping	b1efb5ed09	refactoring	2022-04-05 19:40:13 +02:00
Matthias Bisping	ef70e11352	refactoring	2022-04-05 19:38:29 +02:00
Matthias Bisping	315679468b	applied black	2022-04-05 19:35:36 +02:00
Matthias Bisping	64e3350dee	refactoring	2022-04-05 19:35:13 +02:00
Matthias Bisping	6a7e0e1000	refactoring	2022-04-05 19:33:22 +02:00
Matthias Bisping	11fc63035d	refactoring	2022-04-05 19:03:31 +02:00
Matthias Bisping	4bc295b212	refactoring	2022-04-05 18:57:08 +02:00
Matthias Bisping	4c46be4abc	test param adjustment	2022-04-05 18:09:43 +02:00
Matthias Bisping	37ee086b5d	applied black	2022-04-05 17:55:38 +02:00
Matthias Bisping	1fd30e68b6	test data generation for image stitching	2022-04-05 17:54:43 +02:00
Matthias Bisping	2c908162f1	refactoring	2022-04-05 16:31:57 +02:00
Matthias Bisping	4756b8c9bd	refactoring	2022-04-05 13:03:22 +02:00
Matthias Bisping	e0885c545a	added page range paramter to extractor	2022-04-05 13:03:17 +02:00
Matthias Bisping	fdb7ebe618	logging change	2022-04-04 23:37:49 +02:00
Matthias Bisping	ce69f7d160	removed obsolete imports	2022-04-04 21:50:10 +02:00
Matthias Bisping	8f61c4cba2	doc.extract_image(xref) can yield None; hence added filtering for None images	2022-04-04 21:49:45 +02:00
Matthias Bisping	f3e2b2335f	updated dependency versions	2022-04-04 19:35:49 +02:00
Matthias Bisping	9cda65ad41	removed obsolete code	2022-04-04 18:30:43 +02:00
Matthias Bisping	692e72b3b2	refactoring	2022-04-04 18:29:17 +02:00
Matthias Bisping	38869d52c6	refactoring	2022-04-04 18:17:49 +02:00
Matthias Bisping	e01b5c9acd	refactoring	2022-04-04 15:50:09 +02:00
Matthias Bisping	6a6fc19958	refactoring	2022-04-04 15:48:15 +02:00
Matthias Bisping	1b1f1aafef	refactoring	2022-04-04 14:19:06 +02:00
Matthias Bisping	caef37376b	renaming	2022-04-04 14:04:36 +02:00
Matthias Bisping	16aa951c96	refactoring	2022-04-04 14:01:19 +02:00
Matthias Bisping	89afb8f920	added cooridate transformation testing by images	2022-04-04 13:55:48 +02:00
Matthias Bisping	1ffc9dcc68	refactoring	2022-04-04 13:12:08 +02:00
Matthias Bisping	0976971117	refactoring	2022-04-04 10:23:22 +02:00
Matthias Bisping	b4b0058475	added additional corners coordinates for coordinate transformation tests	2022-04-04 10:18:23 +02:00
Matthias Bisping	2ee36dcb54	applied black	2022-04-03 04:48:11 +02:00
Matthias Bisping	ab382646b7	applied black	2022-04-03 04:47:49 +02:00
Matthias Bisping	8c916a79c3	updated gitignore	2022-04-03 04:47:36 +02:00
Matthias Bisping	3ff6dac2e0	added explanations for how the coordinate transformations were inferred	2022-04-03 04:46:52 +02:00
Matthias Bisping	d134884553	misc	2022-04-03 04:35:44 +02:00
Matthias Bisping	2d0545c928	refactoring	2022-04-03 04:31:50 +02:00
Matthias Bisping	65a4a8e34e	refactoring	2022-04-03 04:25:10 +02:00
Matthias Bisping	39c111fd42	integrated PDFNet coordinate transformer into pipeline	2022-04-03 04:08:00 +02:00
Matthias Bisping	0376223c9d	coordinate transformers refac	2022-04-03 04:00:15 +02:00
Matthias Bisping	bf85ef357c	coordinate transformers version 1 completed	2022-04-03 03:51:31 +02:00
Matthias Bisping	f6a7a14a20	pdfnet coordinate transformer wip	2022-04-03 03:19:46 +02:00
Matthias Bisping	41f783dc5d	coordinate transformer refac	2022-04-03 02:21:30 +02:00
Matthias Bisping	32397256c8	coordinate transformer wip	2022-04-03 02:20:03 +02:00
Matthias Bisping	f44e6f4fd7	coordinate transformer, added Fitz transformer	2022-04-03 02:15:41 +02:00
Matthias Bisping	3d2c97bc10	coordinate transformer wip	2022-04-03 01:58:51 +02:00
Matthias Bisping	9663cec12d	coordinate transformer wip	2022-04-03 01:54:51 +02:00
Matthias Bisping	c1c3f541d4	coordinate transformer wip	2022-04-03 01:45:01 +02:00
Matthias Bisping	4d86e78307	muting logger in tests	2022-04-02 19:31:08 +02:00
Matthias Bisping	1cf6ab256c	muting logger in tests	2022-04-02 18:34:13 +02:00
Matthias Bisping	a89e374c67	removed obsolete code	2022-04-02 03:41:55 +02:00
Matthias Bisping	0861e22542	fixed pipeline not working with flask... model was loaded in external process, probably; known issue	2022-04-02 03:38:44 +02:00
Matthias Bisping	7827869af4	fixed logger's logging level	2022-04-02 02:58:30 +02:00
Matthias Bisping	613bba8cfc	...	2022-04-02 02:45:21 +02:00
Matthias Bisping	5c23898280	added log messages to all pipelien components; converting pipelien output to list for REST transport; refactoring; added e2e test (flask + pipeline)... but hangs	2022-04-02 02:44:30 +02:00
Matthias Bisping	e8d0299e46	refactoring	2022-04-02 01:27:30 +02:00
Matthias Bisping	cb00aed62c	refactoring	2022-04-02 01:23:57 +02:00
Matthias Bisping	1501653673	coverage increased for flask tests	2022-04-02 00:16:01 +02:00
Matthias Bisping	b4b929b65f	added mocked server tests with flask testing uitilities	2022-04-01 21:55:59 +02:00
Matthias Bisping	3d1c251e10	removed redundant TF env var export	2022-04-01 21:35:10 +02:00
Matthias Bisping	c80549d5d3	refactoring: model wrapper to base class and derived class for efficient net	2022-04-01 21:32:18 +02:00
Matthias Bisping	070749880e	removed obsolete code	2022-04-01 21:13:15 +02:00
Matthias Bisping	94783c54f2	eliminated redai dependency; updated requirement versions	2022-04-01 21:10:41 +02:00
Matthias Bisping	2b48c6108b	added coverage.process_startup for multiprocessing coverage... but does not quite work yet	2022-04-01 19:51:33 +02:00
Matthias Bisping	da9b3d0cb9	applied black	2022-04-01 19:50:44 +02:00
Matthias Bisping	c372529ee5	dynamic waiting for server to be ready in tests	2022-04-01 19:04:41 +02:00
Matthias Bisping	1a1ece1f95	adjusted call of server running function	2022-04-01 12:22:24 +02:00
Matthias Bisping	426061e5ea	applied black	2022-04-01 12:20:32 +02:00
Matthias Bisping	7c2cf44ad0	refactoring	2022-04-01 00:21:57 +02:00
Matthias Bisping	c125e1ff6c	web server refactoring + tests	2022-03-31 23:43:14 +02:00
Matthias Bisping	dd007891c7	changed banner	2022-03-31 19:50:12 +02:00
Matthias Bisping	d3257fdeda	refactoring	2022-03-31 19:39:08 +02:00
Matthias Bisping	1581880ec6	added updated version of serve.py	2022-03-31 19:38:35 +02:00
Matthias Bisping	268b83a1ff	refactoring	2022-03-31 19:17:48 +02:00
Matthias Bisping	5caa9807e2	added response formatter and pipeline test	2022-03-31 19:01:32 +02:00
Matthias Bisping	82added50a	empty implementation of abstract base class method	2022-03-31 17:29:05 +02:00
Matthias Bisping	b6ccfbcf8f	removed obsolete import	2022-03-31 17:25:42 +02:00
Matthias Bisping	e17912caa9	derived enum formatter from key formatter	2022-03-31 17:22:54 +02:00
Matthias Bisping	3eaf9dc0e1	refactoring: introduced key mapper base class and proba mapper key enum	2022-03-31 16:55:58 +02:00
Matthias Bisping	0cefef4e15	more test cases for key transformer	2022-03-31 16:35:12 +02:00
Matthias Bisping	4f94cbd68d	refactoring	2022-03-31 16:26:40 +02:00
Matthias Bisping	2517b45d44	fixed bug in camel case transformer	2022-03-31 15:55:15 +02:00
Matthias Bisping	2a62ad7aba	typo	2022-03-31 15:48:52 +02:00
Matthias Bisping	20c980dbe6	fixed bug in camel case transformer	2022-03-31 15:47:45 +02:00
Matthias Bisping	726298b155	made formatter a transformer derivation	2022-03-31 15:26:30 +02:00
Matthias Bisping	479afbcd34	formatting	2022-03-31 15:20:41 +02:00
Matthias Bisping	4ab9f0d89b	corrected camel case converter	2022-03-31 15:18:59 +02:00
Matthias Bisping	d4604a2cb5	renaming	2022-03-31 14:52:37 +02:00
Matthias Bisping	4ebb36247e	refactoring	2022-03-31 14:49:46 +02:00
Matthias Bisping	7ec7390e90	refactoring	2022-03-31 12:52:35 +02:00
Matthias Bisping	dc1cdde458	refactoring; added compositor for formatters	2022-03-31 12:52:15 +02:00
Matthias Bisping	0921ef9a4f	removed obsolete import	2022-03-31 11:12:59 +02:00
Matthias Bisping	91dd467142	applied black	2022-03-30 19:38:15 +02:00
Matthias Bisping	b3e1604ecc	added floating point conversion to label mapper for json serializability	2022-03-30 19:36:45 +02:00
Matthias Bisping	20718996bd	refactoring; testing of prediction model handel redai adapter	2022-03-30 19:01:54 +02:00
Matthias Bisping	cc8d87338c	removed obsolete code	2022-03-30 18:17:35 +02:00
Matthias Bisping	258c1ab02d	testing laberl mappers for raising of excpetions when encountering unexpected input formats	2022-03-30 18:15:45 +02:00
Matthias Bisping	ce3d33955e	removing unused code / refactoring for coverage maximization	2022-03-30 18:03:21 +02:00
Matthias Bisping	a95cc4e06b	added config tests	2022-03-30 17:55:49 +02:00
Matthias Bisping	6d1ace473b	removed obsolete code	2022-03-30 16:35:47 +02:00
Matthias Bisping	0a22a35912	refactoring; renaming	2022-03-30 16:35:26 +02:00
Matthias Bisping	a5d3232dd0	testing index and probability label format in classifier prediction test	2022-03-30 16:34:17 +02:00
Matthias Bisping	49f9847d9a	removed obsolete code	2022-03-30 16:07:45 +02:00
Matthias Bisping	1c6f5749dd	updated classifier test for label mappers	2022-03-30 16:04:13 +02:00
Matthias Bisping	8bccec277f	added array label mapper	2022-03-30 15:54:18 +02:00
Matthias Bisping	7f37f841dd	renaming	2022-03-30 15:32:21 +02:00
Matthias Bisping	8c7e3e29f5	added label mapper	2022-03-30 14:17:58 +02:00
Matthias Bisping	99d8e921db	renaming	2022-03-30 13:57:29 +02:00
Matthias Bisping	6835394d30	added formatter test; refactored batch_size fixture	2022-03-30 13:43:13 +02:00
Matthias Bisping	ad6bb80900	fixed sorting predictions by probabilities in wrong order	2022-03-30 01:14:03 +02:00
Matthias Bisping	95209a5c9d	typo	2022-03-30 01:06:06 +02:00
Matthias Bisping	45a07c620a	fixed chaining bug that lead to greedy evaluation	2022-03-30 00:53:34 +02:00
Matthias Bisping	81ab9a5f53	tuning prediction format handling	2022-03-30 00:13:12 +02:00
Matthias Bisping	8b15ac6df4	docstring update	2022-03-29 23:57:09 +02:00
Matthias Bisping	e9489287bd	support for array prediction format	2022-03-29 23:56:22 +02:00
Matthias Bisping	15c0b73034	support for different prediction formats	2022-03-29 23:41:43 +02:00
Matthias Bisping	7a64af156b	refactoring	2022-03-29 22:59:01 +02:00
Matthias Bisping	60617fd622	added formatter to pipeline	2022-03-29 22:47:54 +02:00
Matthias Bisping	ade318c7b7	made classifier accept tupls of images in addition to np.arrays; added pipeline (wip)	2022-03-29 22:00:34 +02:00
Matthias Bisping	3339ed2eab	removed unneeded adapter derivatives and made estimator adapter abstract base class to normal class	2022-03-29 20:44:26 +02:00
Matthias Bisping	7340fb6dda	replaced string keys for metadata fields with enum members	2022-03-29 20:29:44 +02:00
Matthias Bisping	358d7ecd91	restructuring of modules	2022-03-29 20:02:40 +02:00
Matthias Bisping	d33a882d65	removed obsolete code	2022-03-29 19:54:14 +02:00
Matthias Bisping	06adedac57	reimplemented model loader logic and moved base weights into mlflow run dir	2022-03-29 19:50:43 +02:00
Matthias Bisping	edbc5c3f84	redoing model loading design	2022-03-29 18:21:14 +02:00
Matthias Bisping	f60bafd007	redoing model loading design	2022-03-29 17:25:06 +02:00
Matthias Bisping	a1c7dd4a8d	added identity preprocessor; changed default preprocessor to idenitity	2022-03-29 11:40:58 +02:00
Matthias Bisping	6b58756103	refactoring of mlflow model loader	2022-03-29 11:02:43 +02:00
Matthias Bisping	3b4c2a40b2	added patched test for mlflow model loader	2022-03-28 21:51:21 +02:00
Matthias Bisping	c06905625d	added model loader interface, model loader mock and mlflow model loader (the latter so far not tested)	2022-03-28 21:22:35 +02:00
Matthias Bisping	d44622dddc	test parametrization changed	2022-03-28 19:52:24 +02:00
Matthias Bisping	3c6dfed508	made input size adjustable via test fixture	2022-03-28 19:22:31 +02:00
Matthias Bisping	f18e183ab0	added type hint	2022-03-28 18:54:28 +02:00
Matthias Bisping	86f2abc553	renaming	2022-03-28 18:52:39 +02:00
Matthias Bisping	f0a8f2224c	refactoring	2022-03-28 18:50:18 +02:00
Matthias Bisping	9bf1dcbe1d	removed obsolete import	2022-03-28 18:31:09 +02:00
Matthias Bisping	9ce7b6e6da	refactoring	2022-03-28 18:30:51 +02:00
Matthias Bisping	e818b05472	applied black	2022-03-28 16:39:34 +02:00
Matthias Bisping	b818ee4724	fixed misaligned metadata and images	2022-03-28 16:38:46 +02:00
Julius Unverfehrt	9461be29d5	add ParsablePDFImageExtractor test	2022-03-28 15:42:54 +02:00
Julius Unverfehrt	2631eb5c0f	add metadata fixture	2022-03-28 12:05:07 +02:00
Matthias Bisping	643ab99bd3	added parsable pdf image extractor	2022-03-28 11:27:05 +02:00
Matthias Bisping	e0ab365bb9	list -> generator	2022-03-28 00:05:37 +02:00
Matthias Bisping	48737d9439	added extractor classifier	2022-03-28 00:01:19 +02:00
Matthias Bisping	a5147c9a58	added image extractor interface and mock	2022-03-27 23:05:27 +02:00
Matthias Bisping	4c939464b0	renaming	2022-03-27 22:59:28 +02:00
Matthias Bisping	334dc79f7e	refactoring	2022-03-27 18:13:58 +02:00
Matthias Bisping	9d58ae714f	renaming	2022-03-27 17:55:01 +02:00
Matthias Bisping	0f811bdc56	removed unnecessary kwarg	2022-03-27 01:24:29 +01:00
Matthias Bisping	d11333981f	applied black	2022-03-27 01:21:12 +01:00
Matthias Bisping	4fcd1e79d3	removed obsolete code; added missing __init__ for predictor	2022-03-27 01:20:03 +01:00
Matthias Bisping	5c5d132d7f	fixed batching issue in prediction monkey patch by introducinbg an output generator, that yields the expected predictions	2022-03-27 01:13:28 +01:00
Matthias Bisping	0f9510906d	refactoring; added predictor; mocking of predict function is broken: fixing next commit	2022-03-26 21:19:02 +01:00
Matthias Bisping	6343229c1e	added chunk_iterable tests	2022-03-26 20:24:59 +01:00
Matthias Bisping	7d21b0a585	refactoring	2022-03-26 19:54:18 +01:00
Matthias Bisping	364111db89	preprocessor refactoring	2022-03-26 19:38:34 +01:00
Matthias Bisping	ea298dacfa	renaming	2022-03-26 19:27:37 +01:00
Matthias Bisping	373c619b0c	formatting	2022-03-26 19:24:34 +01:00
Matthias Bisping	8aa0717007	added image-tensor conversion logic	2022-03-26 19:24:15 +01:00
Matthias Bisping	a3215e0bc3	renaming of service estimator to estimator	2022-03-25 18:24:05 +01:00
Matthias Bisping	c64bff0843	renaming of service estimator to estimator	2022-03-25 18:20:44 +01:00
Matthias Bisping	dd18087261	restructuring of modules	2022-03-25 18:18:17 +01:00
Matthias Bisping	d97b477208	added estimator preprocessor and removed adapter and adapter patch	2022-03-25 18:09:06 +01:00
Matthias Bisping	981d7816a0	refactoring: replaced estimator adapter with monkeypatch	2022-03-25 17:58:34 +01:00
Matthias Bisping	2e36a9d46d	added type hint	2022-03-25 16:28:17 +01:00
Matthias Bisping	03f269c2d7	fixed incorrect pycharme-refactoring	2022-03-25 16:28:00 +01:00
Matthias Bisping	6853d862ed	added comment motivating the implementation of the predict function of the adapter patch	2022-03-25 15:02:02 +01:00
Matthias Bisping	31591bef0f	suppress tf-internal deprication warning	2022-03-25 14:56:47 +01:00
Matthias Bisping	7834a65ff5	added keras estimator wrapper	2022-03-25 14:46:04 +01:00
Matthias Bisping	8b7293be09	introduced estimator-adapter and estimator-adapter-patch	2022-03-25 13:35:03 +01:00
Matthias Bisping	9c9070e8bf	refactoring	2022-03-25 12:24:23 +01:00
Matthias Bisping	e8fb01b4b7	formatting	2022-03-25 11:49:02 +01:00
Matthias Bisping	41f0cc8a41	estimator + model label mapping	2022-03-25 11:42:31 +01:00
Matthias Bisping	ee959346b7	refactoring: estimator + model	2022-03-25 11:23:07 +01:00