Pull request #20: New pyinfra

Merge in RR/cv-analysis from new_pyinfra to master Squashed commit of the following: commit f7a01a90aad1c402ac537de5bdf15df628ad54df Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Wed Jul 27 10:40:59 2022 +0200 fix typo commit ff4d549fac5b612c2d391ae85823c5eca1e91916 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Wed Jul 27 10:34:04 2022 +0200 adjust build scripts for new pyinfra commit ecd70f60d46406d8b6cc7f36a1533d706c917ca8 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Wed Jul 27 09:42:55 2022 +0200 simplify logging by using default configurations commit 20193c14c940eed2b0a7a72058167e26064119d0 Author: Julius Unverfehrt <julius.unverfehrt@iqser.com> Date: Tue Jul 26 17:16:57 2022 +0200 tidy-up, refactor config logic to not dependent on external files commit d8069cd4d404a570bb04a04278161669d1c83332 Author: Isaac Riley <Isaac.Riley@iqser.com> Date: Tue Jul 26 15:14:59 2022 +0200 update pyinfra commit c3bc11037cca9baf016043ab997c566f5b4a2586 Author: Isaac Riley <Isaac.Riley@iqser.com> Date: Tue Jul 26 15:09:14 2022 +0200 repair tests commit 6f4e4f2863ee16ae056c1d432f663858c5f10221 Author: Isaac Riley <Isaac.Riley@iqser.com> Date: Tue Jul 26 14:52:38 2022 +0200 updated server logic to work with new pyinfra; update scripts for pyinfra as submodule commit 2a18dba81de5ee84d0bdf0e77f478693e8d8aef4 Author: Isaac Riley <Isaac.Riley@iqser.com> Date: Tue Jul 26 14:10:41 2022 +0200 formatting commit d87ce9328de9aa2341228af9b24473d5e583504e Author: Isaac Riley <Isaac.Riley@iqser.com> Date: Tue Jul 26 14:10:11 2022 +0200 make server logic compatible with new pyinfra
2022-07-27 10:50:10 +02:00 · 2022-07-27 10:50:10 +02:00 · 9d98945ff9
commit 9d98945ff9
parent 1618909d8e
31 changed files with 329 additions and 301 deletions
--- a/29
+++ b/29
@ -1,20 +1,25 @@
-ARG BASE_ROOT="nexus.iqser.com:5001/red/"
-ARG VERSION_TAG=latest
+FROM python:3.10

-FROM ${BASE_ROOT}cv-analysis-base:${VERSION_TAG}
+RUN python -m venv /app/venv
+ENV PATH="/app/venv/bin:$PATH"
+
+RUN python -m pip install --upgrade pip

 WORKDIR /app/service

+COPY ./requirements.txt ./requirements.txt
+RUN python3 -m pip install -r requirements.txt
+
+COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
+RUN python -m pip install -r incl/pyinfra/requirements.txt
+
+COPY ./incl ./incl
+RUN python3 -m pip install -e incl/pyinfra
+
 COPY ./src ./src
-COPY cv_analysis ./cv_analysis
-COPY config.yaml ./config.yaml
+COPY ./cv_analysis ./cv_analysis
+COPY ./setup.py ./setup.py

-RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install -e .

-WORKDIR /app/service
-
-EXPOSE 5000
-EXPOSE 8080
-
-CMD ["python3", "src/serve.py"]
+CMD ["python3", "-u", "src/serve.py"]
--- a/31
+++ b/31
@ -1,31 +0,0 @@
-FROM python:3.10 as builder1
-
-# Use a virtual environment.
-RUN python -m venv /app/venv
-ENV PATH="/app/venv/bin:$PATH"
-
-# Upgrade pip.
-RUN python -m pip install --upgrade pip
-
-# Make a directory for the service files and copy the service repo into the container.
-WORKDIR /app/service
-COPY . ./
-
-# Install dependencies.
-RUN python3 -m pip install -r requirements.txt
-RUN python3 -m pip install -r incl/pyinfra/requirements.txt
-RUN python3 -m pip install -e incl/pyinfra
-
-# Make a new container and copy all relevant files over to filter out temporary files
-# produced during setup to reduce the final container's size.
-FROM python:3.10
-
-WORKDIR /app/
-COPY --from=builder1  /app .
-ENV PATH="/app/venv/bin:$PATH"
-
-WORKDIR /app/service
-
-RUN apt update
-#RUN apt install python3-opencv-headless 
-RUN apt install poppler-utils --yes
--- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java
+++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java
@ -34,7 +34,6 @@ import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
 public class PlanSpec {

    private static final String SERVICE_NAME = "cv-analysis";
-    private static final String SERVICE_NAME_BASE = "cv-analysis-base";

    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");

@ -72,7 +71,7 @@ public class PlanSpec {
    return new Plan(
            project(),
            SERVICE_NAME, new BambooKey(SERVICE_KEY))
-            .description("Docker build for cv-analysis.")
+//             .description("Docker build for cv-analysis.")
            // .variables()
            .stages(new Stage("Build Stage")
              .jobs(
@ -84,9 +83,6 @@ public class PlanSpec {
                    new VcsCheckoutTask()
                        .description("Checkout default repository.")
                        .checkoutItems(new CheckoutItem().defaultRepository()),
-                    new VcsCheckoutTask()
-                        .description("Checkout pyinfra research repository.")
-                        .checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")),
                    new ScriptTask()
                        .description("Set config and keys.")
                        .inlineBody("mkdir -p ~/.ssh\n" +
@ -98,10 +94,10 @@ public class PlanSpec {
                        .description("Build Docker container.")
                        .location(Location.FILE)
                        .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
-                        .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
+                        .argument(SERVICE_NAME))
                  .dockerConfiguration(
                      new DockerConfiguration()
-                        .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
+                        .image("nexus.iqser.com:5001/infra/release_build:4.5.0")
                        .volume("/var/run/docker.sock", "/var/run/docker.sock")),
                new Job("Sonar Job", new BambooKey("SONAR"))
                  .tasks(
@ -111,9 +107,6 @@ public class PlanSpec {
                    new VcsCheckoutTask()
                        .description("Checkout default repository.")
                        .checkoutItems(new CheckoutItem().defaultRepository()),
-                    new VcsCheckoutTask()
-                        .description("Checkout pyinfra research repository.")
-                        .checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")),
                    new ScriptTask()
                        .description("Set config and keys.")
                        .inlineBody("mkdir -p ~/.ssh\n" +
@ -134,6 +127,9 @@ public class PlanSpec {
              .jobs(
                new Job("Git Tag Job", new BambooKey("GITTAG"))
                  .tasks(
+                    new CleanWorkingDirectoryTask()
+                        .description("Clean working directory.")
+                        .enabled(true),
                    new VcsCheckoutTask()
                        .description("Checkout default repository.")
                        .checkoutItems(new CheckoutItem().defaultRepository()),
@ -152,7 +148,7 @@ public class PlanSpec {
                        .defaultRepository())
                .dockerConfiguration(
                    new DockerConfiguration()
-                        .image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
+                        .image("nexus.iqser.com:5001/infra/release_build:4.5.0")),
                new Job("Licence Job", new BambooKey("LICENCE"))
                  .enabled(false)
                  .tasks(
@ -169,7 +165,6 @@ public class PlanSpec {
                        .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                        .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
-            .linkedRepositories("RR / pyinfra")
            .triggers(new BitbucketServerTrigger())
            .planBranchManagement(new PlanBranchManagement()
              .createForVcsBranch()
--- a/bamboo-specs/src/main/resources/scripts/docker-build.sh
+++ b/bamboo-specs/src/main/resources/scripts/docker-build.sh
@ -2,8 +2,7 @@
 set -e

 SERVICE_NAME=$1
-SERVICE_NAME_BASE=$2
-# TODO version tag on master push
+
 python3 -m venv build_venv
 source build_venv/bin/activate
 python3 -m pip install --upgrade pip
@ -13,7 +12,6 @@ pip install 'dvc[ssh]'
 dvc pull

 echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
-docker build -f Dockerfile_base  -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
-docker build -f Dockerfile  -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
 echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
+docker build -f Dockerfile  -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
 docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
--- a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
+++ b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh
@ -7,17 +7,20 @@ python3 -m venv build_venv
 source build_venv/bin/activate
 python3 -m pip install --upgrade pip

-pip install -e .
-pip install -e incl/pyinfra
+echo "dev setup for unit test and coverage"

+pip install -e incl/pyinfra
 pip install -r incl/pyinfra/requirements.txt
+
+pip install -e .
 pip install -r requirements.txt

+
 echo "DVC pull step"
 dvc pull

 echo "coverage calculation"
-coverage run -m pytest test
+coverage run -m pytest
 echo "coverage report generation"
 coverage report -m
 coverage xml
@ -28,7 +31,7 @@ echo "dependency-check:aggregate"
 mkdir -p reports
 dependency-check --enableExperimental -f JSON -f HTML -f XML \
  --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
-  --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
+  --exclude "build_venv/**" --exclude "**/__pycache__/**"

 if [[ -z "${bamboo_repository_pr_key}" ]]
 then
--- a/config.yaml
+++ b/config.yaml
@ -1,12 +0,0 @@
-service:
-  logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for log file messages
-  monitoring_enabled: $MONITORING_ENABLED|True # if app is doing monitoring or not
-  logfile_path: $LOGFILE_PATH|null  # Overwrites the default path for the service logfile (image_service/log.log)
-
-webserver:
-  host: $SERVER_HOST|"127.0.0.1"  # webserver address
-  port: $SERVER_PORT|5000  # webserver port
-
-visual_logging:
-  level: DISABLED  # NOTHING > INFO > DEBUG > ALL
-  output_folder: /tmp/debug/
--- a/cv_analysis/config.py
+++ b/cv_analysis/config.py
@ -1,39 +1,30 @@
-"""Implements a config object with dot-indexing syntax."""
+import os


-from envyaml import EnvYAML
-from cv_analysis.locations import CONFIG_FILE
-
-
-def _get_item_and_maybe_make_dotindexable(container, item):
-    ret = container[item]
-    return DotIndexable(ret) if isinstance(ret, dict) else ret
-
-
-class DotIndexable:
-    def __init__(self, x):
-        self.x = x
-
-    def __getattr__(self, item):
-        return _get_item_and_maybe_make_dotindexable(self.x, item)
-
-    def __setitem__(self, key, value):
-        self.x[key] = value
-
-    def __repr__(self):
-        return self.x.__repr__()
+def get_config():
+    return Config()


 class Config:
-    def __init__(self, config_path):
-        self.__config = EnvYAML(config_path)
+    def __init__(self):
+        self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")

-    def __getattr__(self, item):
-        if item in self.__config:
-            return _get_item_and_maybe_make_dotindexable(self.__config, item)
+        # visual_logging_level: NOTHING > INFO > DEBUG > ALL
+        self.visual_logging_level = "DISABLED"
+        self.visual_logging_output_folder = "/tmp/debug"

-    def __getitem__(self, item):
-        return self.__getattr__(item)
+        # locations
+        # FIXME: is everything here necessary?
+        root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        self.dvc_data_dir = os.path.join(root, "data")
+        self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing")
+        self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing")
+        self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected")
+        self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp")
+        self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed")
+        self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv")
+        self.test_dir = os.path.join(root, "test")
+        self.test_data_dir = os.path.join(self.test_dir, "test_data")

-
-CONFIG = Config(CONFIG_FILE)
+    def __getitem__(self, key):
+        return self.__getattribute__(key)
--- a/cv_analysis/locations.py
+++ b/cv_analysis/locations.py
@ -1,22 +0,0 @@
-"""Defines constant paths relative to the module root path."""
-
-
-from os import path
-
-MODULE_DIR = path.dirname(path.abspath(__file__))
-PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR)
-
-CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
-LOG_FILE = "/tmp/log.log"
-
-DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
-PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
-PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
-PNG_FIGURES_DETECTED = path.join(PNG_FOR_TESTING, "figures_detected")
-PNG_TABLES_DETECTED = path.join(PNG_FOR_TESTING, "tables_detected_by_tp")
-HASHED_PDFS_FOR_TESTING = path.join(PDF_FOR_TESTING, "hashed")
-METADATA_TESTFILES = path.join(DVC_DATA_DIR, "metadata_testing_files.csv")
-
-
-TEST_DIR = path.join(PACKAGE_ROOT_DIR, "test")
-TEST_DATA_DIR = path.join(TEST_DIR, "test_data")
--- a/cv_analysis/redaction_detection.py
+++ b/cv_analysis/redaction_detection.py
@ -2,7 +2,6 @@ from functools import partial

 import cv2
 import numpy as np
-import pdf2image
 from iteration_utilities import starfilter, first

 from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy
--- a/cv_analysis/server/pipeline.py
+++ b/cv_analysis/server/pipeline.py
@ -7,23 +7,20 @@ from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_d
 from cv_analysis.layout_parsing import parse_layout
 from cv_analysis.server.rotate import rotate_rectangle
 from cv_analysis.table_parsing import parse_tables
-from cv_analysis.utils.logging import get_logger
 from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
 from cv_analysis.utils.structures import Rectangle

-logger = get_logger()
-

 def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
    """Make end-to-end pipeline to analyse a PDF with given analysis function.
    The pipeline returns a Generator of dicts containing page information and the analysis results.

    Steps:
-        Convert PDF to Arrays and page information
-        Analise pages, get list of bboxes per page (e.g. table cells)
+        Convert PDF to pairs of image and page information
+        Analyse pages, get list of bounding boxes per page (e.g. table cells)
        Convert pixel values to inches
        Rotate results if page is rotated
-        Format results to stream of dictionaries
+        Format results to stream of dictionaries with page information and analysis results
    """

    def pipeline(pdf: bytes, index=None):
--- a/cv_analysis/table_parsing.py
+++ b/cv_analysis/table_parsing.py
@ -1,16 +1,15 @@
 from functools import partial
 from itertools import chain, starmap
 from operator import attrgetter
+
 import cv2
 import numpy as np
-
 from funcy import lmap

+from cv_analysis.layout_parsing import parse_layout
 from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d
-
 from cv_analysis.utils.structures import Rectangle
 from cv_analysis.utils.visual_logging import vizlogger
-from cv_analysis.layout_parsing import parse_layout


 def add_external_contours(image, image_h_w_lines_only):
--- a/cv_analysis/utils/display.py
+++ b/cv_analysis/utils/display.py
@ -1,4 +1,3 @@
-from numpy import resize
 import cv2
 from matplotlib import pyplot as plt

--- a/cv_analysis/utils/logging.py
+++ b/cv_analysis/utils/logging.py
@ -1,26 +0,0 @@
-"""Defines the default logger for the service."""
-import sys
-import logging
-
-from cv_analysis.config import CONFIG
-
-
-def make_logger_getter():
-    logger = logging.getLogger(__name__)
-    logger.setLevel(logging.getLevelName(CONFIG.service.logging_level))
-    formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S")
-
-    ch = logging.StreamHandler(sys.stdout)
-    ch.setLevel(logging.getLevelName(CONFIG.service.logging_level))
-    ch.setFormatter(formatter)
-
-    logger.addHandler(ch)
-    logger.propagate = False
-
-    def get_logger():
-        return logger
-
-    return get_logger
-
-
-get_logger = make_logger_getter()
--- a/cv_analysis/utils/pdf2image.py
+++ b/cv_analysis/utils/pdf2image.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from functools import partial
-from typing import Iterator, Tuple
+from typing import Iterator

 import fitz
 import numpy as np
@ -14,7 +14,8 @@ class ImageMetadataPair:

 def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
    """Streams PDF as pairs of image (matrix) and metadata.
-    Note: If Index is not given or evaluates to None, the whole PDF will be processed."""
+    Note: If Index is not given or evaluates to None, the whole PDF will be processed.
+    """
    convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
    yield from map(convert_fn, stream_pages(pdf, index))

--- a/cv_analysis/utils/visual_logging.py
+++ b/cv_analysis/utils/visual_logging.py
@ -1,7 +1,10 @@
 import os
-from cv_analysis.config import CONFIG
+
+from cv_analysis.config import get_config
 from cv_analysis.utils.display import save_image

+CV_CONFIG = get_config()
+

 class VisualLogger:
    def __init__(self, level, output_folder):
@ -36,4 +39,4 @@ class VisualLogger:
        return self.level == "ALL"


-vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder)
+vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder)
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@ -0,0 +1,31 @@
+# Compose stack with two services: a MinIO object store and a RabbitMQ broker.
+version: '2'
+services:
+  minio:
+    image: minio/minio
+    ports:
+      - "9000:9000"
+    # NOTE(review): root credentials are hard-coded here; presumably for local
+    # development only - confirm this file is never used for a shared deployment.
+    environment:
+      - MINIO_ROOT_PASSWORD=password
+      - MINIO_ROOT_USER=root
+    volumes:
+      - ./data/minio_store:/data
+    command: server /data
+    network_mode: "bridge"
+  rabbitmq:
+    image: docker.io/bitnami/rabbitmq:3.9
+    ports:
+      - '4369:4369'
+      - '5551:5551'
+      - '5552:5552'
+      - '5672:5672'
+      - '25672:25672'
+      - '15672:15672'
+    environment:
+      - RABBITMQ_SECURE_PASSWORD=yes
+      - RABBITMQ_VM_MEMORY_HIGH_WATERMARK=100%
+      - RABBITMQ_DISK_FREE_ABSOLUTE_LIMIT=20Gi
+    network_mode: "bridge"
+    # NOTE(review): the short volume syntax is HOST:CONTAINER, so this mounts the host
+    # path /opt/bitnami/rabbitmq/.rabbitmq/ at /data/bitnami - confirm the order is intended.
+    volumes:
+      - /opt/bitnami/rabbitmq/.rabbitmq/:/data/bitnami
+# NOTE(review): the named volume `mdata` is declared but not referenced by either service above.
+volumes:
+  mdata:
--- a/incl/pyinfra
+++ b/incl/pyinfra
@ -1 +1 @@
-Subproject commit 7e948a4cf05a3ef59fcc7e8719fcf910adc73864
+Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9
--- a/pytest.ini
+++ b/pytest.ini
@ -1,2 +1,4 @@
 [pytest]
 norecursedirs = incl
+testpaths = test
+addopts = --ignore=data
--- a/scripts/export_example_pages.py
+++ b/scripts/export_example_pages.py
@ -1,11 +1,15 @@
 import hashlib
+import json
 import os
+from itertools import chain
 from os import path
+
 import pandas as pd
 from pdf2image import convert_from_path
-from itertools import chain
-import json
-from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS_FOR_TESTING
+
+from cv_analysis.config import get_config
+
+CV_CONFIG = get_config()


 def read_json(path):
@ -22,7 +26,7 @@ def collect_metadata(example_pages, save=False):
    metadata = list(chain.from_iterable(metadata))
    if save:
        df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
-        df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv"))
+        df.to_csv(path.join(CV_CONFIG.dvc_data_dir, "metadata_testing_files.csv"))
    else:
        return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])

@ -46,7 +50,7 @@ def make_metadata_entry_maker():


 def split_pdf(example_pages):
-    dir_path = PDF_FOR_TESTING
+    dir_path = CV_CONFIG.pdf_for_testing
    i = 0
    for name, document_sections in example_pages.items():
        for pages in document_sections:
@ -54,7 +58,7 @@ def split_pdf(example_pages):
                pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]
            )
            for image in images:
-                fp = path.join(PNG_FOR_TESTING, f"fig_table{i:0>3}.png")
+                fp = path.join(CV_CONFIG.png_for_testing, f"fig_table{i:0>3}.png")
                image.save(fp=fp, dpi=(300, 300))
                i += 1

@ -74,7 +78,7 @@ def find_hash(file_path):

 def rename_files_with_hash(example_pages):
    files_to_rename = list(example_pages.keys())
-    folder = HASHED_PDFS_FOR_TESTING
+    folder = CV_CONFIG.hashed_pdfs_for_testing

    # Iterate through the folder
    for file in os.listdir(folder):
@ -99,7 +103,7 @@ def rename_files_with_hash(example_pages):


 def main():
-    examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
+    examples_pages = read_json(path.join(CV_CONFIG.test_data_dir, "example_pages.json"))
    rename_files_with_hash(examples_pages)
    # collect_metadata(examples_pages, save=True)
    # split_pdf(examples_pages)
--- a/scripts/manage_minio.py
+++ b/scripts/manage_minio.py
@ -0,0 +1,68 @@
+import argparse
+import gzip
+import os
+from pathlib import Path
+
+from tqdm import tqdm
+
+from pyinfra.config import get_config
+from pyinfra.storage.storage import get_s3_storage
+
+CONFIG = get_config()
+
+
+def parse_args():
+    """Parse the command line of the MinIO management script.
+
+    The chosen sub-command is stored in ``command``:
+      * ``add <dossier_id>`` together with exactly one of ``--file/-f`` or ``--directory/-d``
+      * ``purge`` (no further arguments)
+    """
+    cli = argparse.ArgumentParser()
+    commands = cli.add_subparsers(help="sub-command help", dest="command")
+
+    add_cmd = commands.add_parser("add", help="Add file(s) to the MinIO store")
+    add_cmd.add_argument("dossier_id")
+    # A single file and a whole directory are mutually exclusive; one of them is required.
+    source = add_cmd.add_mutually_exclusive_group(required=True)
+    source.add_argument("--file", "-f")
+    source.add_argument("--directory", "-d")
+
+    commands.add_parser("purge", help="Delete all files and buckets in the MinIO store")
+
+    return cli.parse_args()
+
+
+def combine_dossier_id_and_file_id_and_extension(dossier_id, file_id, extension):
+    """Build the object key ``<dossier_id>/<file_id><extension>``.
+
+    ``extension`` is appended as-is, so it has to bring its own leading dot.
+    """
+    return "{}/{}{}".format(dossier_id, file_id, extension)
+
+
+def add_file_compressed(storage, bucket_name, dossier_id, path) -> None:
+    """Gzip a local file and upload it to ``bucket_name`` under ``dossier_id``.
+
+    The object key is ``<dossier_id>/<stem><suffix>``, where the suffix depends on the
+    type of the source file: ``.pdf`` -> ``.ORIGIN.pdf.gz``, ``.json`` -> ``.TEXT.json.gz``.
+
+    Args:
+        storage: object offering ``put_object(bucket_name, object_name, data)``.
+        bucket_name: bucket to upload into.
+        dossier_id: prefix of the object key.
+        path: local file to upload (``str`` or ``Path``).
+
+    Raises:
+        ValueError: if ``path`` has neither a ``.pdf`` nor a ``.json`` suffix.
+    """
+    suffix_by_type = {".pdf": ".ORIGIN.pdf.gz", ".json": ".TEXT.json.gz"}
+    source = Path(path)
+
+    suffix_gz = suffix_by_type.get(source.suffix)
+    if suffix_gz is None:
+        # Without this check an unsupported file left `suffix_gz` unbound and the
+        # function failed with an UnboundLocalError further down.
+        raise ValueError(
+            f"Unsupported file type {source.suffix!r} for {path}; expected one of {sorted(suffix_by_type)}"
+        )
+    path_gz = combine_dossier_id_and_file_id_and_extension(dossier_id, source.stem, suffix_gz)
+
+    with open(source, "rb") as f:
+        data = gzip.compress(f.read())
+    storage.put_object(bucket_name, path_gz, data)
+
+
+if __name__ == "__main__":
+
+    # Parse the command line first, so that `--help` and usage errors exit before the
+    # storage client is created or a bucket is made.
+    args = parse_args()
+
+    storage = get_s3_storage(CONFIG)
+    bucket_name = CONFIG.storage_bucket
+
+    # Both sub-commands operate on the configured bucket; create it if it is missing.
+    if not storage.has_bucket(bucket_name):
+        storage.make_bucket(bucket_name)
+
+    if args.command == "add":
+
+        if args.file:
+            add_file_compressed(storage, bucket_name, args.dossier_id, args.file)
+
+        elif args.directory:
+            # Upload every entry of the directory, with a progress bar.
+            for fname in tqdm([*os.listdir(args.directory)], desc="Adding files"):
+                path = Path(args.directory) / fname
+                add_file_compressed(storage, bucket_name, args.dossier_id, path)
+
+    elif args.command == "purge":
+        storage.clear_bucket(bucket_name)
--- a/scripts/publish_requests.py
+++ b/scripts/publish_requests.py
@ -0,0 +1,84 @@
+import argparse
+import json
+
+import pika
+
+from pyinfra.config import get_config
+from pyinfra.storage.storage import get_s3_storage
+
+CONFIG = get_config()
+
+
+def parse_args():
+    """Read the mandatory ``--operation/-o`` option (``table``, ``layout`` or ``figure``)."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--operation", "-o", choices=["table", "layout", "figure"], required=True)
+    return cli.parse_args()
+
+
+def read_connection_params():
+    """Assemble the pika connection parameters from the RabbitMQ entries of ``CONFIG``."""
+    login = pika.PlainCredentials(CONFIG.rabbitmq_username, CONFIG.rabbitmq_password)
+    settings = dict(
+        host=CONFIG.rabbitmq_host,
+        port=CONFIG.rabbitmq_port,
+        # The heartbeat is converted with int() before being handed to pika.
+        heartbeat=int(CONFIG.rabbitmq_heartbeat),
+        credentials=login,
+    )
+    return pika.ConnectionParameters(**settings)
+
+
+def make_channel(connection) -> pika.adapters.blocking_connection.BlockingChannel:
+    """Open a channel on ``connection`` and set its QoS to ``prefetch_count=1``."""
+    opened = connection.channel()
+    opened.basic_qos(prefetch_count=1)
+    return opened
+
+
+def declare_queue(channel, queue: str):
+    """Declare ``queue`` as durable and not auto-deleted, with dead-letter arguments.
+
+    The dead-letter exchange is the empty string and the dead-letter routing key is
+    ``CONFIG.dead_letter_queue``. Returns whatever ``channel.queue_declare`` returns.
+    """
+    dead_letter_args = {
+        "x-dead-letter-exchange": "",
+        "x-dead-letter-routing-key": CONFIG.dead_letter_queue,
+    }
+    return channel.queue_declare(queue=queue, durable=True, auto_delete=False, arguments=dead_letter_args)
+
+
+def make_connection() -> pika.BlockingConnection:
+    """Open a blocking RabbitMQ connection with the parameters from ``read_connection_params``."""
+    return pika.BlockingConnection(read_connection_params())
+
+
+def build_message_bodies(operation, bucket_name):
+    """Yield one JSON-encoded request body per uploaded PDF found in ``bucket_name``.
+
+    Only objects named ``<dossier_id>/<file_id>.ORIGIN.pdf.gz`` are turned into requests,
+    because that is the ``targetFileExtension`` announced in every message. Other objects
+    are skipped, including ones that merely contain "pdf" somewhere in their name.
+
+    Args:
+        operation: analysis operation to request; also used (upper-cased) to build the
+            response file extension.
+        bucket_name: bucket whose objects are listed.
+
+    Yields:
+        bytes: the UTF-8 encoded JSON message.
+
+    Raises:
+        ValueError: if a matching object name has no ``/`` separating dossier and file id.
+    """
+    target_extension = "ORIGIN.pdf.gz"
+    target_suffix = "." + target_extension
+
+    storage = get_s3_storage(CONFIG)
+    # The first element of each listed pair is ignored so that the `bucket_name`
+    # argument is not shadowed by the loop variable.
+    for _, object_name in storage.get_all_object_names(bucket_name):
+        if not object_name.endswith(target_suffix):
+            continue
+        # Strip only the known suffix: cutting at the first "." truncated file ids
+        # that themselves contain dots.
+        dossier_id, file_id = object_name[: -len(target_suffix)].split("/", 1)
+        message_dict = {
+            "dossierId": dossier_id,
+            "fileId": file_id,
+            "targetFileExtension": target_extension,
+            "responseFileExtension": f"{operation.upper()}.json.gz",
+            "operation": operation,
+        }
+        yield json.dumps(message_dict).encode()
+
+
+def main(args):
+    """Publish one request per stored PDF, then print responses until the queue goes idle.
+
+    Steps:
+        Declare the request and response queues
+        Publish every body from ``build_message_bodies`` to the request queue
+        Print and acknowledge responses; stop at the first empty item from ``consume``
+        Close the channel and, in all cases, the connection
+
+    Args:
+        args: parsed command line; only ``args.operation`` is read.
+    """
+    connection = make_connection()
+    try:
+        channel = make_channel(connection)
+        declare_queue(channel, CONFIG.request_queue)
+        declare_queue(channel, CONFIG.response_queue)
+
+        for body in build_message_bodies(args.operation, CONFIG.storage_bucket):
+            channel.basic_publish("", CONFIG.request_queue, body)
+            print(f"Put {body} on {CONFIG.request_queue}")
+
+        # consume() is called with inactivity_timeout=1; an item without a body is
+        # taken as "no more responses" and ends the loop.
+        for method_frame, _, body in channel.consume(queue=CONFIG.response_queue, inactivity_timeout=1):
+            if not body:
+                break
+            print(f"Received {json.loads(body)}")
+            channel.basic_ack(method_frame.delivery_tag)
+        channel.close()
+    finally:
+        # Previously only the channel was closed (and only on success); the connection
+        # itself was never released.
+        if connection.is_open:
+            connection.close()
+
+
+if __name__ == "__main__":
+    main(parse_args())
--- a/scripts/pyinfra_mock.py
+++ b/scripts/pyinfra_mock.py
@ -1,64 +0,0 @@
-import argparse
-import gzip
-from operator import itemgetter
-from typing import List
-
-import fitz
-import pdf2image
-from funcy import lmap, compose, pluck
-
-from pyinfra.default_objects import get_component_factory
-
-from cv_analysis.config import CONFIG
-from incl.pyinfra.test.utils.image import image_to_bytes
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--pdf_path", "-p", required=True)
-    parser.add_argument("--operation", "-o", choices=["figure_detection", "table_parsing"], required=True)
-    parser.add_argument("--result_path", "-r", required=True)
-    args = parser.parse_args()
-
-    return args
-
-
-def request_metadatas(dpi, n_metadata):
-    return [{"dpi": dpi} for _ in range(1, n_metadata)]
-
-
-def draw_cells_on_page(cells: List[dict], page):
-    def format_xywh_to_x0y0x1y1(rect):
-        x, y, w, h = rect
-        return x, y, x + w, y + h
-
-    rects = map(itemgetter("x", "y", "width", "height"), cells)
-    rects = map(format_xywh_to_x0y0x1y1, rects)
-
-    for rect in rects:
-        page.draw_rect(rect, color=(0.3, 0.7, 0.1), width=2, overlay=True)
-
-
-def annotate_results_on_pdf(results, pdf_path, result_path):
-    opened_pdf = fitz.open(pdf_path)
-    metadata_per_page = pluck("metadata", results)
-
-    for page, metadata in zip(opened_pdf, metadata_per_page):
-        if metadata:
-            draw_cells_on_page(metadata["cells"], page)
-    opened_pdf.save(result_path)
-
-
-def main(args):
-    dpi = 200
-    images = lmap(compose(gzip.compress, image_to_bytes), pdf2image.convert_from_path(args.pdf_path, dpi=dpi))
-
-    submit_endpoint = f"http://{CONFIG.webserver.host}:{CONFIG.webserver.port}/{args.operation}"
-    pipeline = get_component_factory(CONFIG).get_pipeline(submit_endpoint)
-    results = list(pipeline(data=images, metadata=request_metadatas(dpi, len(images))))
-
-    annotate_results_on_pdf(results, args.pdf_path, args.result_path)
-
-
-if __name__ == "__main__":
-    main(parse_args())
--- a/scripts/show_compressed_json.py
+++ b/scripts/show_compressed_json.py
@ -0,0 +1,24 @@
+import argparse
+import gzip
+import json
+
+
+def parse_args():
+    """Parse the single positional argument: the path of the compressed JSON file."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("compressed_json_path", help="Path to compressed JSON file")
+    parsed = cli.parse_args()
+    return parsed
+
+
+def main(fp):
+    """Print the gzip-compressed JSON document stored at ``fp``, indented by two spaces."""
+    with open(fp, "rb") as handle:
+        compressed_bytes = handle.read()
+
+    document = json.loads(gzip.decompress(compressed_bytes))
+    pretty = json.dumps(document, indent=2)
+    print(pretty)
+
+
+if __name__ == "__main__":
+    main(parse_args().compressed_json_path)
--- a/setup/docker.sh
+++ b/setup/docker.sh
@ -1,13 +0,0 @@
-#!/bin/bash
-set -e
-
-python3 -m venv build_venv
-source build_venv/bin/activate
-python3 -m pip install --upgrade pip
-
-#pip install dvc
-#pip install 'dvc[ssh]'
-#dvc pull
-
-docker build -f Dockerfile_base -t cv-analysis-base .
-docker build -f Dockerfile -t cv-analysis .
--- a/src/serve.py
+++ b/src/serve.py
@ -1,35 +1,42 @@
+import gzip
+import json
 import logging
+from operator import itemgetter

-from waitress import serve
-
-from cv_analysis.config import CONFIG
-from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
-from cv_analysis.server.stream import make_streamable_analysis_fn
-from cv_analysis.table_parsing import parse_tables
+from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline
 from cv_analysis.utils.banner import make_art
-from cv_analysis.utils.logging import get_logger
-from incl.pyinfra.pyinfra.server.server import set_up_processing_server
+from pyinfra import config as pyinfra_config
+from pyinfra.queue.queue_manager import QueueManager
+from pyinfra.storage.storage import get_storage
+
+PYINFRA_CONFIG = pyinfra_config.get_config()
+
+logging.basicConfig(level=PYINFRA_CONFIG.logging_level_root)


-def main():
-    logger.info(make_art())
+def analysis_callback(queue_message: dict):

-    operation2function = {"table_parsing": parse_tables, "figure_detection": make_figure_detection_pipeline()}
-    operation2streamable_function = {op: make_streamable_analysis_fn(fn) for op, fn in operation2function.items()}
+    dossier_id, file_id, target_file_ext, response_file_ext, operation = itemgetter(
+        "dossierId", "fileId", "targetFileExtension", "responseFileExtension", "operation"
+    )(queue_message)
+    logging.info(f"Processing {dossier_id=}/{file_id=}, {operation=}.")
+    storage = get_storage(PYINFRA_CONFIG)
+    object_name = f"{dossier_id}/{file_id}.{target_file_ext}"
+    object_bytes = gzip.decompress(storage.get_object(PYINFRA_CONFIG.storage_bucket, object_name))
+    analysis_fn = make_analysis_pipeline(get_analysis_fn(operation))

-    server = set_up_processing_server(operation2streamable_function)
+    results = analysis_fn(object_bytes)
+    response = {**queue_message, "data": list(results)}
+    response = gzip.compress(json.dumps(response).encode())
+    response_name = f"{dossier_id}/{file_id}.{response_file_ext}"

-    serve(server, host=CONFIG.webserver.host, port=CONFIG.webserver.port, _quiet=False)
+    storage.put_object(PYINFRA_CONFIG.storage_bucket, response_name, response)
+    return {"dossierId": dossier_id, "fileId": file_id}


 if __name__ == "__main__":
-    logging.basicConfig(level=CONFIG.service.logging_level)

-    logging.getLogger("pillow").setLevel(logging.ERROR)
-    logging.getLogger("PIL").setLevel(logging.ERROR)
-    logging.getLogger("flask").setLevel(logging.ERROR)
-    logging.getLogger("urllib3").setLevel(logging.ERROR)
+    logging.info(make_art())

-    logger = get_logger()
-
-    main()
+    queue_manager = QueueManager(PYINFRA_CONFIG)
+    queue_manager.start_consuming(analysis_callback)
--- a/test/fixtures/figure_detection.py
+++ b/test/fixtures/figure_detection.py
@ -8,15 +8,11 @@ from lorem_text import lorem
 from cv_analysis.figure_detection.figure_detection_pipeline import (
    make_figure_detection_pipeline,
 )
-from cv_analysis.utils.display import show_image


@pytest.fixture
-def page_with_images(random_image, n_images, background):
-    # page_image = Image.fromarray(background.astype("uint8")).convert("RGB")
-    page_image = paste_image(page_image, random_image, (200, 200))
-    if n_images == 2:
-        page_image = paste_image(page_image, random_image, (1000, 2600))
+def page_with_images(random_image, background):
+    """Return the result of pasting random_image onto background as a NumPy array.
+
+    Calls paste_image(background, random_image, (200, 200)); the tuple is
+    presumably the paste position - confirm against paste_image.
+    """
+    page_image = paste_image(background, random_image, (200, 200))
    return np.array(page_image)


--- a/test/fixtures/server.py
+++ b/test/fixtures/server.py
@ -7,7 +7,6 @@ import pytest
 from funcy import first

 from cv_analysis.utils.structures import Rectangle
-from incl.pyinfra.pyinfra.server.packing import bytes_to_string


@pytest.fixture
@ -20,7 +19,7 @@ def random_image_as_bytes_and_compressed(random_image):

@pytest.fixture
 def random_image_metadata_package(random_image_as_bytes_and_compressed):
-    data = bytes_to_string(random_image_as_bytes_and_compressed)
+    data = random_image_as_bytes_and_compressed.decode()
    return [
        {
            "data": data,
--- a/test/fixtures/table_parsing.py
+++ b/test/fixtures/table_parsing.py
@ -1,24 +1,27 @@
 import json
 from os.path import join
+
 import cv2
 import pytest
 from funcy import first

-from cv_analysis.locations import TEST_DATA_DIR
+from cv_analysis.config import get_config
 from cv_analysis.utils.draw import draw_rectangles
 from cv_analysis.utils.open_pdf import open_pdf
 from test.fixtures.figure_detection import paste_text

+CV_CONFIG = get_config()
+

@pytest.fixture
 def client_page_with_table(test_file_index):
+    """Return the first item produced by open_pdf for "test<test_file_index>.png".
+
+    The file is looked up in CV_CONFIG.test_data_dir.
+    """
-    img_path = join(TEST_DATA_DIR, f"test{test_file_index}.png")
+    img_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.png")
    return first(open_pdf(img_path))


@pytest.fixture
 def expected_table_annotation(test_file_index):
+    """Load and return the parsed JSON content of "test<test_file_index>.json".
+
+    The file is looked up in CV_CONFIG.test_data_dir.
+    """
-    json_path = join(TEST_DATA_DIR, f"test{test_file_index}.json")
+    json_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.json")
    with open(json_path) as f:
        return json.load(f)

--- a/test/unit_tests/figure_detection/figure_detection_pipeline_test.py
+++ b/test/unit_tests/figure_detection/figure_detection_pipeline_test.py
@ -14,7 +14,6 @@ class TestFindPrimaryTextRegions:
        assert not list(results)

    @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
-    @pytest.mark.parametrize("n_images", [1])
    def test_page_without_text_yields_figures(self, figure_detection_pipeline, page_with_images, image_size):
        results = figure_detection_pipeline(page_with_images)
        result_figures_size = map(lambda x: (x.w, x.h), results)
@ -35,7 +34,6 @@ class TestFindPrimaryTextRegions:
        assert error <= error_tolerance

    @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
-    @pytest.mark.parametrize("n_images", [1, 2])
    @pytest.mark.parametrize("font_scale", [1, 1.5, 2])
    @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
    @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))
@ -45,13 +43,12 @@ class TestFindPrimaryTextRegions:
        figure_detection_pipeline,
        page_with_images_and_text,
        image_size,
-        n_images,
        error_tolerance,
    ):
        results = list(figure_detection_pipeline(page_with_images_and_text))

        result_figures_area = sum(map(lambda x: (x.w * x.h), results))
-        expected_figure_area = n_images * prod(image_size)
+        expected_figure_area = prod(image_size)

        error = abs(result_figures_area - expected_figure_area) / expected_figure_area

--- a/test/unit_tests/figure_detection/text_test.py
+++ b/test/unit_tests/figure_detection/text_test.py
@ -6,7 +6,6 @@ from cv_analysis.figure_detection.text import (
    remove_primary_text_regions,
    apply_threshold_to_image,
 )
-from cv_analysis.utils.display import show_image
 from test.utils.utils import powerset


@ -19,7 +18,6 @@ class TestFindPrimaryTextRegions:
        np.testing.assert_equal(result_page, apply_threshold_to_image(background))

    @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
-    @pytest.mark.parametrize("n_images", [1, 2])
    def test_page_without_text_keeps_images(self, page_with_images, error_tolerance):
        result_page = remove_primary_text_regions(page_with_images)
        np.testing.assert_equal(result_page, apply_threshold_to_image(page_with_images))
@ -33,7 +31,6 @@ class TestFindPrimaryTextRegions:
        assert relative_error <= error_tolerance

    @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
-    @pytest.mark.parametrize("n_images", [1, 2])
    @pytest.mark.parametrize("font_scale", [1, 1.5, 2])
    @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
    @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))
--- a/test/unit_tests/parse_configuration_test.py
+++ b/test/unit_tests/parse_configuration_test.py
@ -1,6 +0,0 @@
-from cv_analysis.config import CONFIG
-
-
-def test_config():
-    assert CONFIG.service
-    assert CONFIG.webserver