From 42ae5793e04fbc546b29a122e9718041380faa50 Mon Sep 17 00:00:00 2001
From: cdietrich <clarissa.dietrich@iqser.com>
Date: Tue, 1 Mar 2022 16:48:03 +0100
Subject: [PATCH] RED-3501: adapt service-container to image-service-v2

---
 .gitignore                          |  2 +
 .gitmodules                         |  2 +-
 Dockerfile                          |  3 +-
 Dockerfile_base                     |  4 +-
 config.yaml                         | 26 +++++---
 data/base_weights.h5.dvc            |  4 ++
 data/mlruns.dvc                     |  5 ++
 image_prediction/locations.py       | 17 +++--
 image_prediction/predictor.py       | 15 +++--
 image_prediction/response.py        | 71 +++++++++++++++++++++
 image_prediction/utils/estimator.py | 32 ----------
 incl/redai_image                    |  1 +
 setup/docker.sh                     |  6 +-
 src/serve.py                        | 97 ++++++++++++++++++-----------
 14 files changed, 191 insertions(+), 94 deletions(-)
 create mode 100644 data/base_weights.h5.dvc
 create mode 100644 data/mlruns.dvc
 create mode 100644 image_prediction/response.py
 delete mode 100644 image_prediction/utils/estimator.py
 create mode 160000 incl/redai_image

diff --git a/.gitignore b/.gitignore
index 56faae6..a14b81f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,3 +171,5 @@ fabric.properties
 .idea/codestream.xml
 
 # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
+/image_prediction/data/mlruns/
+/data/mlruns/
diff --git a/.gitmodules b/.gitmodules
index 91435b0..1ee8d73 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "incl/redai_image"]
 	path = incl/redai_image
-	url = ssh://git@git.iqser.com:2222/rr/redai_image.git
\ No newline at end of file
+	url = ssh://git@git.iqser.com:2222/rr/redai_image.git
diff --git a/Dockerfile b/Dockerfile
index 7e336d2..fb00b3e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,13 +11,14 @@ COPY image_prediction ./image_prediction
 COPY ./setup.py ./setup.py
 COPY ./requirements.txt ./requirements.txt
 COPY ./config.yaml ./config.yaml
+COPY data data
 
 # Install dependencies differing from base image.
 RUN python3 -m pip install -r requirements.txt
 
 RUN python3 -m pip install -e .
 
-WORKDIR /app/service/incl/redai_image
+WORKDIR /app/service/incl/redai_image/redai
 RUN python3 -m pip install -e .
 WORKDIR /app/service
 
diff --git a/Dockerfile_base b/Dockerfile_base
index 4085902..c8decb7 100644
--- a/Dockerfile_base
+++ b/Dockerfile_base
@@ -10,11 +10,13 @@ RUN python -m pip install --upgrade pip
 # Make a directory for the service files and copy the service repo into the container.
 WORKDIR /app/service
 COPY ./requirements.txt ./requirements.txt
-COPY ./data ./data
+COPY ./incl/redai_image/redai/requirements_user.txt ./requirements_redai.txt
 
 # Install dependencies.
 RUN python3 -m pip install -r requirements.txt
 
+RUN python3 -m pip install -r requirements_redai.txt
+
 # Make a new container and copy all relevant files over to filter out temporary files
 # produced during setup to reduce the final container's size.
 FROM python:3.8
diff --git a/config.yaml b/config.yaml
index 2a202b4..c48b4d6 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,10 +1,3 @@
-estimator:
-  checkpoint: checkpoint.pth
-  classes: ["logo", "other", "formula", "signature", "handwriting_other"]
-  rejection_class: "other"
-  threshold: .5
-  device: cpu
-
 webserver:
   host: $SERVER_HOST|"127.0.0.1"  # webserver address
   port: $SERVER_PORT|5000  # webserver port
@@ -14,3 +7,22 @@ service:
   logging_level: $LOGGING_LEVEL_ROOT|DEBUG  # Logging level for service logger
   batch_size: $BATCH_SIZE|2  # Number of images in memory simultaneously
   verbose: $VERBOSE|True  # Service prints document processing progress to stdout
+  run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7  # The ID of the mlflow run to load the model from
+
+
+# These variables control filters that are applied to either images, image metadata or model predictions. The filter
+# result values are reported in the service responses. For convenience the response to a request contains a
+# "filters.allPassed" field, which is set to false if any of the filters returned values did not meet its specified
+# required value.
+filters:
+
+  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
+    min: $MIN_REL_IMAGE_SIZE|0.05  # Minimum permissible
+    max: $MAX_REL_IMAGE_SIZE|0.75  # Maximum permissible
+
+  image_width_to_height_quotient:  # Image width to height ratio
+    min: $MIN_IMAGE_FORMAT|0.1  # Minimum permissible
+    max: $MAX_IMAGE_FORMAT|10  # Maximum permissible
+
+  min_confidence: $MIN_CONFIDENCE|0.5  # Minimum permissible prediction confidence
+
diff --git a/data/base_weights.h5.dvc b/data/base_weights.h5.dvc
new file mode 100644
index 0000000..9f07d13
--- /dev/null
+++ b/data/base_weights.h5.dvc
@@ -0,0 +1,4 @@
+outs:
+- md5: 6d0186c1f25e889d531788f168fa6cf0
+  size: 16727296
+  path: base_weights.h5
diff --git a/data/mlruns.dvc b/data/mlruns.dvc
new file mode 100644
index 0000000..d390fed
--- /dev/null
+++ b/data/mlruns.dvc
@@ -0,0 +1,5 @@
+outs:
+- md5: d1c708270bab6fcd344d4a8b05d1103d.dir
+  size: 150225383
+  nfiles: 178
+  path: mlruns
diff --git a/image_prediction/locations.py b/image_prediction/locations.py
index 264cdda..9c67dc0 100644
--- a/image_prediction/locations.py
+++ b/image_prediction/locations.py
@@ -1,7 +1,14 @@
-from pathlib import Path
+from os import path
 
+MODULE_DIR = path.dirname(path.abspath(__file__))
+PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR)
+REPO_ROOT_DIR = path.dirname(path.dirname(PACKAGE_ROOT_DIR))
 
-MODULE_ROOT = Path(__file__).resolve().parents[1]
-CONFIG_FILE = MODULE_ROOT / "config.yaml"
-DATA_DIR = MODULE_ROOT / "data"
-TORCH_HOME = DATA_DIR
+DOCKER_COMPOSE_FILE = path.join(REPO_ROOT_DIR, "docker-compose.yaml")
+
+CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
+LOG_FILE = "/tmp/log.log"
+
+DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
+MLRUNS_DIR = path.join(DATA_DIR, "mlruns")
+BASE_WEIGHTS = path.join(DATA_DIR, "base_weights.h5")
diff --git a/image_prediction/predictor.py b/image_prediction/predictor.py
index 8320583..3d89a69 100644
--- a/image_prediction/predictor.py
+++ b/image_prediction/predictor.py
@@ -1,7 +1,13 @@
 import logging
 from operator import itemgetter
+from typing import List, Dict
+
+import numpy as np
 
 from image_prediction.config import CONFIG
+from image_prediction.locations import MLRUNS_DIR, BASE_WEIGHTS
+from incl.redai_image.redai.redai.backend.model.model_handle import ModelHandle
+from incl.redai_image.redai.redai.utils.mlflow_reader import MlflowModelReader
 
 
 class Predictor:
@@ -21,9 +27,7 @@ class Predictor:
                 reader = MlflowModelReader(
                     run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR
                 )
-                # message_queue.put(text="Loading model...", level=logging.DEBUG)
                 self.model_handle = reader.get_model_handle(BASE_WEIGHTS)
-                # message_queue.put(text="Model loaded.", level=logging.DEBUG)
             else:
                 self.model_handle = model_handle
 
@@ -31,12 +35,7 @@ class Predictor:
             self.classes_readable = np.array(self.model_handle.classes)
             self.classes_readable_aligned = self.classes_readable[self.classes[list(range(len(self.classes)))]]
         except Exception as e:
-            message_queue.put(
-                text="Service estimator initialization failed.",
-                exception=e,
-                level=logging.CRITICAL,
-                trace=traceback.format_exc(),
-            )
+            logging.info(f"Service estimator initialization failed: {e}")
 
     def __make_predictions_human_readable(self, probs: np.ndarray) -> List[Dict[str, float]]:
         """Translates an n x m matrix of probabilities over classes into an n-element list of mappings from classes to
diff --git a/image_prediction/response.py b/image_prediction/response.py
new file mode 100644
index 0000000..2fc3225
--- /dev/null
+++ b/image_prediction/response.py
@@ -0,0 +1,71 @@
+"""Defines functions for constructing service responses."""
+
+
+from itertools import starmap
+from operator import itemgetter
+
+import numpy as np
+
+from image_prediction.config import CONFIG
+
+
+def build_response(predictions: list, metadata: list) -> list:
+    return list(starmap(build_image_info, zip(predictions, metadata)))
+
+
+def build_image_info(prediction: dict, metadata: dict) -> dict:
+    def compute_geometric_quotient():
+        page_area_sqrt = np.sqrt(abs(page_width * page_height))
+        image_area_sqrt = np.sqrt(abs(x2 - x1) * abs(y2 - y1))
+        return image_area_sqrt / page_area_sqrt
+
+    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
+        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
+    )(metadata)
+
+    quotient = compute_geometric_quotient()
+
+    min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
+    max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
+    min_image_width_to_height_quotient_breached = bool(
+        width / height < CONFIG.filters.image_width_to_height_quotient.min
+    )
+    max_image_width_to_height_quotient_breached = bool(
+        width / height > CONFIG.filters.image_width_to_height_quotient.max
+    )
+
+    min_confidence_breached = bool(max(prediction["probabilities"].values()) < CONFIG.filters.min_confidence)
+    prediction["label"] = prediction.pop("class")  # "class" as field name causes problem for Java objectmapper
+    prediction["probabilities"] = {klass: np.round(prob, 6) for klass, prob in prediction["probabilities"].items()}
+
+    image_info = {
+        "classification": prediction,
+        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": metadata["page_idx"] + 1},
+        "geometry": {"width": width, "height": height},
+        "filters": {
+            "geometry": {
+                "imageSize": {
+                    "quotient": quotient,
+                    "tooLarge": max_image_to_page_quotient_breached,
+                    "tooSmall": min_image_to_page_quotient_breached,
+                },
+                "imageFormat": {
+                    "quotient": width / height,
+                    "tooTall": min_image_width_to_height_quotient_breached,
+                    "tooWide": max_image_width_to_height_quotient_breached,
+                },
+            },
+            "probability": {"unconfident": min_confidence_breached},
+            "allPassed": not any(
+                [
+                    max_image_to_page_quotient_breached,
+                    min_image_to_page_quotient_breached,
+                    min_image_width_to_height_quotient_breached,
+                    max_image_width_to_height_quotient_breached,
+                    min_confidence_breached,
+                ]
+            ),
+        },
+    }
+
+    return image_info
diff --git a/image_prediction/utils/estimator.py b/image_prediction/utils/estimator.py
deleted file mode 100644
index 113f02a..0000000
--- a/image_prediction/utils/estimator.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-
-from image_prediction.config import CONFIG
-from image_prediction.locations import DATA_DIR, TORCH_HOME
-from image_prediction.predictor import Predictor
-
-
-def suppress_userwarnings():
-    import warnings
-
-    warnings.filterwarnings("ignore")
-
-
-def load_classes():
-    classes = CONFIG.estimator.classes
-    id2class = dict(zip(range(1, len(classes) + 1), classes))
-    return id2class
-
-
-def get_checkpoint():
-    return DATA_DIR / CONFIG.estimator.checkpoint
-
-
-def set_torch_env():
-    os.environ["TORCH_HOME"] = str(TORCH_HOME)
-
-
-def initialize_predictor(resume):
-    set_torch_env()
-    checkpoint = get_checkpoint() if not resume else resume
-    predictor = Predictor(checkpoint, classes=load_classes(), rejection_class=CONFIG.estimator.rejection_class)
-    return predictor
diff --git a/incl/redai_image b/incl/redai_image
new file mode 160000
index 0000000..4c3b26d
--- /dev/null
+++ b/incl/redai_image
@@ -0,0 +1 @@
+Subproject commit 4c3b26d7673457aaa99e0663dad6950cd36da967
diff --git a/setup/docker.sh b/setup/docker.sh
index b8cf880..7b4a837 100755
--- a/setup/docker.sh
+++ b/setup/docker.sh
@@ -5,9 +5,9 @@ python3 -m venv build_venv
 source build_venv/bin/activate
 python3 -m pip install --upgrade pip
 
-#pip install dvc
-#pip install 'dvc[ssh]'
-#dvc pull
+pip install dvc
+pip install 'dvc[ssh]'
+dvc pull
 
 git submodule update --init --recursive
 
diff --git a/src/serve.py b/src/serve.py
index 65f6d04..4c292d3 100644
--- a/src/serve.py
+++ b/src/serve.py
@@ -1,17 +1,29 @@
 import argparse
+import json
 import logging
-from typing import Callable
+import tempfile
+from itertools import chain
+from operator import itemgetter
+from typing import Iterable
 
 from flask import Flask, request, jsonify
 from waitress import serve
 
 from image_prediction.config import CONFIG
-from image_prediction.utils.estimator import suppress_userwarnings, initialize_predictor
+from image_prediction.predictor import Predictor
+from image_prediction.response import build_response
+from incl.redai_image.redai.redai.backend.pdf.image_extraction import extract_and_stitch
+from incl.redai_image.redai.redai.utils.shared import chunk_iterable
+
+
+def suppress_userwarnings():
+    import warnings
+
+    warnings.filterwarnings("ignore")
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--resume")
     parser.add_argument("--warnings", action="store_true", default=False)
     args = parser.parse_args()
 
@@ -22,16 +34,9 @@ def main(args):
     if not args.warnings:
         suppress_userwarnings()
 
-    predictor = initialize_predictor(args.resume)
+    predictor = Predictor()
     logging.info("Predictor ready.")
 
-    prediction_server = make_prediction_server(predictor.predict_pdf)
-
-    run_prediction_server(prediction_server, mode=CONFIG.webserver.mode)
-
-
-def make_prediction_server(predict_fn: Callable):
-
     app = Flask(__name__)
 
     @app.route("/ready", methods=["GET"])
@@ -48,46 +53,66 @@ def make_prediction_server(predict_fn: Callable):
 
     @app.route("/", methods=["POST"])
     def predict():
-        def __predict():
-
-            def inner():
-
-                pdf = request.data
-
-                logging.debug("Running predictor on document...")
-                predictions = predict_fn(pdf)
-                logging.debug(f"Found {len(predictions)} images in document.")
-                response = jsonify(list(predictions))
+        pdf = request.data
 
+        logging.debug("Running predictor on document...")
+        # extract images from pdfs
+        with tempfile.NamedTemporaryFile() as tmp_file:
+            tmp_file.write(pdf)
+            image_metadata_pairs = extract_image_metadata_pairs(tmp_file.name)
+            try:
+                predictions, metadata = classify_images(predictor, image_metadata_pairs)
+            except Exception as err:
+                logging.warning("Analysis failed.")
+                logging.exception(err)
+                response = jsonify("Analysis failed.")
+                response.status_code = 500
                 return response
+            logging.debug(f"Found images in document.")
 
-            logging.info(f"Analyzing...")
-            result = inner()
-            logging.info("Analysis completed.")
-            return result
+        response = jsonify(build_response(list(predictions), list(metadata)))
 
-        try:
-            return __predict()
-        except Exception as err:
-            logging.warning("Analysis failed.")
-            logging.exception(err)
-            response = jsonify("Analysis failed.")
-            response.status_code = 500
-            return response
+        logging.info("Analysis completed.")
+        return response
 
-    return app
+    run_prediction_server(app, mode=CONFIG.webserver.mode)
 
 
 def run_prediction_server(app, mode="development"):
-
     if mode == "development":
         app.run(host=CONFIG.webserver.host, port=CONFIG.webserver.port, debug=True)
     elif mode == "production":
         serve(app, host=CONFIG.webserver.host, port=CONFIG.webserver.port)
 
 
-if __name__ == "__main__":
+def extract_image_metadata_pairs(pdf_path: str, **kwargs):
+    def image_is_large_enough(metadata: dict):
+        x1, x2, y1, y2 = itemgetter("x1", "x2", "y1", "y2")(metadata)
 
+        return abs(x1 - x2) > 2 and abs(y1 - y2) > 2
+
+    yield from extract_and_stitch(pdf_path, convert_to_rgb=True, filter_fn=image_is_large_enough, **kwargs)
+
+
+def classify_images(predictor, image_metadata_pairs: Iterable, batch_size: int = CONFIG.service.batch_size):
+    def process_chunk(chunk):
+        images, metadata = zip(*chunk)
+        predictions = predictor.predict(images, probabilities=True)
+        return predictions, metadata
+
+    def predict(image_metadata_pair_generator):
+        chunks = chunk_iterable(image_metadata_pair_generator, n=batch_size)
+        return map(chain.from_iterable, zip(*map(process_chunk, chunks)))
+
+    try:
+        predictions, metadata = predict(image_metadata_pairs)
+        return predictions, metadata
+
+    except ValueError:
+        return [], []
+
+
+if __name__ == "__main__":
     logging_level = CONFIG.service.logging_level
     logging.basicConfig(level=logging_level)
     logging.getLogger("flask").setLevel(logging.ERROR)