From 0d06ad657e3c21dcef361c53df37b05aba64528b Mon Sep 17 00:00:00 2001
From: Matthias Bisping <matthias.bisping@iqser.com>
Date: Mon, 25 Apr 2022 11:19:35 +0200
Subject: [PATCH] readme updated and config

---
 README.md   | 129 ++++++++++++++++++++++++++++++++++++++++++++++++----
 config.yaml |  14 +++---
 2 files changed, 126 insertions(+), 17 deletions(-)
diff --git a/README.md b/README.md
index f913627..72ff084 100644
--- a/README.md
+++ b/README.md
@@ -1,25 +1,136 @@
-### Building
+### Setup
 
 Build base image
 ```bash
-setup/docker.sh
-```
-
-Build head image
-```bash
-docker build -f Dockerfile -t image-prediction . --build-arg BASE_ROOT=""
+docker build -f Dockerfile_base -t image-prediction-base .
+docker build -f Dockerfile -t image-prediction .
 ```
 
 ### Usage
 
+#### Without Docker
+
+
+```bash
+python scripts/run_pipeline.py /path/to/a/pdf
+```
+
+#### With Docker
+
 Shell 1
 
 ```bash
-docker run --rm --net=host --rm image-prediction
+docker run --rm --net=host image-prediction
 ```
 
 Shell 2
 
 ```bash
-python scripts/pyinfra_mock.py --pdf_path /path/to/a/pdf
+python scripts/pyinfra_mock.py /path/to/a/pdf
 ```
+
+### Message Body Formats
+
+
+#### Request Format
+
+The request messages need to provide the fields `"dossierID"` and `"fileID"`. The file to be processed is assumed to be
+located in the MinIO store under `redaction/<dossierID>/<fileID>.ORIG.pdf.gz`. A request should look like this:
+
+```json
+{
+    "dossierID": "<string identifier>",
+    "fileID": "<string identifier>"
+}
+```
+
+Any additional keys are ignored.
+
+
+#### Response Format
+
+Response bodies contain information about the identified class of the image, the confidence of the classification, the
+position and size of the image as well as the results of additional convenience filters which can be configured through
+environment variables. A response body looks like this:
+
+```json
+{
+  "dossierId": "debug",
+  "fileId": "13ffa9851740c8d20c4c7d1706d72f2a",
+  "data": [...]
+}
+```
+
+An image metadata record (an entry in the `"data"` field of a response body) looks like this:
+
+```json
+{
+  "classification": {
+    "label": "logo",
+    "probabilities": {
+      "logo": 1.0,
+      "signature": 1.1599173226749333e-17,
+      "other": 2.994595513398207e-23,
+      "formula": 4.352109377281029e-31
+    }
+  },
+  "position": {
+    "x1": 475.95,
+    "x2": 533.4,
+    "y1": 796.47,
+    "y2": 827.62,
+    "pageNumber": 6
+  },
+  "geometry": {
+    "width": 57.44999999999999,
+    "height": 31.149999999999977
+  },
+  "filters": {
+    "geometry": {
+      "imageSize": {
+        "quotient": 0.05975350599135938,
+        "tooLarge": false,
+        "tooSmall": false
+      },
+      "imageFormat": {
+        "quotient": 1.8443017656500813,
+        "tooTall": false,
+        "tooWide": false
+      }
+    },
+    "probability": {
+      "unconfident": false
+    },
+    "allPassed": true
+  }
+}
+```
+
+
+## Configuration
+
+A configuration file is located under `incl/image_service/config.yaml`. All relevant variables can be overridden by
+exporting the environment variables listed below.
+
+| __Environment Variable__           | Default                            | Description                                                                            |
+|------------------------------------|------------------------------------|----------------------------------------------------------------------------------------|
+| __LOGGING_LEVEL_ROOT__             | "INFO"                             | Logging level for log file messages                                                    |
+| __VERBOSE__                        | *true*                             | Service prints document processing progress to stdout                                  |
+| __BATCH_SIZE__                     | 16                                 | Number of images in memory simultaneously per service instance                         |
+| __RUN_ID__                         | "fabfb1f192c745369b88cab34471aba7" | The ID of the mlflow run to load the image classifier from                             |
+| __MIN_REL_IMAGE_SIZE__             | 0.05                               | Minimally permissible image size to page size ratio                                    |
+| __MAX_REL_IMAGE_SIZE__             | 0.75                               | Maximally permissible image size to page size ratio                                    |
+| __MIN_IMAGE_FORMAT__               | 0.1                                | Minimally permissible image width to height ratio                                      |
+| __MAX_IMAGE_FORMAT__               | 10                                 | Maximally permissible image width to height ratio                                      |
+
+See also: https://git.iqser.com/projects/RED/repos/helm/browse/redaction/templates/image-service-v2
+
+
+## Liveness and Readiness
+
+The service runs a webserver on `0.0.0.0:8080` which responds to GET requests on `0.0.0.0:8080/ready` and `0.0.0.0:8080/health`
+with the status of the service (status code 200 for nominal status). Each service instance is monitored independently.
+A request to `0.0.0.0:8080` is forwarded to subordinate webservers, each coupled to exactly one service instance.
+The responses from the subordinate webservers are aggregated under either a universal (all) or an existential quantifier
+(see `CHECK_QUANTIFIER`). Note that checks are evaluated lazily, so missing check logs from some subordinate webservers
+are to be expected when using an existential quantifier.
diff --git a/config.yaml b/config.yaml
index ab36d34..7c88982 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,20 +1,18 @@
 webserver:
   host: $SERVER_HOST|"127.0.0.1"  # webserver address
   port: $SERVER_PORT|5000  # webserver port
-  mode: $SERVER_MODE|production  # webserver mode: {development, production}
 
 service:
-  logging_level: INFO  # Logging level for service logger
-  progressbar: True  # Whether a progress bar over the pages of a document is displayed while processing
-  batch_size: $BATCH_SIZE|16  # Number of images in memory simultaneously
+  logging_level: $LOGGING_LEVEL_ROOT|INFO  # Logging level for service logger
   verbose: $VERBOSE|True  # Service prints document processing progress to stdout
+  batch_size: $BATCH_SIZE|16  # Number of images in memory simultaneously
   run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7  # The ID of the mlflow run to load the service_estimator from
 
 
-# These variables control filters that are applied to either images, image metadata or service_estimator predictions. The filter
-# result values are reported in the service responses. For convenience the response to a request contains a
-# "filters.allPassed" field, which is set to false if any of the filters returned values did not meet its specified
-# required value.
+# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
+# The filter result values are reported in the service responses. For convenience the response to a request contains a
+# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
+# specified required value.
 filters:
 
   image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)