From 521222eb963874ce5e49ea44dce7a8ae88d9eca5 Mon Sep 17 00:00:00 2001
From: Julius Unverfehrt <Julius.Unverfehrt@iqser.com>
Date: Mon, 12 Sep 2022 14:49:56 +0200
Subject: [PATCH] Pull request #31: RED-5202 port hotfixes

Merge in RR/image-prediction from RED-5202-port-hotfixes to master

Squashed commit of the following:

commit c1b92270354c764861da0f7782348e9cd0725d76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:28:44 2022 +0200

    fixed statefulness issue with os.environ in tests

commit ad9c5657fe93079d5646ba2b70fa091e8d2daf76
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Mon Sep 12 13:04:55 2022 +0200

    - Adapted response formatting logic for threshold maps passed via env vars.
    - Added test for reading threshold maps and values from env vars.

commit c60e8cd6781b8e0c3ec69ccd0a25375803de26f0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 11:38:01 2022 +0200

    add parser for environment variables WIP

commit 101b71726c697f30ec9298ba62d2203bd7da2efb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:52:33 2022 +0200

    Add typehints, make custom page quotient breach function private since the intention of outsourcing it from build_image_info is to make it testable seperately

commit 04aee4e62781e78cd54c6d20e961dcd7bf1fc081
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 09:25:59 2022 +0200

    DotIndexable default get method exception made more specific

commit 4584e7ba66400033dc5f1a38473b644eeb11e67c
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:55:05 2022 +0200

    RED-5202 port temporary broken image handling so the hotfix won't be lost by upgrading the service. A proper solution is still desirable (see RED-5148)

commit 5f99622646b3f6d3a842aebef91ff8e082072cd6
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Sep 12 08:47:02 2022 +0200

    RED-5202 add per class customizable max image to page quotient setting for signatures, default is 0.4. Can be overwritten by , set to null to use default value or set to value that should be used.
---
 image_prediction/config.py                    |   6 ++
 image_prediction/exceptions.py                |   4 +
 .../image_extractor/extractors/parsable.py    |  25 ++++-
 .../transformer/transformers/response.py      | 101 +++++++++++++++---
 test/unit_tests/config_test.py                |  10 ++
 test/unit_tests/response_transformer_test.py  |  36 +++++++
 6 files changed, 167 insertions(+), 15 deletions(-)
 create mode 100644 test/unit_tests/response_transformer_test.py

diff --git a/image_prediction/config.py b/image_prediction/config.py
index 4696191..98d2af1 100644
--- a/image_prediction/config.py
+++ b/image_prediction/config.py
@@ -15,6 +15,12 @@ class DotIndexable:
     def __init__(self, x):
         self.x = x
 
+    def get(self, item, default=None):
+        try:
+            return _get_item_and_maybe_make_dotindexable(self.x, item)
+        except KeyError:
+            return default
+
     def __getattr__(self, item):
         return _get_item_and_maybe_make_dotindexable(self.x, item)
 
diff --git a/image_prediction/exceptions.py b/image_prediction/exceptions.py
index 1b88f0d..f03b42a 100644
--- a/image_prediction/exceptions.py
+++ b/image_prediction/exceptions.py
@@ -32,3 +32,7 @@ class IntentionalTestException(RuntimeError):
 
 class InvalidBox(Exception):
     pass
+
+
+class ParsingError(Exception):
+    pass
diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py
index f478f38..eac09e1 100644
--- a/image_prediction/image_extractor/extractors/parsable.py
+++ b/image_prediction/image_extractor/extractors/parsable.py
@@ -1,14 +1,17 @@
 import atexit
 import io
+import json
+import traceback
 from functools import partial, lru_cache
 from itertools import chain, starmap, filterfalse
-from operator import itemgetter
-from typing import List
+from operator import itemgetter, truth
+from typing import List, Iterable, Iterator
 
 import fitz
 from PIL import Image
 from funcy import rcompose, merge, pluck, curry, compose
 
+from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
 from image_prediction.info import Info
 from image_prediction.stitching.stitching import stitch_pairs
@@ -47,10 +50,28 @@ class ParsablePDFImageExtractor(ImageExtractor):
         clear_caches()
 
         image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
+        #  TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
+        #   validation here. Invalid images can then be split into a different stream and joined with the intact images
+        #   again for the formatting step.
+        image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
         image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
 
         yield from image_metadata_pairs
 
+    @staticmethod
+    def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
+        def validate(image: Image.Image, metadata: dict):
+            try:
+                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
+                image.resize((100, 100)).convert("RGB")
+                return ImageMetadataPair(image, metadata)
+            except (OSError, Exception) as err:
+                metadata = json.dumps(EnumFormatter()(metadata), indent=2)
+                logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
+                return None
+
+        return filter(truth, starmap(validate, image_metadata_pairs))
+
 
 def extract_pages(doc, page_range):
     page_range = range(page_range.start + 1, page_range.stop + 1)
diff --git a/image_prediction/transformer/transformers/response.py b/image_prediction/transformer/transformers/response.py
index 3e35104..91244c9 100644
--- a/image_prediction/transformer/transformers/response.py
+++ b/image_prediction/transformer/transformers/response.py
@@ -1,7 +1,13 @@
+import json
 import math
+import os
+from functools import lru_cache
 from operator import itemgetter
 
+from funcy import filter, juxt, first, rest, compose
+
 from image_prediction.config import CONFIG
+from image_prediction.exceptions import ParsingError
 from image_prediction.transformer.transformer import Transformer
 from image_prediction.utils import get_logger
 
@@ -14,6 +20,45 @@ class ResponseTransformer(Transformer):
         return build_image_info(data)
 
 
+def get_class_specific_min_image_to_page_quotient(label, table=None):
+    return get_class_specific_value(
+        "REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table
+    )
+
+
+def get_class_specific_max_image_to_page_quotient(label, table=None):
+    return get_class_specific_value(
+        "REL_IMAGE_SIZE", label, "max", CONFIG.filters.image_to_page_quotient.max, table=table
+    )
+
+
+def get_class_specific_min_image_width_to_height_quotient(label, table=None):
+    return get_class_specific_value(
+        "IMAGE_FORMAT", label, "min", CONFIG.filters.image_width_to_height_quotient.min, table=table
+    )
+
+
+def get_class_specific_max_image_width_to_height_quotient(label, table=None):
+    return get_class_specific_value(
+        "IMAGE_FORMAT", label, "max", CONFIG.filters.image_width_to_height_quotient.max, table=table
+    )
+
+
+def get_class_specific_min_classification_confidence(label, table=None):
+    return get_class_specific_value("CONFIDENCE", label, "min", CONFIG.filters.min_confidence, table=table)
+
+
+def get_class_specific_value(prefix, label, bound, fallback_value, table=None):
+    def fallback():
+        logger.warning(f"Failed to resolve {bound} {prefix.lower().replace('_', '-')} value for class '{label}'.")
+        return fallback_value
+
+    assert bound in ["min", "max"]
+
+    threshold_map = parse_env_var(prefix, table=table) or {}
+    return threshold_map.get(label, {}).get(bound) or fallback()
+
+
 def build_image_info(data: dict) -> dict:
     def compute_geometric_quotient():
         page_area_sqrt = math.sqrt(abs(page_width * page_height))
@@ -24,21 +69,29 @@ def build_image_info(data: dict) -> dict:
         "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
     )(data)
 
-    quotient = round(compute_geometric_quotient(), 4)
-
-    min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
-    max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
-    min_image_width_to_height_quotient_breached = bool(
-        width / height < CONFIG.filters.image_width_to_height_quotient.min
-    )
-    max_image_width_to_height_quotient_breached = bool(
-        width / height > CONFIG.filters.image_width_to_height_quotient.max
-    )
-
     classification = data["classification"]
+    label = classification["label"]
     representation = data["representation"]
 
-    min_confidence_breached = bool(max(classification["probabilities"].values()) < CONFIG.filters.min_confidence)
+    geometric_quotient = round(compute_geometric_quotient(), 4)
+
+    min_image_to_page_quotient_breached = bool(
+        geometric_quotient < get_class_specific_min_image_to_page_quotient(label)
+    )
+    max_image_to_page_quotient_breached = bool(
+        geometric_quotient > get_class_specific_max_image_to_page_quotient(label)
+    )
+
+    min_image_width_to_height_quotient_breached = bool(
+        width / height < get_class_specific_min_image_width_to_height_quotient(label)
+    )
+    max_image_width_to_height_quotient_breached = bool(
+        width / height > get_class_specific_max_image_width_to_height_quotient(label)
+    )
+
+    min_confidence_breached = bool(
+        max(classification["probabilities"].values()) < get_class_specific_min_classification_confidence(label)
+    )
 
     image_info = {
         "classification": classification,
@@ -49,7 +102,7 @@ def build_image_info(data: dict) -> dict:
         "filters": {
             "geometry": {
                 "imageSize": {
-                    "quotient": quotient,
+                    "quotient": geometric_quotient,
                     "tooLarge": max_image_to_page_quotient_breached,
                     "tooSmall": min_image_to_page_quotient_breached,
                 },
@@ -73,3 +126,25 @@ def build_image_info(data: dict) -> dict:
     }
 
     return image_info
+
+
+@lru_cache(maxsize=None)
+def parse_env_var(prefix, table=None):
+    table = table or os.environ
+    head, tail = juxt(first, compose(list, rest))(filter(prefix, table))
+    if not head:
+        logger.warning(f"Found no environment variable with prefix '{prefix}'.")
+    elif tail:
+        logger.warning(f"Found multiple candidates for environment variable with prefix '{prefix}'.")
+    else:
+        try:
+            return parse_env_var_value(table[head])
+        except ParsingError as err:
+            logger.warning(err)
+
+
+def parse_env_var_value(env_var_value):
+    try:
+        return json.loads(env_var_value)
+    except Exception as err:
+        raise ParsingError(f"Failed to parse {env_var_value}") from err
diff --git a/test/unit_tests/config_test.py b/test/unit_tests/config_test.py
index 8c16cf8..878d7e1 100644
--- a/test/unit_tests/config_test.py
+++ b/test/unit_tests/config_test.py
@@ -36,3 +36,13 @@ def test_dot_access_key_does_not_exists(config):
 
 def test_access_key_does_not_exists(config):
     assert config["B"] is None
+
+
+def test_get_method_returns_key_if_key_does_exist(config):
+    dot_indexable = config.D.E
+    assert dot_indexable.get("F", "default_value") is True
+
+
+def test_get_method_returns_default_if_key_does_not_exist(config):
+    dot_indexable = config.D.E
+    assert dot_indexable.get("X", "default_value") == "default_value"
diff --git a/test/unit_tests/response_transformer_test.py b/test/unit_tests/response_transformer_test.py
new file mode 100644
index 0000000..ac8d822
--- /dev/null
+++ b/test/unit_tests/response_transformer_test.py
@@ -0,0 +1,36 @@
+import json
+
+import pytest
+from frozendict import frozendict
+
+from image_prediction.transformer.transformers.response import (
+    get_class_specific_min_image_to_page_quotient,
+    get_class_specific_max_image_to_page_quotient,
+    get_class_specific_max_image_width_to_height_quotient,
+    get_class_specific_min_image_width_to_height_quotient,
+    get_class_specific_min_classification_confidence,
+)
+
+
+@pytest.fixture
+def label():
+    return "signature"
+
+
+@pytest.fixture
+def page_quotient_threshold_map(label):
+    return frozendict(
+        {
+            "REL_IMAGE_SIZE_MAP": json.dumps({label: {"min": 0.1, "max": 0.2}}),
+            "IMAGE_FORMAT_MAP": json.dumps({label: {"min": 0.5, "max": 0.4}}),
+            "CONFIDENCE": json.dumps({label: {"min": 0.8}}),
+        }
+    )
+
+
+def test_read_environment_vars_for_thresholds(page_quotient_threshold_map, label):
+    assert get_class_specific_min_image_to_page_quotient(label, table=page_quotient_threshold_map) == 0.1
+    assert get_class_specific_max_image_to_page_quotient(label, table=page_quotient_threshold_map) == 0.2
+    assert get_class_specific_min_image_width_to_height_quotient(label, table=page_quotient_threshold_map) == 0.5
+    assert get_class_specific_max_image_width_to_height_quotient(label, table=page_quotient_threshold_map) == 0.4
+    assert get_class_specific_min_classification_confidence(label, table=page_quotient_threshold_map) == 0.8