Compare commits
4 Commits
master
...
release/1.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2a62c4fca2 | ||
|
|
f48455c496 | ||
|
|
ee48f141f8 | ||
|
|
c03913e088 |
@ -18,6 +18,10 @@ filters:
|
||||
image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
|
||||
min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
|
||||
max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
|
||||
customized: # Customized settings per class (RED-5202)
|
||||
max:
|
||||
signature: $MAX_REL_SIGNATURE_SIZE|0.4
|
||||
|
||||
|
||||
image_width_to_height_quotient: # Image width to height ratio
|
||||
min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
|
||||
|
||||
@ -15,6 +15,12 @@ class DotIndexable:
|
||||
def __init__(self, x):
    # Store the wrapped container; get()/__getattr__ below delegate item
    # access to it via _get_item_and_maybe_make_dotindexable.
    self.x = x
|
||||
|
||||
def get(self, item, default=None):
    """Dict-style lookup on the wrapped container.

    Returns the item (wrapped as DotIndexable when it is itself a container),
    or *default* when the key is absent.
    """
    try:
        result = _get_item_and_maybe_make_dotindexable(self.x, item)
    except KeyError:
        # Absent key: mirror dict.get semantics instead of raising.
        result = default
    return result
|
||||
|
||||
def __getattr__(self, item):
    # Attribute access falls through to item access on the wrapped container,
    # enabling dot-notation like config.filters.max (raises KeyError, not
    # AttributeError, when the item is missing — by design of the helper).
    return _get_item_and_maybe_make_dotindexable(self.x, item)
|
||||
|
||||
|
||||
@ -1,14 +1,17 @@
|
||||
import atexit
|
||||
import io
|
||||
import json
|
||||
import traceback
|
||||
from functools import partial, lru_cache
|
||||
from itertools import chain, starmap, filterfalse
|
||||
from operator import itemgetter
|
||||
from operator import itemgetter, truth
|
||||
from typing import List
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from funcy import rcompose, merge, pluck, curry, compose
|
||||
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
@ -46,10 +49,28 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
clear_caches()
|
||||
|
||||
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
||||
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
|
||||
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
||||
# again for the formatting step.
|
||||
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
||||
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||
|
||||
yield from image_metadata_pairs
|
||||
|
||||
@staticmethod
def __filter_valid_images(image_metadata_pairs):
    """Drop pairs whose image PIL cannot actually decode/process.

    Each surviving pair is re-wrapped as an ImageMetadataPair; invalid
    images are logged with their metadata and filtered out.
    """
    def validate(image: Image.Image, metadata: dict):
        try:
            # TODO: stand-in heuristic for testing if image is valid => find cleaner solution
            image.resize((100, 100)).convert("RGB")
            return ImageMetadataPair(image, metadata)
        except Exception:
            # Was `except (OSError, Exception)` — redundant, OSError is an
            # Exception subclass. Broad catch is deliberate: any decode
            # failure means "invalid image", best-effort skip with a warning.
            formatted_metadata = json.dumps(EnumFormatter()(metadata), indent=2)
            logger.warning(
                f"Invalid image encountered. Image metadata:\n{formatted_metadata}\n\n{traceback.format_exc()}"
            )
            return None

    # truth() drops the None sentinels returned for invalid images.
    return filter(truth, starmap(validate, image_metadata_pairs))
|
||||
|
||||
|
||||
def extract_pages(doc, page_range):
|
||||
page_range = range(page_range.start + 1, page_range.stop + 1)
|
||||
|
||||
@ -20,6 +20,13 @@ def build_image_info(data: dict) -> dict:
|
||||
image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
|
||||
return image_area_sqrt / page_area_sqrt
|
||||
|
||||
def is_max_image_to_page_quotient_breached(quotient, label):
    """Return True when *quotient* exceeds the maximum allowed for *label*.

    The maximum is looked up in the per-class ("customized") config section
    (RED-5202); labels without an entry fall back to the global maximum.
    """
    default_max_quotient = CONFIG.filters.image_to_page_quotient.max
    customized_entries = CONFIG.filters.image_to_page_quotient.customized.max
    # .get already falls back when the label has no customized entry.
    max_quotient = customized_entries.get(label, default_max_quotient)
    # NOTE(review): falsy check additionally replaces an explicit null/0 entry
    # with the default — presumably a guard against empty YAML values; confirm
    # that 0 is never a legitimate customized maximum.
    max_quotient = max_quotient if max_quotient else default_max_quotient
    return bool(quotient > max_quotient)
|
||||
|
||||
page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
|
||||
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
|
||||
)(data)
|
||||
@ -27,7 +34,9 @@ def build_image_info(data: dict) -> dict:
|
||||
quotient = round(compute_geometric_quotient(), 4)
|
||||
|
||||
min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
|
||||
max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
|
||||
max_image_to_page_quotient_breached = is_max_image_to_page_quotient_breached(
|
||||
quotient, data["classification"]["label"]
|
||||
)
|
||||
min_image_width_to_height_quotient_breached = bool(
|
||||
width / height < CONFIG.filters.image_width_to_height_quotient.min
|
||||
)
|
||||
|
||||
@ -10,7 +10,6 @@ from image_prediction.utils.banner import show_banner
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
def predict(pdf):
|
||||
# Keras service_estimator.predict stalls when service_estimator was loaded in different process;
|
||||
# therefore, we re-load the model (part of the pipeline) every time we process a new document.
|
||||
|
||||
@ -17,7 +17,7 @@ pytest_plugins = [
|
||||
"test.fixtures.parameters",
|
||||
"test.fixtures.pdf",
|
||||
"test.fixtures.target",
|
||||
"test.unit_tests.image_stitching_test"
|
||||
"test.unit_tests.image_stitching_test",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ from funcy import rcompose, chunks
|
||||
|
||||
|
||||
def test_rcompose():
    """funcy.rcompose applies left-to-right: 3 -> 9 -> "9" -> "99"."""
    # Diff-residue duplicate assignment removed: the scrape contained both the
    # old (`x ** 2`) and new (`x**2`) line of the same statement; keep one.
    f = rcompose(lambda x: x**2, str, lambda x: x * 2)
    assert f(3) == "99"
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user