Compare commits

...

4 Commits

Author SHA1 Message Date
Julius Unverfehrt
2a62c4fca2 Pull request #30: RED-5202 add per class customizable max_rel_image_size configuration.
Merge in RR/image-prediction from RED-5202-add-class-specific-image-size-heuristic to release/1.2.x

* commit 'f48455c496a12d99bb8e9e015d61f661bd94519b':
  RED-5202 add per class customizable max_rel_image_size configuration. If value isn't set or isn't defined at all for a class, the default value is used (change is backwards compatible). Custom value enabled for signatures, set to 0.4 relative image size if not overwritten by ENV (set the ENV to null to use the default value, or define a custom value; the value is 0.4 if the ENV is not found).
2022-09-09 14:11:17 +02:00
Julius Unverfehrt
f48455c496 RED-5202 add per class customizable max_rel_image_size configuration. If value isn't set or isn't defined at all for a class, the default value is used (change is backwards compatible). Custom value enabled for signatures, set to 0.4 relative image size if not overwritten by ENV (set the ENV to null to use the default value, or define a custom value; the value is 0.4 if the ENV is not found). 2022-09-09 13:59:54 +02:00
Matthias Bisping
ee48f141f8 Pull request #29: RED-5107 robustify image service alternative
Merge in RR/image-prediction from RED-5107-robustify-image-service-alternative to release/1.2.x

Squashed commit of the following:

commit 1a8fbeebd3c05f25d69210e53bf6dce67bc2342f
Merge: 00ac0d6 c03913e
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Aug 30 16:19:16 2022 +0200

    Merge branch 'release/1.2.x' into RED-5107-robustify-image-service-alternative

commit 00ac0d61abdd97eb7c2576d2db9e6859b91c9c41
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Aug 30 16:03:41 2022 +0200

    applied black

commit 983265f4355253a3a371747b04b1926ff3578fef
Author: Matthias Bisping <matthias.bisping@axbit.com>
Date:   Tue Aug 30 15:59:11 2022 +0200

    Added image validation after image extraction to parsable-pdf image extractor. Invalid images are dropped, hence these images will appear as skipped for the service caller.
2022-08-30 16:24:58 +02:00
Julius Unverfehrt
c03913e088 Pull request #26: RED-5107: move image normalization for predictor to image extraction step to be able to properly catch exception thrown from this step
Merge in RR/image-prediction from RED-5107-hotfix to release/3.4.1

Squashed commit of the following:

commit b7b99074054e67201537efc2f0a5b96f29bd1684
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Aug 29 12:57:50 2022 +0200

    RED-5107: move image normalization for predictor to image extraction step to be able to properly catch exception thrown from this step
2022-08-29 13:01:42 +02:00
7 changed files with 44 additions and 5 deletions

View File

@ -18,6 +18,10 @@ filters:
image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
customized: # Customized settings per class (RED-5202)
max:
signature: $MAX_REL_SIGNATURE_SIZE|0.4
image_width_to_height_quotient: # Image width to height ratio
min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible

View File

@ -15,6 +15,12 @@ class DotIndexable:
# Store the wrapped object; the `get` and `__getattr__` methods shown below read it back as `self.x`.
def __init__(self, x):
self.x = x
def get(self, item, default=None):
    """Look up ``item`` in the wrapped object, falling back to ``default``.

    The lookup is delegated to ``_get_item_and_maybe_make_dotindexable``;
    if that raises ``KeyError``, ``default`` is returned instead. Any other
    exception propagates to the caller.
    """
    try:
        result = _get_item_and_maybe_make_dotindexable(self.x, item)
    except KeyError:
        result = default
    return result
# Attribute-style access: Python calls `__getattr__` only when normal attribute lookup fails,
# so `obj.name` is resolved by looking `name` up in the wrapped object `self.x`.
# NOTE(review): exceptions from the helper propagate unchanged. Since `get` catches KeyError
# from the same helper, a missing entry presumably surfaces here as KeyError rather than
# AttributeError — confirm before relying on `hasattr` / `getattr(obj, name, default)`.
def __getattr__(self, item):
return _get_item_and_maybe_make_dotindexable(self.x, item)

View File

@ -1,14 +1,17 @@
import atexit
import io
import json
import traceback
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter
from operator import itemgetter, truth
from typing import List
import fitz
from PIL import Image
from funcy import rcompose, merge, pluck, curry, compose
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
@ -46,10 +49,28 @@ class ParsablePDFImageExtractor(ImageExtractor):
clear_caches()
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
# TODO: In the future, consider introducing an image validator as a pipeline component rather than doing the
# validation here. Invalid images can then be split into a different stream and joined with the intact images
# again for the formatting step.
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
yield from image_metadata_pairs
@staticmethod
def __filter_valid_images(image_metadata_pairs):
    """Lazily drop image/metadata pairs whose image cannot be processed.

    Each pair is unpacked into ``(image, metadata)``. The image is probed by
    resizing it and converting it to RGB; if that raises, a warning with the
    formatted metadata and the traceback is logged and the pair is dropped.

    Args:
        image_metadata_pairs: Iterable of ``(image, metadata)`` pairs.

    Returns:
        A lazy ``filter`` iterator over ``ImageMetadataPair`` objects for the
        images that passed the probe.
    """

    def validate(image: Image.Image, metadata: dict):
        try:
            # TODO: stand-in heuristic for testing if image is valid => find cleaner solution
            image.resize((100, 100)).convert("RGB")
        except Exception:
            # Deliberately broad: any failure while probing marks the image as invalid.
            # (OSError is already a subclass of Exception, so listing both was redundant.)
            metadata_json = json.dumps(EnumFormatter()(metadata), indent=2)
            logger.warning(
                f"Invalid image encountered. Image metadata:\n{metadata_json}\n\n{traceback.format_exc()}"
            )
            return None
        else:
            return ImageMetadataPair(image, metadata)

    # `validate` yields None for invalid images; `truth` filters those out.
    return filter(truth, starmap(validate, image_metadata_pairs))
def extract_pages(doc, page_range):
page_range = range(page_range.start + 1, page_range.stop + 1)

View File

@ -20,6 +20,13 @@ def build_image_info(data: dict) -> dict:
image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
return image_area_sqrt / page_area_sqrt
def is_max_image_to_page_quotient_breached(quotient, label):
    """Tell whether ``quotient`` exceeds the maximum allowed for ``label``.

    The limit is the per-class entry under
    ``CONFIG.filters.image_to_page_quotient.customized.max`` when one exists
    for ``label`` and is truthy; otherwise the general
    ``CONFIG.filters.image_to_page_quotient.max`` applies.
    """
    quotient_config = CONFIG.filters.image_to_page_quotient
    fallback_limit = quotient_config.max
    # A missing or falsy (e.g. null) per-class value means "use the default".
    limit = quotient_config.customized.max.get(label) or fallback_limit
    return bool(quotient > limit)
page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
)(data)
@ -27,7 +34,9 @@ def build_image_info(data: dict) -> dict:
quotient = round(compute_geometric_quotient(), 4)
min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
max_image_to_page_quotient_breached = is_max_image_to_page_quotient_breached(
quotient, data["classification"]["label"]
)
min_image_width_to_height_quotient_breached = bool(
width / height < CONFIG.filters.image_width_to_height_quotient.min
)

View File

@ -10,7 +10,6 @@ from image_prediction.utils.banner import show_banner
def main():
def predict(pdf):
# Keras service_estimator.predict stalls when service_estimator was loaded in different process;
# therefore, we re-load the model (part of the pipeline) every time we process a new document.

View File

@ -17,7 +17,7 @@ pytest_plugins = [
"test.fixtures.parameters",
"test.fixtures.pdf",
"test.fixtures.target",
"test.unit_tests.image_stitching_test"
"test.unit_tests.image_stitching_test",
]

View File

@ -3,7 +3,7 @@ from funcy import rcompose, chunks
def test_rcompose():
f = rcompose(lambda x: x ** 2, str, lambda x: x * 2)
f = rcompose(lambda x: x**2, str, lambda x: x * 2)
assert f(3) == "99"