From ee48f141f874e4220e564a092e5089a0ae103793 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Tue, 30 Aug 2022 16:24:58 +0200 Subject: [PATCH] Pull request #29: RED-5107 robustify image service alternative Merge in RR/image-prediction from RED-5107-robustify-image-service-alternative to release/1.2.x Squashed commit of the following: commit 1a8fbeebd3c05f25d69210e53bf6dce67bc2342f Merge: 00ac0d6 c03913e Author: Matthias Bisping Date: Tue Aug 30 16:19:16 2022 +0200 Merge branch 'release/1.2.x' into RED-5107-robustify-image-service-alternative commit 00ac0d61abdd97eb7c2576d2db9e6859b91c9c41 Author: Matthias Bisping Date: Tue Aug 30 16:03:41 2022 +0200 applied black commit 983265f4355253a3a371747b04b1926ff3578fef Author: Matthias Bisping Date: Tue Aug 30 15:59:11 2022 +0200 Added image validation after image extraction to parsable-pdf image extractor. Invalid images are dropped, hence these images will appear as skipped for the service caller. --- .../image_extractor/extractors/parsable.py | 37 ++++++++++--------- .../redai_adapter/model_wrapper.py | 4 ++ src/serve.py | 1 - test/conftest.py | 2 +- test/exploration_tests/funcy_test.py | 2 +- test/unit_tests/image_extractor_test.py | 9 +---- 6 files changed, 26 insertions(+), 29 deletions(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index ff58710..a849b21 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -1,5 +1,7 @@ import atexit import io +import json +import traceback from functools import partial, lru_cache from itertools import chain, starmap, filterfalse from operator import itemgetter, truth @@ -9,6 +11,7 @@ import fitz from PIL import Image from funcy import rcompose, merge, pluck, curry, compose +from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs @@ -30,8 +33,6 @@ class ParsablePDFImageExtractor(ImageExtractor): self.doc: fitz.fitz.Document = None self.verbose = verbose self.tolerance = tolerance - # TODO: Move assignment of input shape for predictor, should not be set here since dependent on predictor - self.input_shape = (224, 224, 3) def extract(self, pdf: bytes, page_range: range = None): self.doc = fitz.Document(stream=pdf) @@ -48,27 +49,27 @@ class ParsablePDFImageExtractor(ImageExtractor): clear_caches() image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata))) + # TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the + # validation here. Invalid images can then be split into a different stream and joined with the intact images + # again for the formatting step. + image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs) image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance) - image_metadata_pairs = filter(truth, map(self.__preprocess, image_metadata_pairs)) yield from image_metadata_pairs - def __preprocess(self, image_metadata_pair): - image, metadata = image_metadata_pair + @staticmethod + def __filter_valid_images(image_metadata_pairs): + def validate(image: Image.Image, metadata: dict): + try: + # TODO: stand-in heuristic for testing if image is valid => find cleaner solution + image.resize((100, 100)).convert("RGB") + return ImageMetadataPair(image, metadata) + except (OSError, Exception) as err: + metadata = json.dumps(EnumFormatter()(metadata), indent=2) + logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}") + return None - try: - image = self.__resize_and_convert(image) - image_metadata_pair = ImageMetadataPair(image, metadata) - except Exception as err: - logger.warn( - f"{err}: couldn't preprocess image [ page_idx: {metadata[Info.PAGE_IDX]}, x1: {metadata[Info.X1]}, y1: {metadata[Info.Y1]}, width: {metadata[Info.WIDTH]}, height: {metadata[Info.HEIGHT]} ]" - ) - image_metadata_pair = None - - return image_metadata_pair - - def __resize_and_convert(self, image): - return image.resize(self.input_shape[:-1]).convert("RGB") + return filter(truth, starmap(validate, image_metadata_pairs)) def extract_pages(doc, page_range): diff --git a/image_prediction/redai_adapter/model_wrapper.py b/image_prediction/redai_adapter/model_wrapper.py index 2e35c1a..776931e 100644 --- a/image_prediction/redai_adapter/model_wrapper.py +++ b/image_prediction/redai_adapter/model_wrapper.py @@ -27,7 +27,11 @@ class ModelWrapper(abc.ABC): def __images_to_tensor(images): return np.array(list(map(tf.keras.preprocessing.image.img_to_array, images))) + def __resize_and_convert(self, image): + return image.resize(self.input_shape[:-1]).convert("RGB") + def prep_images(self, images): + images = map(self.__resize_and_convert, images) tensor = self.__images_to_tensor(images) tensor = self.__preprocess_tensor(tensor) diff --git a/src/serve.py b/src/serve.py index 37a7906..16ac4ea 100644 --- a/src/serve.py +++ b/src/serve.py @@ -10,7 +10,6 @@ from image_prediction.utils.banner import show_banner def main(): - def predict(pdf): # Keras service_estimator.predict stalls when service_estimator was loaded in different process; # therefore, we re-load the model (part of the pipeline) every time we process a new document. diff --git a/test/conftest.py b/test/conftest.py index 65298b0..807551b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -17,7 +17,7 @@ pytest_plugins = [ "test.fixtures.parameters", "test.fixtures.pdf", "test.fixtures.target", - "test.unit_tests.image_stitching_test" + "test.unit_tests.image_stitching_test", ] diff --git a/test/exploration_tests/funcy_test.py b/test/exploration_tests/funcy_test.py index 30c2cef..fc32538 100644 --- a/test/exploration_tests/funcy_test.py +++ b/test/exploration_tests/funcy_test.py @@ -3,7 +3,7 @@ from funcy import rcompose, chunks def test_rcompose(): - f = rcompose(lambda x: x ** 2, str, lambda x: x * 2) + f = rcompose(lambda x: x**2, str, lambda x: x * 2) assert f(3) == "99" diff --git a/test/unit_tests/image_extractor_test.py b/test/unit_tests/image_extractor_test.py index bf9dfd0..e52b2b5 100644 --- a/test/unit_tests/image_extractor_test.py +++ b/test/unit_tests/image_extractor_test.py @@ -5,7 +5,7 @@ import fitz import fpdf import pytest from PIL import Image -from funcy import first, rest, lmap +from funcy import first, rest from image_prediction.extraction import extract_images_from_pdf from image_prediction.image_extractor.extractor import ImageMetadataPair @@ -27,13 +27,6 @@ def test_image_extractor_mock(image_extractor, images): @pytest.mark.parametrize("alpha", [False, True]) def test_parsable_pdf_image_extractor(image_extractor, pdf, images, metadata, input_size, alpha): images_extracted, metadata_extracted = map(list, extract_images_from_pdf(pdf, image_extractor)) - - # TODO: move resize operation to expected images fixture - def __resize_and_convert(image): - return image.resize((224, 224)).convert("RGB") - - images = lmap(__resize_and_convert, images) - if not alpha: assert image_sets_equal(images_extracted, images) assert metadata_equal(metadata_extracted, metadata)