@staticmethod
def __filter_valid_images(image_metadata_pairs):
    """Lazily drop image/metadata pairs whose image fails a basic processing check.

    Each pair is unpacked into ``(image, metadata)``. The image is resized and
    converted to RGB as a stand-in validity heuristic; if that raises, a warning
    containing the metadata and the traceback is logged and the pair is skipped.

    Args:
        image_metadata_pairs: Iterable of two-element ``(image, metadata)`` pairs.

    Returns:
        A lazy iterator over ``ImageMetadataPair`` objects for the images that
        passed the check, in their original order.
    """

    def describe(metadata: dict) -> str:
        # Rendering the metadata is only for the log message; a value that cannot
        # be serialized must not turn a skipped image into a crashed extraction.
        try:
            return json.dumps(EnumFormatter()(metadata), indent=2)
        except (TypeError, ValueError):
            return repr(metadata)

    def validate(image: Image.Image, metadata: dict):
        try:
            # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
            image.resize((100, 100)).convert("RGB")
        except Exception:
            # Capture the traceback first, while the image error is the active exception.
            trace = traceback.format_exc()
            rendered_metadata = describe(metadata)
            logger.warning(f"Invalid image encountered. Image metadata:\n{rendered_metadata}\n\n{trace}")
            return None
        return ImageMetadataPair(image, metadata)

    # validate() yields None for rejected images; filter(truth, ...) removes those.
    return filter(truth, starmap(validate, image_metadata_pairs))