From 983265f4355253a3a371747b04b1926ff3578fef Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Tue, 30 Aug 2022 15:59:11 +0200 Subject: [PATCH] Added image validation after image extraction to parsable-pdf image extractor. Invalid images are dropped, hence these images will appear as skipped for the service caller. --- .../image_extractor/extractors/parsable.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index 9fe5b46..a849b21 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -1,14 +1,17 @@ import atexit import io +import json +import traceback from functools import partial, lru_cache from itertools import chain, starmap, filterfalse -from operator import itemgetter +from operator import itemgetter, truth from typing import List import fitz from PIL import Image from funcy import rcompose, merge, pluck, curry, compose +from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.info import Info from image_prediction.stitching.stitching import stitch_pairs @@ -46,10 +49,28 @@ class ParsablePDFImageExtractor(ImageExtractor): clear_caches() image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata))) + # TODO: In the future, consider introducing an image validator as a pipeline component rather than doing the + # validation here. Invalid images can then be split into a different stream and joined with the intact images + # again for the formatting step. 
+        image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
         image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
 
         yield from image_metadata_pairs
 
+    @staticmethod
+    def __filter_valid_images(image_metadata_pairs):
+        """Lazily drop every pair whose image fails a basic processing check.
+
+        An image counts as valid when it can be resized and converted to RGB without raising. A failing
+        image is logged as a warning (its metadata as JSON plus the traceback) and left out of the result.
+
+        Args:
+            image_metadata_pairs: Iterable of two-item (image, metadata) pairs.
+
+        Returns:
+            A lazy iterator over fresh ImageMetadataPair objects for the images that passed, in input order.
+        """
+
+        def validate(image: Image.Image, metadata: dict):
+            """Return the pair rebuilt as an ImageMetadataPair, or None if the image is unusable."""
+            try:
+                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution
+                image.resize((100, 100)).convert("RGB")
+            except Exception:
+                # Deliberately broad: whatever makes the image unprocessable, it is skipped instead of
+                # aborting the extraction. (OSError is already an Exception, so it needs no separate entry.)
+                # The JSON text gets its own name so the `metadata` argument is not overwritten.
+                metadata_json = json.dumps(EnumFormatter()(metadata), indent=2)
+                logger.warning(
+                    f"Invalid image encountered. Image metadata:\n{metadata_json}\n\n{traceback.format_exc()}"
+                )
+                return None
+            else:
+                return ImageMetadataPair(image, metadata)
+
+        return filter(truth, starmap(validate, image_metadata_pairs))
+
 
 def extract_pages(doc, page_range):
     page_range = range(page_range.start + 1, page_range.stop + 1)