RED-5202 port temporary broken image handling so the hotfix won't be lost by upgrading the service. A proper solution is still desirable (see RED-5148)

This commit is contained in:
Julius Unverfehrt 2022-09-12 08:55:05 +02:00
parent 5f99622646
commit 4584e7ba66

View File

@ -1,14 +1,17 @@
import atexit
import io
import json
import traceback
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter
from operator import itemgetter, truth
from typing import List
import fitz
from PIL import Image
from funcy import rcompose, merge, pluck, curry, compose
from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
@ -47,10 +50,28 @@ class ParsablePDFImageExtractor(ImageExtractor):
clear_caches()
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
# validation here. Invalid images can then be split into a different stream and joined with the intact images
# again for the formatting step.
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
yield from image_metadata_pairs
@staticmethod
def __filter_valid_images(image_metadata_pairs):
def validate(image: Image.Image, metadata: dict):
try:
# TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
image.resize((100, 100)).convert("RGB")
return ImageMetadataPair(image, metadata)
except (OSError, Exception) as err:
metadata = json.dumps(EnumFormatter()(metadata), indent=2)
logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
return None
return filter(truth, starmap(validate, image_metadata_pairs))
def extract_pages(doc, page_range):
page_range = range(page_range.start + 1, page_range.stop + 1)