RED-5202: Port the temporary broken-image handling so that the hotfix is not lost when the service is upgraded. A proper solution is still desirable (see RED-5148).
This commit is contained in:
parent
5f99622646
commit
4584e7ba66
@ -1,14 +1,17 @@
|
||||
import atexit
|
||||
import io
|
||||
import json
|
||||
import traceback
|
||||
from functools import partial, lru_cache
|
||||
from itertools import chain, starmap, filterfalse
|
||||
from operator import itemgetter
|
||||
from operator import itemgetter, truth
|
||||
from typing import List
|
||||
|
||||
import fitz
|
||||
from PIL import Image
|
||||
from funcy import rcompose, merge, pluck, curry, compose
|
||||
|
||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
from image_prediction.info import Info
|
||||
from image_prediction.stitching.stitching import stitch_pairs
|
||||
@ -47,10 +50,28 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
clear_caches()
|
||||
|
||||
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
||||
# TODO: In the future, consider introducing an image validator as a pipeline component rather than doing the
|
||||
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
||||
# again for the formatting step.
|
||||
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
||||
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||
|
||||
yield from image_metadata_pairs
|
||||
|
||||
@staticmethod
def __filter_valid_images(image_metadata_pairs):
    """Lazily drop image/metadata pairs whose image fails a processing probe.

    Each pair is probed by resizing its image to 100x100 and converting the
    result to RGB. Pairs that pass are re-wrapped as ``ImageMetadataPair``.
    Pairs for which the probe raises are reported via ``logger.warning``
    (formatted metadata plus traceback) and left out of the result.

    Args:
        image_metadata_pairs: Iterable of ``(image, metadata)`` pairs; each
            pair is unpacked positionally into the validator.

    Returns:
        A lazy ``filter`` iterator over the pairs that passed the probe.
    """

    def validate(image: Image.Image, metadata: dict):
        try:
            # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
            image.resize((100, 100)).convert("RGB")
            return ImageMetadataPair(image, metadata)
        except Exception:
            # `Exception` already covers `OSError`; listing both was redundant.
            # Use a separate name so the `metadata` dict is not shadowed by its
            # JSON rendering.
            formatted_metadata = json.dumps(EnumFormatter()(metadata), indent=2)
            logger.warning(
                f"Invalid image encountered. Image metadata:\n{formatted_metadata}\n\n{traceback.format_exc()}"
            )
            return None

    # `validate` returns None for rejected pairs; `truth` filters those out.
    return filter(truth, starmap(validate, image_metadata_pairs))
|
||||
|
||||
|
||||
def extract_pages(doc, page_range):
|
||||
page_range = range(page_range.start + 1, page_range.stop + 1)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user