diff --git a/image_prediction/image_extractor/extractors/parsable.py b/image_prediction/image_extractor/extractors/parsable.py index d115059..784a54f 100644 --- a/image_prediction/image_extractor/extractors/parsable.py +++ b/image_prediction/image_extractor/extractors/parsable.py @@ -8,9 +8,9 @@ from typing import List, Union import fitz import numpy as np from PIL import Image -from funcy import merge, compose, rcompose, keep -from pymonad.maybe import Maybe, Nothing, Just -from pymonad.tools import curry +from funcy import merge, compose, rcompose, keep, lfilter +from pymonad.either import Right, Left, Either +from pymonad.tools import curry, identity from image_prediction.exceptions import InvalidBox, BadXref from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair @@ -48,7 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor): metadata = extract_valid_metadata(self.doc, page) maybe_image_metadata_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata) - image_metadata_pairs = [pair.value for pair in maybe_image_metadata_pairs if pair.is_just()] + image_metadata_pairs = keep(right, maybe_image_metadata_pairs) clear_caches() image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance) @@ -56,6 +56,12 @@ class ParsablePDFImageExtractor(ImageExtractor): yield from image_metadata_pairs +def right(pair: Either): + if pair.is_right(): + return pair.either(identity, identity) + logger.warning(f"Skipping bad image. reason: {pair.either(identity, identity)}") + + def extract_pages(doc, page_range): page_range = range(page_range.start + 1, page_range.stop + 1) pages = map(doc.load_page, page_range) @@ -63,14 +69,14 @@ def extract_pages(doc, page_range): yield from pages -def validate_image(image: Image.Image) -> Maybe: +def validate_image(image: Image.Image) -> Either: try: # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148) image.resize((100, 100)).convert("RGB") - return Just(image) + return Right(image) except (OSError, Exception): logger.warning(f"Invalid image encountered.") - return Nothing + return Left("Invalid image.") def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): @@ -82,9 +88,9 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): )(page) -def metadatum_to_image_metadata_pair(doc, metadatum: dict) -> Maybe: +def metadatum_to_image_metadata_pair(doc, metadatum: dict) -> Either: maybe_image = xref_to_maybe_image(doc, metadatum[Info.XREF]).bind(validate_image) - maybe_image_metadata_pair = make_maybe_image_metadata_pair(maybe_image, Just(metadatum)) + maybe_image_metadata_pair = make_maybe_image_metadata_pair(maybe_image, Right(metadatum)) return maybe_image_metadata_pair @@ -111,21 +117,22 @@ def get_metadata_for_images_on_page(page: fitz.Page): @lru_cache(maxsize=None) -def xref_to_maybe_image(doc, xref) -> Maybe: +def xref_to_maybe_image(doc, xref) -> Either: try: - return Just(extract_image(doc, xref)) + return Right(extract_image(doc, xref)) except BadXref: - return Nothing + return Left("Bad xref.") -def make_maybe_image_metadata_pair(image: Maybe, metadata: Maybe): +def make_maybe_image_metadata_pair(image: Either, metadata: Either): # haskell.org/tutorial/monads.html # (>>) :: m a -> m b -> m b - return Just(make_image_metadata_pair).amap(image).amap(metadata) + return Right(make_image_metadata_pair).amap(image).amap(metadata) + # TODO: Somehow metadata needs to be added to Lefts for logging the reference to the invalid image @curry(2) -def make_image_metadata_pair(image: Image.Image, metadatum: dict) -> Just: +def make_image_metadata_pair(image: Image.Image, metadatum: dict) -> ImageMetadataPair: return ImageMetadataPair(image, metadatum)