[WIP] Either-refactoring

Replace Maybe with Either so that error information or metadata, which
would otherwise be swallowed by Nothing, can be passed on.
This commit is contained in:
Matthias Bisping 2023-02-06 16:57:31 +01:00
parent 89989543d8
commit 3235a857f6

View File

@ -8,9 +8,9 @@ from typing import List, Union
import fitz import fitz
import numpy as np import numpy as np
from PIL import Image from PIL import Image
from funcy import merge, compose, rcompose, keep from funcy import merge, compose, rcompose, keep, lfilter
from pymonad.maybe import Maybe, Nothing, Just from pymonad.either import Right, Left, Either
from pymonad.tools import curry from pymonad.tools import curry, identity
from image_prediction.exceptions import InvalidBox, BadXref from image_prediction.exceptions import InvalidBox, BadXref
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
@ -48,7 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
metadata = extract_valid_metadata(self.doc, page) metadata = extract_valid_metadata(self.doc, page)
maybe_image_metadata_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata) maybe_image_metadata_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata)
image_metadata_pairs = [pair.value for pair in maybe_image_metadata_pairs if pair.is_just()] image_metadata_pairs = keep(right, maybe_image_metadata_pairs)
clear_caches() clear_caches()
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance) image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
@ -56,6 +56,12 @@ class ParsablePDFImageExtractor(ImageExtractor):
yield from image_metadata_pairs yield from image_metadata_pairs
def right(pair: Either):
    """Unwrap a Right; log and discard a Left.

    Returns the value wrapped by ``pair`` when it is a Right. When it is a
    Left, a warning containing the wrapped value (the failure reason) is
    logged and ``None`` is returned.

    NOTE(review): the caller passes this to ``funcy.keep``, which presumably
    drops the ``None`` results so only successfully extracted pairs remain.
    """
    # Both branches of `either` use `identity`, so this simply extracts the
    # wrapped value regardless of which side `pair` is on.
    unwrapped = pair.either(identity, identity)
    if not pair.is_right():
        logger.warning(f"Skipping bad image. reason: {unwrapped}")
        return None
    return unwrapped
def extract_pages(doc, page_range): def extract_pages(doc, page_range):
page_range = range(page_range.start + 1, page_range.stop + 1) page_range = range(page_range.start + 1, page_range.stop + 1)
pages = map(doc.load_page, page_range) pages = map(doc.load_page, page_range)
@ -63,14 +69,14 @@ def extract_pages(doc, page_range):
yield from pages yield from pages
def validate_image(image: Image.Image) -> Either:
    """Check that an image can be processed and wrap the outcome in an Either.

    Args:
        image: The PIL image to validate.

    Returns:
        ``Right(image)`` if the image could be resized and converted to RGB,
        otherwise ``Left("Invalid image.")`` after logging a warning.
    """
    try:
        # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
        image.resize((100, 100)).convert("RGB")
    except Exception:
        # `OSError` is a subclass of `Exception`, so catching `Exception`
        # alone covers everything the former `(OSError, Exception)` tuple did.
        logger.warning("Invalid image encountered.")
        return Left("Invalid image.")
    return Right(image)
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page): def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
@ -82,9 +88,9 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
)(page) )(page)
def metadatum_to_image_metadata_pair(doc, metadatum: dict) -> Either:
    """Build an Either-wrapped image/metadata pair for one metadatum.

    The image is looked up in ``doc`` via the xref stored under ``Info.XREF``
    in ``metadatum``, then passed through ``validate_image``. The result is
    combined with the metadatum by ``make_maybe_image_metadata_pair``.

    Args:
        doc: The document the image is extracted from.
        metadatum: Image metadata; must contain the key ``Info.XREF``.

    Returns:
        The Either produced by ``make_maybe_image_metadata_pair``.
    """
    xref = metadatum[Info.XREF]
    # `bind` only runs validation when extraction yielded a Right.
    image_or_error = xref_to_maybe_image(doc, xref).bind(validate_image)
    return make_maybe_image_metadata_pair(image_or_error, Right(metadatum))
@ -111,21 +117,22 @@ def get_metadata_for_images_on_page(page: fitz.Page):
@lru_cache(maxsize=None)
def xref_to_maybe_image(doc, xref) -> Either:
    """Extract the image referenced by ``xref`` from ``doc`` as an Either.

    Results are memoized without a size bound, keyed on ``(doc, xref)``.

    Returns:
        ``Right(image)`` when ``extract_image`` succeeds, or
        ``Left("Bad xref.")`` when it raises ``BadXref``.
    """
    try:
        extracted = extract_image(doc, xref)
    except BadXref:
        return Left("Bad xref.")
    return Right(extracted)
def make_maybe_image_metadata_pair(image: Either, metadata: Either):
    """Combine an Either image and an Either metadatum into an Either pair.

    The curried ``make_image_metadata_pair`` constructor is wrapped in a
    ``Right`` and applied to both arguments with ``amap``.

    NOTE(review): presumably a Left in either argument propagates instead of
    a pair being built - confirm against pymonad's ``Either.amap``.
    """
    # haskell.org/tutorial/monads.html
    # (>>) :: m a -> m b -> m b
    # TODO: Somehow metadata needs to be added to Lefts for logging the reference to the invalid image
    lifted_constructor = Right(make_image_metadata_pair)
    with_image = lifted_constructor.amap(image)
    return with_image.amap(metadata)
@curry(2)
def make_image_metadata_pair(image: Image.Image, metadatum: dict) -> ImageMetadataPair:
    """Bundle an image and its metadatum into an ``ImageMetadataPair``.

    Curried over its two arguments so that it can be applied one argument at
    a time.
    """
    pair = ImageMetadataPair(image, metadatum)
    return pair