[WIP] Either-refactoring
Replace Maybe with Either so that error information and metadata, which would otherwise be swallowed by Nothing, can be passed along.
This commit is contained in:
parent
89989543d8
commit
3235a857f6
@ -8,9 +8,9 @@ from typing import List, Union
|
||||
import fitz
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from funcy import merge, compose, rcompose, keep
|
||||
from pymonad.maybe import Maybe, Nothing, Just
|
||||
from pymonad.tools import curry
|
||||
from funcy import merge, compose, rcompose, keep, lfilter
|
||||
from pymonad.either import Right, Left, Either
|
||||
from pymonad.tools import curry, identity
|
||||
|
||||
from image_prediction.exceptions import InvalidBox, BadXref
|
||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||
@ -48,7 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
metadata = extract_valid_metadata(self.doc, page)
|
||||
|
||||
maybe_image_metadata_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata)
|
||||
image_metadata_pairs = [pair.value for pair in maybe_image_metadata_pairs if pair.is_just()]
|
||||
image_metadata_pairs = keep(right, maybe_image_metadata_pairs)
|
||||
clear_caches()
|
||||
|
||||
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||
@ -56,6 +56,12 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
||||
yield from image_metadata_pairs
|
||||
|
||||
|
||||
def right(pair: Either):
    """Unwrap a Right; log and drop a Left.

    Returns the value carried by `pair` when it is a Right. For a Left, a
    warning containing the Left's payload is logged and None is returned.
    The visible call site passes this function to `funcy.keep`, which is
    presumably what filters those None results out.
    """
    if not pair.is_right():
        # With `identity` on both sides, `either` simply hands back the payload.
        reason = pair.either(identity, identity)
        logger.warning(f"Skipping bad image. reason: {reason}")
        return None
    return pair.either(identity, identity)
|
||||
|
||||
|
||||
def extract_pages(doc, page_range):
|
||||
page_range = range(page_range.start + 1, page_range.stop + 1)
|
||||
pages = map(doc.load_page, page_range)
|
||||
@ -63,14 +69,14 @@ def extract_pages(doc, page_range):
|
||||
yield from pages
|
||||
|
||||
|
||||
def validate_image(image: Image.Image) -> Either:
    """Check whether `image` can be processed and wrap the outcome in an Either.

    Returns:
        Right(image) if the probe below succeeds, otherwise
        Left("Invalid image.") after logging a warning.
    """
    try:
        # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
        image.resize((100, 100)).convert("RGB")
    except Exception:  # OSError is a subclass of Exception, so this covers both
        logger.warning("Invalid image encountered.")
        return Left("Invalid image.")
    return Right(image)
|
||||
|
||||
|
||||
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
|
||||
@ -82,9 +88,9 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
|
||||
)(page)
|
||||
|
||||
|
||||
def metadatum_to_image_metadata_pair(doc, metadatum: dict) -> Either:
    """Build an Either-wrapped image/metadata pair for one metadatum.

    The image is looked up via the xref stored under `Info.XREF`, then passed
    through `validate_image`; the result is combined with the metadatum by
    `make_maybe_image_metadata_pair`.
    """
    xref = metadatum[Info.XREF]
    validated_image = xref_to_maybe_image(doc, xref).bind(validate_image)
    return make_maybe_image_metadata_pair(validated_image, Right(metadatum))
|
||||
|
||||
|
||||
@ -111,21 +117,22 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def xref_to_maybe_image(doc, xref) -> Either:
    """Extract the image referenced by `xref` from `doc`, wrapped in an Either.

    Returns Right(image) on success and Left("Bad xref.") when
    `extract_image` raises BadXref. Results are memoised per (doc, xref)
    by the unbounded `lru_cache`.
    """
    try:
        image = extract_image(doc, xref)
    except BadXref:
        return Left("Bad xref.")
    return Right(image)
|
||||
|
||||
|
||||
def make_maybe_image_metadata_pair(image: Either, metadata: Either):
    """Combine an Either image and an Either metadata into an Either pair.

    The curried `make_image_metadata_pair` is lifted into a Right and applied
    to both arguments with `amap` (applicative application; see the pymonad
    documentation for how Lefts propagate).
    """
    # Background reading: haskell.org/tutorial/monads.html
    # TODO: Somehow metadata needs to be added to Lefts for logging the reference to the invalid image
    lifted_constructor = Right(make_image_metadata_pair)
    with_image = lifted_constructor.amap(image)
    return with_image.amap(metadata)
|
||||
|
||||
|
||||
@curry(2)
def make_image_metadata_pair(image: Image.Image, metadatum: dict) -> ImageMetadataPair:
    """Bundle an image and its metadatum into an ImageMetadataPair.

    Curried over two arguments so it can be applied one argument at a time.
    """
    pair = ImageMetadataPair(image, metadatum)
    return pair
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user