[WIP] Monadic refactoring

This commit is contained in:
Matthias Bisping 2023-02-06 14:36:25 +01:00
parent f645984ea4
commit 4e3168e51c

View File

@ -7,10 +7,13 @@ from itertools import chain, starmap, filterfalse
from operator import itemgetter
from typing import Iterable, Iterator, List, Union
import IPython
import fitz
import numpy as np
from PIL import Image
from funcy import merge, compose, rcompose, keep
from funcy import merge, compose, rcompose, keep, identity
from pymonad.maybe import Maybe, Nothing, Just
from pymonad.tools import curry
from image_prediction.exceptions import InvalidBox, BadXref
from image_prediction.formatter.formatters.enum import EnumFormatter
@ -22,6 +25,8 @@ from image_prediction.utils import get_logger
# Module-level logger shared by the extractor and the helper functions below.
logger = get_logger()
class ParsablePDFImageExtractor(ImageExtractor):
def __init__(self, verbose=False, tolerance=0):
@ -47,13 +52,19 @@ class ParsablePDFImageExtractor(ImageExtractor):
def __process_images_on_page(self, page: fitz.fitz.Page):
    """Yield the stitched image-metadata pairs found on a single page.

    Each valid metadatum of the page is mapped to a ``Maybe`` pair; only the
    ``Just`` results are unwrapped and kept. The surviving pairs are passed
    through ``self.__filter_valid_images`` and then stitched with
    ``self.tolerance`` before being yielded one by one.
    """
    metadata = extract_valid_metadata(self.doc, page)
    maybe_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata)
    # Materialise before clearing the caches; `Nothing` results are dropped here.
    image_metadata_pairs = [pair.value for pair in maybe_pairs if pair.is_just()]
    logger.debug("Extracted image-metadata pairs: %s", image_metadata_pairs)
    clear_caches()

    # TODO: In the future, consider introducing an image validator as a pipeline component rather than doing the
    #  validation here. Invalid images can then be split into a different stream and joined with the intact images
    #  again for the formatting step.
    image_metadata_pairs = list(self.__filter_valid_images(image_metadata_pairs))
    logger.debug("Image-metadata pairs after validation: %s", image_metadata_pairs)
    image_metadata_pairs = stitch_pairs(image_metadata_pairs, tolerance=self.tolerance)

    yield from image_metadata_pairs
@ -89,9 +100,9 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
)(page)
def metadatum_to_image_metadata_pair(doc, metadatum: dict) -> Maybe:
    """Pair a metadatum with the image referenced by its xref.

    Args:
        doc: Document handed through to ``xref_to_maybe_image``.
        metadatum: Image metadata; must contain the ``Info.XREF`` key.

    Returns:
        The ``Maybe`` produced by ``make_maybe_image_metadata_pair`` for the
        (possibly missing) image and the wrapped metadatum.
    """
    maybe_image = xref_to_maybe_image(doc, metadatum[Info.XREF])
    # The metadatum itself is always present, hence it is lifted with `Just`.
    return make_maybe_image_metadata_pair(maybe_image, Just(metadatum))
def add_alpha_channel_info(doc, metadata):
@ -117,15 +128,19 @@ def get_metadata_for_images_on_page(page: fitz.Page):
@lru_cache(maxsize=None)
def xref_to_maybe_image(doc, xref) -> Maybe:
    """Extract the image behind ``xref``, signalling failure with ``Nothing``.

    Results are memoised without a size bound, keyed on ``(doc, xref)``, so
    both arguments must be hashable.

    Returns:
        ``Just(image)`` when ``extract_image`` succeeds, ``Nothing`` when it
        raises ``BadXref``. Any other exception propagates to the caller.
    """
    try:
        return Just(extract_image(doc, xref))
    except BadXref:
        return Nothing
def make_maybe_image_metadata_pair(image: Maybe, metadata: Maybe) -> Maybe:
    """Combine an optional image and optional metadata into an optional pair.

    Args:
        image: ``Just(image)`` or ``Nothing``.
        metadata: ``Just(metadatum)`` or ``Nothing``.

    Returns:
        ``Just(pair)`` built by ``make_image_metadata_pair`` when both inputs
        are ``Just``; ``Nothing`` as soon as either input is ``Nothing``.
    """
    # Applicative style: `map` partially applies the curried constructor to the
    # image, `amap` then applies it to the metadata. Using `bind` here would
    # hand it a function that does not return a monad, and a missing image
    # would end up as `Just(Nothing)`, which `amap` would try to call.
    return image.map(curry(2, make_image_metadata_pair)).amap(metadata)
def make_image_metadata_pair(image: Image.Image, metadatum: dict) -> ImageMetadataPair:
    """Build an ``ImageMetadataPair`` from a plain (unwrapped) image and metadatum.

    This is the pure constructor; wrapping the result in a ``Maybe`` is left
    to the caller.
    """
    return ImageMetadataPair(image, metadatum)
def extract_image(doc, xref) -> Image.Image: