[WIP] Monadic refactoring

This commit is contained in:
Matthias Bisping 2023-02-06 14:36:25 +01:00
parent f645984ea4
commit 4e3168e51c

View File

@ -7,10 +7,13 @@ from itertools import chain, starmap, filterfalse
from operator import itemgetter from operator import itemgetter
from typing import Iterable, Iterator, List, Union from typing import Iterable, Iterator, List, Union
import IPython
import fitz import fitz
import numpy as np import numpy as np
from PIL import Image from PIL import Image
from funcy import merge, compose, rcompose, keep from funcy import merge, compose, rcompose, keep, identity
from pymonad.maybe import Maybe, Nothing, Just
from pymonad.tools import curry
from image_prediction.exceptions import InvalidBox, BadXref from image_prediction.exceptions import InvalidBox, BadXref
from image_prediction.formatter.formatters.enum import EnumFormatter from image_prediction.formatter.formatters.enum import EnumFormatter
@ -22,6 +25,8 @@ from image_prediction.utils import get_logger
logger = get_logger() logger = get_logger()
Maybe
class ParsablePDFImageExtractor(ImageExtractor): class ParsablePDFImageExtractor(ImageExtractor):
def __init__(self, verbose=False, tolerance=0): def __init__(self, verbose=False, tolerance=0):
@ -47,13 +52,19 @@ class ParsablePDFImageExtractor(ImageExtractor):
def __process_images_on_page(self, page: fitz.fitz.Page): def __process_images_on_page(self, page: fitz.fitz.Page):
metadata = extract_valid_metadata(self.doc, page) metadata = extract_valid_metadata(self.doc, page)
image_metadata_pairs = keep(partial(metadatum_to_image_metadata_pair, self.doc), metadata)
maybe_image_metadata_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata)
image_metadata_pairs = [pair.value for pair in maybe_image_metadata_pairs if pair.is_just()]
print(image_metadata_pairs)
clear_caches() clear_caches()
# TODO: In the future, consider introducing an image validator as a pipeline component rather than doing the # TODO: In the future, consider introducing an image validator as a pipeline component rather than doing the
# validation here. Invalid images can then be split into a different stream and joined with the intact images # validation here. Invalid images can then be split into a different stream and joined with the intact images
# again for the formatting step. # again for the formatting step.
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs) image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
image_metadata_pairs = list(image_metadata_pairs)
print(image_metadata_pairs)
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance) image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
yield from image_metadata_pairs yield from image_metadata_pairs
@ -89,9 +100,9 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
)(page) )(page)
def metadatum_to_image_metadata_pair(doc, metadatum: dict): def metadatum_to_image_metadata_pair(doc, metadatum: dict) -> Maybe:
maybe_image = xref_to_maybe_image(doc, metadatum[Info.XREF]) maybe_image = xref_to_maybe_image(doc, metadatum[Info.XREF])
return make_maybe_image_metadata_pair(maybe_image, metadatum) return make_maybe_image_metadata_pair(maybe_image, Just(metadatum))
def add_alpha_channel_info(doc, metadata): def add_alpha_channel_info(doc, metadata):
@ -117,15 +128,19 @@ def get_metadata_for_images_on_page(page: fitz.Page):
@lru_cache(maxsize=None) @lru_cache(maxsize=None)
def xref_to_maybe_image(doc, xref) -> Union[Image.Image, None]: def xref_to_maybe_image(doc, xref) -> Maybe:
try: try:
return extract_image(doc, xref) return Just(extract_image(doc, xref))
except BadXref: except BadXref:
return None return Nothing
def make_maybe_image_metadata_pair(image, metadata): def make_maybe_image_metadata_pair(image: Maybe, metadata: Maybe):
return ImageMetadataPair(image, metadata) if image and metadata else None return Just(image.bind(curry(2, make_image_metadata_pair))).amap(metadata)
def make_image_metadata_pair(image: Image.Image, metadatum: dict) -> Just:
return ImageMetadataPair(image, metadatum)
def extract_image(doc, xref) -> Image.Image: def extract_image(doc, xref) -> Image.Image: