[WIP] Monadic refactoring
This commit is contained in:
parent
f645984ea4
commit
4e3168e51c
@ -7,10 +7,13 @@ from itertools import chain, starmap, filterfalse
|
|||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from typing import Iterable, Iterator, List, Union
|
from typing import Iterable, Iterator, List, Union
|
||||||
|
|
||||||
|
import IPython
|
||||||
import fitz
|
import fitz
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from funcy import merge, compose, rcompose, keep
|
from funcy import merge, compose, rcompose, keep, identity
|
||||||
|
from pymonad.maybe import Maybe, Nothing, Just
|
||||||
|
from pymonad.tools import curry
|
||||||
|
|
||||||
from image_prediction.exceptions import InvalidBox, BadXref
|
from image_prediction.exceptions import InvalidBox, BadXref
|
||||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
@ -22,6 +25,8 @@ from image_prediction.utils import get_logger
|
|||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
Maybe
|
||||||
|
|
||||||
|
|
||||||
class ParsablePDFImageExtractor(ImageExtractor):
|
class ParsablePDFImageExtractor(ImageExtractor):
|
||||||
def __init__(self, verbose=False, tolerance=0):
|
def __init__(self, verbose=False, tolerance=0):
|
||||||
@ -47,13 +52,19 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
|
|
||||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
def __process_images_on_page(self, page: fitz.fitz.Page):
|
||||||
metadata = extract_valid_metadata(self.doc, page)
|
metadata = extract_valid_metadata(self.doc, page)
|
||||||
image_metadata_pairs = keep(partial(metadatum_to_image_metadata_pair, self.doc), metadata)
|
|
||||||
|
maybe_image_metadata_pairs = map(partial(metadatum_to_image_metadata_pair, self.doc), metadata)
|
||||||
|
image_metadata_pairs = [pair.value for pair in maybe_image_metadata_pairs if pair.is_just()]
|
||||||
|
print(image_metadata_pairs)
|
||||||
clear_caches()
|
clear_caches()
|
||||||
|
|
||||||
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
|
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
|
||||||
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
||||||
# again for the formatting step.
|
# again for the formatting step.
|
||||||
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
||||||
|
image_metadata_pairs = list(image_metadata_pairs)
|
||||||
|
print(image_metadata_pairs)
|
||||||
|
|
||||||
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||||
|
|
||||||
yield from image_metadata_pairs
|
yield from image_metadata_pairs
|
||||||
@ -89,9 +100,9 @@ def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
|
|||||||
)(page)
|
)(page)
|
||||||
|
|
||||||
|
|
||||||
def metadatum_to_image_metadata_pair(doc, metadatum: dict):
|
def metadatum_to_image_metadata_pair(doc, metadatum: dict) -> Maybe:
|
||||||
maybe_image = xref_to_maybe_image(doc, metadatum[Info.XREF])
|
maybe_image = xref_to_maybe_image(doc, metadatum[Info.XREF])
|
||||||
return make_maybe_image_metadata_pair(maybe_image, metadatum)
|
return make_maybe_image_metadata_pair(maybe_image, Just(metadatum))
|
||||||
|
|
||||||
|
|
||||||
def add_alpha_channel_info(doc, metadata):
|
def add_alpha_channel_info(doc, metadata):
|
||||||
@ -117,15 +128,19 @@ def get_metadata_for_images_on_page(page: fitz.Page):
|
|||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def xref_to_maybe_image(doc, xref) -> Union[Image.Image, None]:
|
def xref_to_maybe_image(doc, xref) -> Maybe:
|
||||||
try:
|
try:
|
||||||
return extract_image(doc, xref)
|
return Just(extract_image(doc, xref))
|
||||||
except BadXref:
|
except BadXref:
|
||||||
return None
|
return Nothing
|
||||||
|
|
||||||
|
|
||||||
def make_maybe_image_metadata_pair(image, metadata):
|
def make_maybe_image_metadata_pair(image: Maybe, metadata: Maybe):
|
||||||
return ImageMetadataPair(image, metadata) if image and metadata else None
|
return Just(image.bind(curry(2, make_image_metadata_pair))).amap(metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def make_image_metadata_pair(image: Image.Image, metadatum: dict) -> Just:
|
||||||
|
return ImageMetadataPair(image, metadatum)
|
||||||
|
|
||||||
|
|
||||||
def extract_image(doc, xref) -> Image.Image:
|
def extract_image(doc, xref) -> Image.Image:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user