refactoring

This commit is contained in:
Matthias Bisping 2022-04-11 13:28:39 +02:00
parent bcf6dc5c47
commit d80af336eb

View File

@@ -1,5 +1,5 @@
import io import io
from functools import partial from functools import partial, lru_cache
from itertools import chain, starmap from itertools import chain, starmap
from operator import itemgetter, truth from operator import itemgetter, truth
@@ -43,21 +43,39 @@ def extract_pages(doc, page_range):
return pages return pages
def load_image_from_xref(doc, xref):
    """Open the image that ``doc`` stores under ``xref``.

    Args:
        doc: Object providing ``extract_image(xref)``; the result is expected
            to be a mapping with an ``"image"`` entry holding the raw bytes.
        xref: Identifier handed to ``doc.extract_image``.

    Returns:
        Whatever ``Image.open`` yields for the extracted bytes, or ``None``
        when ``extract_image`` returns a falsy value (no image available).
    """
    maybe_image = doc.extract_image(xref)
    if not maybe_image:
        return None
    # Wrap the raw bytes in a file-like object, as Image.open is given a stream.
    return Image.open(io.BytesIO(maybe_image["image"]))
# Bounded to a single entry: the cache only has to bridge consecutive lookups
# for the same page, and an unbounded cache would keep every page it has ever
# seen alive unless callers remember to call ``cache_clear()``.
@lru_cache(maxsize=1)
def get_image_infos(page: fitz.Page):
    """Return ``page.get_image_info(xrefs=True)``, memoised per page object.

    The cached result object is shared between callers, so it should be
    treated as read-only.

    Args:
        page: Page whose image infos are requested; used as the cache key and
            therefore has to be hashable.

    Returns:
        The value produced by ``page.get_image_info(xrefs=True)``.
    """
    return page.get_image_info(xrefs=True)
def get_images_on_page(doc, page: fitz.Page):
    """Lazily load every image referenced on ``page``.

    Args:
        doc: Document passed on to ``load_image_from_xref`` for each xref.
        page: Page whose image infos are looked up via ``get_image_infos``.

    Returns:
        A lazy iterator with one item per image info, each the result of
        ``load_image_from_xref(doc, xref)``. The image infos are fetched when
        this function is called; the images are only loaded on iteration.
    """
    image_infos = get_image_infos(page)
    xrefs = map(itemgetter("xref"), image_infos)
    images = map(partial(load_image_from_xref, doc), xrefs)
    return images
def get_metadata_for_images_on_page(page: fitz.Page):
    """Lazily build the metadata for every image referenced on ``page``.

    Args:
        page: Page whose image infos are looked up via ``get_image_infos``.

    Returns:
        A lazy iterator with one item per image info: the result of
        ``merge(page_metadata, get_image_metadata(image_info))``. The page
        metadata is computed once, when this function is called.
    """
    image_infos = get_image_infos(page)
    # Evaluate the page-level metadata a single time and reuse it per image.
    page_metadata = get_page_metadata(page)
    metadata = map(get_image_metadata, image_infos)
    metadata = map(partial(merge, page_metadata), metadata)
    return metadata
class ParsablePDFImageExtractor(ImageExtractor): class ParsablePDFImageExtractor(ImageExtractor):
def __init__(self, verbose=False):
    """Create an extractor that has no document attached yet.

    Args:
        verbose: Flag stored on the instance as ``self.verbose``.
    """
    # No document is set at construction time; ``self.doc`` starts as None.
    self.doc: fitz.fitz.Document = None
    self.verbose = verbose
def __process_images_on_page(self, page: fitz.fitz.Page):
    """Pair each image on ``page`` with its metadata.

    Args:
        page: Page of ``self.doc`` to process.

    Returns:
        A lazy iterator of ``ImageMetadataPair`` objects, one for every
        (image, metadata) pair in which both members are truthy.
    """
    try:
        images = get_images_on_page(self.doc, page)
        metadata = get_metadata_for_images_on_page(page)
    finally:
        # Always drop the memoised image infos, even if a lookup above failed,
        # so the cache does not keep holding on to this page.
        get_image_infos.cache_clear()

    # ``all`` on a pair is true only if both the image and its metadata are truthy.
    complete_pairs = filter(all, zip(images, metadata))
    return starmap(ImageMetadataPair, complete_pairs)