"""Extract embedded images and their placement metadata from parsable PDFs."""
import io
from functools import partial
from itertools import chain, starmap
from operator import itemgetter, truth
from typing import Iterator, Optional

import fitz
from PIL import Image
from funcy import rcompose, compose, curry, merge
from tqdm import tqdm

from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info

# Applies ``round`` first and then ``int`` to a single value.
rounder = rcompose(round, int)

# Predicate over an (image, metadata) pair: true only if every element is truthy.
_all_truthy = compose(all, curry(map)(truth))


def get_image_metadata(image_info):
    """Build the per-image metadata dict from one image-info record.

    Args:
        image_info: Mapping providing the keys ``"bbox"`` (four numbers,
            unpacked as x1, y1, x2, y2), ``"width"`` and ``"height"``.

    Returns:
        Dict keyed by ``Info.WIDTH``, ``Info.HEIGHT``, ``Info.X1``,
        ``Info.X2``, ``Info.Y1`` and ``Info.Y2``. The bounding-box values are
        passed through ``rounder``; width and height are copied unchanged.
    """
    x1, y1, x2, y2 = map(rounder, image_info["bbox"])
    width, height = itemgetter("width", "height")(image_info)

    return {
        Info.WIDTH: width,
        Info.HEIGHT: height,
        Info.X1: x1,
        Info.X2: x2,
        Info.Y1: y1,
        Info.Y2: y2,
    }


def get_page_metadata(page):
    """Build the per-page metadata dict.

    Args:
        page: Object exposing ``mediabox_size`` (a width/height pair) and
            ``number``.

    Returns:
        Dict keyed by ``Info.PAGE_WIDTH``, ``Info.PAGE_HEIGHT`` (both passed
        through ``rounder``) and ``Info.PAGE_IDX`` (``page.number``).
    """
    page_width, page_height = map(rounder, page.mediabox_size)

    return {
        Info.PAGE_WIDTH: page_width,
        Info.PAGE_HEIGHT: page_height,
        Info.PAGE_IDX: page.number,
    }


def extract_pages(doc, page_range):
    """Lazily load the pages selected by ``page_range`` from ``doc``.

    Args:
        doc: Object exposing ``load_page(index)``.
        page_range: ``range`` of page positions; its step is honoured so that
            the number of loaded pages equals ``len(page_range)``.

    Returns:
        A lazy iterator of loaded pages.
    """
    # NOTE(review): both bounds are shifted by one before calling
    # ``doc.load_page``. PyMuPDF documents ``load_page`` as 0-based, so this
    # skips index ``start`` and reads index ``stop``. Kept as-is because
    # callers may rely on it -- confirm the intended page numbering.
    shifted = range(page_range.start + 1, page_range.stop + 1, page_range.step)
    return map(doc.load_page, shifted)


class ParsablePDFImageExtractor(ImageExtractor):
    """Image extractor that reads embedded images via PyMuPDF (``fitz``)."""

    def __init__(self, verbose=False):
        """Create an extractor.

        Args:
            verbose: When true, ``extract`` shows a tqdm progress bar.
        """
        # Document currently being processed; assigned by ``extract``.
        self.doc: Optional[fitz.Document] = None
        self.verbose = verbose

    def __process_images_on_page(self, page: fitz.Page):
        """Lazily pair every loadable image on ``page`` with its metadata.

        Args:
            page: A page of ``self.doc``.

        Returns:
            Lazy iterator of ``ImageMetadataPair(image, metadata)``. Pairs
            with a falsy element (e.g. an image that could not be extracted)
            are dropped.
        """

        def load_image_from_xref(xref):
            # Reads ``self.doc`` at call time, i.e. when the result is consumed.
            maybe_image = self.doc.extract_image(xref)
            return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None

        image_infos = page.get_image_info(xrefs=True)
        xrefs = map(itemgetter("xref"), image_infos)
        images = map(load_image_from_xref, xrefs)

        # Image-level metadata combined with the page-level metadata, which is
        # computed once per page.
        page_metadata = get_page_metadata(page)
        metadata = map(partial(merge, page_metadata), map(get_image_metadata, image_infos))

        return starmap(ImageMetadataPair, filter(_all_truthy, zip(images, metadata)))

    def extract(self, pdf: bytes, page_range: Optional[range] = None) -> Iterator:
        """Extract images with metadata from a PDF given as bytes.

        Args:
            pdf: Raw PDF content.
            page_range: Pages to process (see ``extract_pages`` for how the
                range is mapped to page indices). ``None`` processes the whole
                document; an empty range yields nothing.

        Returns:
            Lazy iterator of ``ImageMetadataPair`` objects. Pages and images
            are loaded as the iterator is consumed, so ``self.doc`` must not
            be replaced (by another ``extract`` call) before then.
        """
        self.doc = fitz.Document(stream=pdf)

        # ``is not None`` rather than truthiness: an empty range is falsy and
        # would otherwise silently select the entire document.
        if page_range is not None:
            pages = extract_pages(self.doc, page_range)
            total = len(page_range)
        else:
            pages = self.doc
            total = None  # tqdm derives the total from the iterable if it can

        progress = tqdm(pages, desc="Extracting", disable=not self.verbose, total=total)
        return chain.from_iterable(map(self.__process_images_on_page, progress))