from functools import partial
from typing import Iterable, Iterator, Optional, Tuple

import fitz
import numpy as np


def pdf_to_array_and_metadata(
    pdf: bytes, index: Optional[Iterable[int]] = None, dpi: int = 200
) -> Iterator[Tuple[np.ndarray, dict]]:
    """Stream the pages of a PDF as ``(array, metadata)`` tuples.

    Each page is rendered at ``dpi`` and paired with the metadata dict built
    by ``get_page_info``.

    Args:
        pdf: Raw bytes of the PDF document.
        index: Page indices to render, in the order given. If omitted, None
            or otherwise falsy (e.g. an empty list), the whole PDF is
            processed.
        dpi: Rendering resolution passed to the page rasteriser.

    Yields:
        Tuples of the rendered page as a ``numpy`` array and its metadata.
    """
    convert_fn = partial(page_to_array_and_metadata, dpi=dpi)
    yield from map(convert_fn, stream_pages(pdf, index))


def page_to_array_and_metadata(page: fitz.Page, dpi) -> Tuple[np.ndarray, dict]:
    """Render a single page and return it together with its metadata.

    Args:
        page: The page to render.
        dpi: Rendering resolution handed to ``page.get_pixmap``.

    Returns:
        A tuple ``(array, metadata)`` where ``array`` is a ``uint8`` array of
        shape ``(pixmap.h, pixmap.w, pixmap.n)``, i.e. rows, columns and
        components per pixel, and ``metadata`` is the dict returned by
        ``get_page_info``.
    """
    metadata = get_page_info(page, dpi)
    pixmap = page.get_pixmap(dpi=dpi)
    # The array is a view over the pixmap's sample buffer rather than a fresh
    # allocation. NOTE(review): if ``samples`` is an immutable ``bytes``
    # object the resulting array is read-only -- copy it before mutating.
    array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
    return array, metadata


def stream_pages(pdf: bytes, index: Optional[Iterable[int]] = None) -> Iterator[fitz.Page]:
    """Yield pages of a PDF that is opened from an in-memory byte string.

    The document stays open only while this generator is being consumed; it
    is closed once the generator is exhausted or closed, so each page should
    be used before iteration finishes.

    Args:
        pdf: Raw bytes of the PDF document.
        index: Page indices to yield, in the order given. Any falsy value
            (None, an empty list, ...) selects every page of the document.

    Yields:
        The selected ``fitz.Page`` objects.
    """
    with fitz.open(stream=pdf) as pdf_handle:
        # Truthiness check (not ``is None``) is kept on purpose: existing
        # callers may pass an empty selection to mean "all pages".
        if not index:
            yield from pdf_handle
        else:
            for i in index:
                yield pdf_handle[i]


def get_page_info(page: fitz.Page, dpi) -> dict:
    """Collect basic metadata for a page.

    Args:
        page: The page to describe.
        dpi: Resolution the page is rendered at; stored unchanged.

    Returns:
        A dict with the keys ``index``, ``rotation``, ``width``, ``height``
        and ``dpi``. ``width`` and ``height`` are taken from ``page.rect``
        and are therefore expressed in PDF points (1/72 inch), not inches
        or pixels.
    """
    return {
        "index": page.number,
        "rotation": page.rotation,
        "width": page.rect.width,  # rotated page width in points (1/72 inch)
        "height": page.rect.height,  # rotated page height in points (1/72 inch)
        "dpi": dpi,
    }