# cv-analysis-service/cv_analysis/utils/pdf2array.py

from functools import partial
from typing import Iterator, Tuple

import fitz
import numpy as np


def pdf_to_array_and_metadata(pdf: bytes, index=None, dpi=200) -> Iterator[Tuple[np.ndarray, dict]]:
    """Lazily yield one ``(array, metadata)`` tuple per selected page of a PDF.

    Args:
        pdf: Raw bytes of the PDF document.
        index: Iterable of page numbers to process. It is forwarded to
            ``stream_pages``, which treats any falsy value (``None``, an empty
            sequence, ...) as "process every page".
        dpi: Resolution passed on to the page renderer and recorded in the
            metadata of each page.

    Yields:
        Whatever ``page_to_array_and_metadata`` returns for each page, i.e. the
        rendered page as an array together with its metadata dict.
    """
    # Pages are converted one at a time, so nothing is rendered until the
    # consumer actually asks for the next item.
    for page in stream_pages(pdf, index):
        yield page_to_array_and_metadata(page, dpi=dpi)


def page_to_array_and_metadata(page: fitz.Page, dpi):
    """Render a single page and return it as ``(array, metadata)``.

    Args:
        page: The page to render.
        dpi: Resolution handed to ``page.get_pixmap`` and stored in the metadata.

    Returns:
        A tuple ``(array, metadata)``. ``array`` is a ``uint8`` array of shape
        ``(pixmap.h, pixmap.w, pixmap.n)`` built from the rendered pixmap's
        sample buffer; ``metadata`` is the dict produced by ``get_page_info``.
    """
    page_info = get_page_info(page, dpi)
    rendered = page.get_pixmap(dpi=dpi)

    # np.frombuffer wraps the sample buffer without copying it; the result may
    # therefore be read-only if ``samples`` is an immutable bytes object.
    flat_samples = np.frombuffer(rendered.samples, dtype=np.uint8)
    image = flat_samples.reshape((rendered.h, rendered.w, rendered.n))

    return image, page_info


def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
    """Open a PDF from memory and yield its pages one by one.

    Args:
        pdf: Raw bytes of the document, passed to ``fitz.open`` as ``stream``.
        index: Iterable of page numbers to yield, in the given order. Any falsy
            value (``None``, an empty sequence, ...) selects all pages.

    Yields:
        The selected pages. The document is only open while this generator is
        being consumed; it is closed when the ``with`` block is left.
    """
    with fitz.open(stream=pdf) as document:
        if index:
            # Explicit selection: look each requested page up by its number.
            yield from (document[page_number] for page_number in index)
        else:
            # No selection given: iterate the document itself.
            yield from document


def get_page_info(page, dpi):
    """Collect basic metadata of a page into a plain dict.

    Args:
        page: Object exposing ``number``, ``rotation`` and a ``rect`` with
            ``width`` and ``height`` attributes.
        dpi: Rendering resolution; stored unchanged under the ``"dpi"`` key.

    Returns:
        A dict with the keys ``"index"``, ``"rotation"``, ``"width"``,
        ``"height"`` and ``"dpi"``.
    """
    info = {"index": page.number, "rotation": page.rotation}

    # Width and height are taken from the page rectangle.
    # NOTE(review): earlier comments described these values as the rotated
    # page size in inches; PyMuPDF rectangles are normally expressed in points
    # (1/72 inch) -- confirm the unit before relying on it.
    bounds = page.rect
    info["width"] = bounds.width
    info["height"] = bounds.height

    info["dpi"] = dpi
    return info