import os

import pdf2image
from numpy import array, ndarray
from PIL import Image

from cv_analysis.utils.preprocessing import preprocess_page_array

# File extensions that are opened directly as a single image instead of
# being rendered through pdf2image.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")


def open_pdf(pdf, first_page=0, last_page=None):
    """Load the pages of a document and run them through page preprocessing.

    Args:
        pdf: One of
            * a ``str`` or ``os.PathLike`` path ending in ``.pdf``, ``.png``,
              ``.jpg`` or ``.jpeg`` (case-insensitive),
            * a ``bytes``/``bytearray``/``memoryview`` buffer, which is handed
              to ``pdf2image.convert_from_bytes``,
            * a ``list`` or ``numpy.ndarray``, which is returned unchanged.
        first_page: 0-based index of the first page to load. Ignored for
            image files and for list/ndarray input.
        last_page: 0-based index of the last page to load (inclusive), or
            ``None`` to apply no upper limit. Ignored for image files and
            for list/ndarray input.

    Returns:
        The input itself for list/ndarray input; otherwise a list with one
        entry per loaded page, each being the result of
        ``preprocess_page_array`` applied to the page as a numpy array.

    Raises:
        IOError: If a path has an unsupported file extension.
        TypeError: If ``pdf`` is of an unsupported type.
    """
    # List/ndarray input is passed through untouched (no page selection,
    # no preprocessing).
    if isinstance(pdf, (list, ndarray)):
        return pdf

    # Normalise path-like objects (e.g. pathlib.Path) to a plain string so
    # they share the extension handling with ``str`` paths.
    if isinstance(pdf, os.PathLike):
        pdf = os.fsdecode(pdf)

    # pdf2image numbers pages from 1 with an inclusive ``last_page``, whereas
    # this function's arguments are 0-based (and likewise inclusive).
    first_page += 1
    last_page = None if last_page is None else last_page + 1

    if isinstance(pdf, str):
        lowered = pdf.lower()
        if lowered.endswith(_IMAGE_EXTENSIONS):
            pages = [Image.open(pdf)]
        elif lowered.endswith(".pdf"):
            pages = pdf2image.convert_from_path(
                pdf, first_page=first_page, last_page=last_page
            )
        else:
            raise IOError(
                "Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
            )
    elif isinstance(pdf, (bytes, bytearray, memoryview)):
        # bytes(...) is a cheap normalisation for the mutable/view variants.
        pages = pdf2image.convert_from_bytes(
            bytes(pdf), first_page=first_page, last_page=last_page
        )
    else:
        # Previously an unsupported type fell through every branch and
        # surfaced as an UnboundLocalError on ``pages``.
        raise TypeError(
            f"Unsupported input type {type(pdf).__name__!r}; expected a path, "
            "bytes, list or numpy.ndarray"
        )

    return [preprocess_page_array(array(page)) for page in pages]