from numpy import array, ndarray
import pdf2image
from PIL import Image
import cv2

from cv_analysis.utils.deskew import deskew


def preprocess_pdf_image(page):
    """Convert a page image to denoised grayscale and deskew it.

    Args:
        page: Image array. Arrays with more than two dimensions are
            converted to single-channel grayscale first.

    Returns:
        Whatever ``deskew`` returns for the denoised page. ``open_pdf``
        unpacks it as a ``(page, angle)`` pair -- presumably the deskewed
        image and the detected rotation angle; verify against ``deskew``.
    """
    if len(page.shape) > 2:
        # NOTE(review): open_pdf feeds arrays decoded through PIL, which are
        # presumably RGB-ordered, while this conversion weights the channels
        # as BGR -- confirm whether COLOR_RGB2GRAY is intended for that path.
        page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
    page = cv2.fastNlMeansDenoising(page, h=3)
    return deskew(page)


def open_pdf(pdf, first_page=0, last_page=None):
    """Load a document and return its preprocessed pages.

    Args:
        pdf: One of
            * ``str`` -- path to a ``.jpg``/``.png`` image (extension matched
              case-insensitively) or, for any other path, a PDF file;
            * ``bytes`` -- raw PDF content;
            * ``list`` or ``ndarray`` -- already loaded pages, which are
              returned unchanged.
        first_page: Zero-based index of the first page to render. Only used
            for PDF input; ignored for image paths.
        last_page: Zero-based index of the last page to render, or ``None``
            for no upper bound. Only used for PDF input.

    Returns:
        ``(pages, angles)``: two equally long tuples holding the paired
        results of ``preprocess_pdf_image`` for every page. Both tuples are
        empty when no page was rendered. For ``list``/``ndarray`` input the
        argument itself is returned (no tuple, no preprocessing); this is kept
        as-is for backward compatibility.

    Raises:
        TypeError: If ``pdf`` is none of the supported input types.
    """
    # Already loaded pages are passed through untouched.
    if isinstance(pdf, (list, ndarray)):
        return pdf

    # The pdf2image calls below are given one-based page numbers, while this
    # function's page arguments are zero-based.
    first_page += 1
    last_page = None if last_page is None else last_page + 1

    if isinstance(pdf, str):
        if pdf.lower().endswith((".jpg", ".png")):
            pages = [Image.open(pdf)]
        else:
            # Assume PDF as the default file type for a path argument.
            pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
    elif isinstance(pdf, bytes):
        pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        raise TypeError(f"open_pdf does not support input of type {type(pdf).__name__}")

    # zip(*[]) yields nothing to unpack, so an empty result is handled here.
    if not pages:
        return (), ()

    processed = [preprocess_pdf_image(array(p)) for p in pages]
    pages, angles = zip(*processed)
    return pages, angles