from numpy import array, ndarray
import pdf2image
from PIL import Image

from cv_analysis.utils.preprocessing import preprocess_page_array


def open_analysis_input_file(path_or_bytes, first_page=1, last_page=None):
    """Load an analysis input and return its pages.

    Args:
        path_or_bytes: One of
            * ``str`` - path to a ``.png``/``.jpg``/``.jpeg`` image or a
              ``.pdf`` document (extension matched case-insensitively);
            * ``bytes`` - handed to ``pdf2image.convert_from_bytes``;
            * ``list`` or ``ndarray`` - returned unchanged, without being
              passed through ``preprocess_page_array``.
        first_page: 1-based number of the first PDF page to convert.
        last_page: 1-based number of the last PDF page to convert. When it is
            ``None`` it falls back to ``first_page``, so a single page is
            converted by default.

    Returns:
        For path/bytes input, a list holding the result of
        ``preprocess_page_array`` applied to each page converted to a numpy
        array. For ``list``/``ndarray`` input, the input object itself.

    Raises:
        AssertionError: If the page range is invalid.
        IOError: If a string path does not end in a supported extension.
        TypeError: If ``path_or_bytes`` is of an unsupported type.

    Note:
        ``first_page``/``last_page`` are only forwarded to the PDF converters;
        an image file always yields exactly one page.
    """
    assert first_page > 0, "Page numbers are 1-based."
    assert last_page is None or last_page >= first_page, "last_page must be greater than or equal to first_page."
    last_page = last_page or first_page

    # Already-loaded pages are handed back untouched.
    if isinstance(path_or_bytes, (list, ndarray)):
        return path_or_bytes

    if isinstance(path_or_bytes, str):
        lowered = path_or_bytes.lower()
        if lowered.endswith((".png", ".jpg", ".jpeg")):
            pages = [Image.open(path_or_bytes)]
        elif lowered.endswith(".pdf"):
            pages = pdf2image.convert_from_path(path_or_bytes, first_page=first_page, last_page=last_page)
        else:
            raise IOError("Invalid file extension. Accepted filetypes: .png, .jpg, .jpeg, .pdf")
    elif isinstance(path_or_bytes, bytes):
        pages = pdf2image.convert_from_bytes(path_or_bytes, first_page=first_page, last_page=last_page)
    else:
        # Without this branch an unsupported input left `pages` unbound and
        # surfaced as a confusing UnboundLocalError further down.
        raise TypeError(
            "Unsupported input type: "
            f"{type(path_or_bytes).__name__}. Expected str, bytes, list or ndarray."
        )

    return [preprocess_page_array(array(page)) for page in pages]