30 lines
1.1 KiB
Python
30 lines
1.1 KiB
Python
from numpy import array, ndarray
|
|
import pdf2image
|
|
from PIL import Image
|
|
|
|
from cv_analysis.utils.preprocessing import preprocess_page_array
|
|
|
|
|
|
def open_analysis_input_file(path_or_bytes, first_page=1, last_page=None):
    """Load an analysis input and return its pages.

    The kind of input is decided by the type of ``path_or_bytes``:

    * ``str``: a file path. ``.png``/``.jpg``/``.jpeg`` files are opened with
      PIL as a single page (``first_page``/``last_page`` are not used for
      images); ``.pdf`` files are rendered with ``pdf2image.convert_from_path``.
      The extension check is case-insensitive.
    * ``bytes``: handed to ``pdf2image.convert_from_bytes``.
    * ``list`` or ``numpy.ndarray``: returned unchanged, without being passed
      through ``preprocess_page_array``.

    Args:
        path_or_bytes: File path, raw bytes, or already-loaded page data.
        first_page: 1-based number of the first page to render.
        last_page: 1-based number of the last page to render. When omitted,
            it falls back to ``first_page``, so exactly one page is rendered.

    Returns:
        A list with one ``preprocess_page_array`` result per loaded page, or
        ``path_or_bytes`` itself for ``list``/``ndarray`` input.

    Raises:
        AssertionError: If ``first_page`` is not positive or ``last_page`` is
            smaller than ``first_page``.
        IOError: If a path has an unsupported file extension.
        TypeError: If ``path_or_bytes`` is of an unsupported type.
    """
    # NOTE: asserts are kept (rather than raising ValueError) so that callers
    # relying on AssertionError keep working; they are skipped under ``-O``.
    assert first_page > 0, "Page numbers are 1-based."
    assert last_page is None or last_page >= first_page, "last_page must be greater than or equal to first_page."

    # Without an explicit last_page only first_page itself is rendered.
    last_page = last_page or first_page

    # Already-loaded page data is handed back untouched.
    if isinstance(path_or_bytes, (list, ndarray)):
        return path_or_bytes

    if isinstance(path_or_bytes, str):
        lowered_path = path_or_bytes.lower()
        if lowered_path.endswith((".png", ".jpg", ".jpeg")):
            pages = [Image.open(path_or_bytes)]
        elif lowered_path.endswith(".pdf"):
            pages = pdf2image.convert_from_path(path_or_bytes, first_page=first_page, last_page=last_page)
        else:
            raise IOError("Invalid file extension. Accepted filetypes: .png, .jpg, .jpeg, .pdf")
    elif isinstance(path_or_bytes, bytes):
        pages = pdf2image.convert_from_bytes(path_or_bytes, first_page=first_page, last_page=last_page)
    else:
        # Previously this case fell through and failed later with an
        # UnboundLocalError on ``pages``; fail early with a clear message.
        raise TypeError(
            "Unsupported input type: {}. Expected str, bytes, list or numpy.ndarray.".format(
                type(path_or_bytes).__name__
            )
        )

    return [preprocess_page_array(array(page)) for page in pages]
|