2022-03-23 13:46:57 +01:00

32 lines
1.0 KiB
Python

from numpy import array, ndarray
import pdf2image
from PIL import Image
import cv2
from cv_analysis.utils.deskew import deskew
def preprocess_pdf_image(page):
    """Convert a page image to grayscale, denoise it and hand it to ``deskew``.

    Args:
        page: Image array exposing ``.shape``. If it has more than two
            dimensions it is converted to grayscale first; otherwise it is
            used unchanged.

    Returns:
        Whatever ``deskew`` returns for the denoised page. The caller in this
        module unpacks it into two items (presumably the deskewed image and
        the detected angle -- confirm against ``cv_analysis.utils.deskew``).
    """
    # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order. Arrays built
    # from PIL images are normally RGB, so the red/blue weights would be
    # swapped here -- confirm whether this is intended.
    has_channel_axis = len(page.shape) > 2
    gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) if has_channel_axis else page

    # h controls the filter strength of the non-local-means denoiser.
    denoised = cv2.fastNlMeansDenoising(gray, h=3)
    return deskew(denoised)
def open_pdf(pdf, first_page=0, last_page=None):
    """Load a document and return its preprocessed pages and deskew results.

    Args:
        pdf: The document to load. Supported inputs:

            * ``str`` -- a file path. A path ending in ``.jpg``, ``.jpeg`` or
              ``.png`` (case-insensitive) is opened as a single image; any
              other path is treated as a PDF file.
            * ``bytes`` -- the raw content of a PDF.
            * ``list`` or ``ndarray`` -- already-loaded data, which is
              returned unchanged.
        first_page: Zero-based index of the first PDF page to load. Ignored
            for image paths.
        last_page: Zero-based index of the last PDF page to load (inclusive),
            or ``None`` to load through the end. Ignored for image paths.

    Returns:
        A ``(pages, angles)`` pair of tuples built from the two items that
        ``preprocess_pdf_image`` yields per page; both tuples are empty when
        no page was loaded. For ``list``/``ndarray`` input the input object
        itself is returned instead (no angles are computed).

    Raises:
        TypeError: If ``pdf`` is none of the supported input types.
    """
    # pdf2image numbers pages starting at 1; this function's API starts at 0.
    first_page += 1
    last_page = None if last_page is None else last_page + 1

    if isinstance(pdf, str):
        if pdf.lower().endswith((".jpg", ".jpeg", ".png")):
            pages = [Image.open(pdf)]
        else:
            # Assume PDF as the default file type for a path argument.
            pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
    elif isinstance(pdf, bytes):
        pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
    elif isinstance(pdf, (list, ndarray)):
        # Already-loaded input is passed through untouched.
        return pdf
    else:
        # Without this branch the code below would fail with an obscure
        # UnboundLocalError on ``pages``.
        raise TypeError(
            f"open_pdf expects a path (str), PDF content (bytes), a list or an ndarray, "
            f"got {type(pdf).__name__}"
        )

    results = [preprocess_pdf_image(array(p)) for p in pages]
    if not results:
        # zip(*[]) yields nothing, so the two-way unpacking below would raise
        # ValueError (e.g. when the requested page range is outside the file).
        return (), ()

    pages, angles = zip(*results)
    return pages, angles