cv-analysis-service/scripts/deskew_demo.py
2022-03-23 13:46:57 +01:00

51 lines
1.8 KiB
Python

# sample usage: python3 scripts/deskew_demo.py /path/to/crooked.pdf 0
import argparse
from pathlib import Path

import numpy as np
import pdf2image
from PIL import Image

from cv_analysis.table_parsing import parse_table
from cv_analysis.utils.deskew import deskew_histbased  # , deskew_linebased
from cv_analysis.utils.display import show_mpl
from cv_analysis.utils.draw import draw_stats
def parse_args():
    """Parse the command line of the deskew demo.

    Returns:
        argparse.Namespace with ``pdf_path`` (str), ``page_index`` (int) and
        ``save_path`` (str, or None when the option is not given).
    """
    cli = argparse.ArgumentParser()
    # Two required positionals followed by the optional output location.
    cli.add_argument("pdf_path")
    cli.add_argument("page_index", type=int)
    cli.add_argument("--save_path")
    return cli.parse_args()
def derive_output_path(save_path, tag):
    """Build the output file name for one saved variant of the page.

    Inserts ``tag`` between the stem and the extension of the final path
    component, e.g. ``("out/scan.pdf", "_corrected")`` gives
    ``"out/scan_corrected.pdf"``. Unlike a plain ``str.replace(".pdf", ...)``
    this never touches directory names and always yields a distinct name per
    tag, even when the base path does not end in ``.pdf``.

    Args:
        save_path: Base output path given on the command line.
        tag: Text appended to the file stem (for example ``"_corrected"``).

    Returns:
        The derived path as a string.
    """
    target = Path(save_path)
    # Without an extension, fall back to ".pdf": the variant names produced by
    # this script are built around a PDF base path.
    extension = target.suffix or ".pdf"
    return str(target.with_name(target.stem + tag + extension))


def _save_rgb(image_array, path):
    """Convert an image array to an RGB PIL image and write it to ``path``."""
    Image.fromarray(image_array).convert("RGB").save(path)


def main():
    """Run the deskew demo for a single PDF page.

    Renders the requested page, displays it before and after
    ``deskew_histbased``, then runs ``parse_table`` on both versions and
    displays the output of ``draw_stats`` for each. When ``--save_path`` is
    given, the four images are also written to disk under names derived from
    it (``_uncorrected``, ``_corrected``, ``_uncorrected_annotated``,
    ``_corrected_annotated``).

    Raises:
        SystemExit: if rendering yields no page for the requested index.
    """
    args = parse_args()
    save_path = args.save_path

    # The command-line index is shifted by one for first_page/last_page, so
    # index 0 selects the first page of the document.
    page_number = args.page_index + 1
    rendered = pdf2image.convert_from_path(args.pdf_path, first_page=page_number, last_page=page_number)
    if not rendered:
        # Report a readable message instead of an IndexError from ``[0]``.
        raise SystemExit("no page with index %d in %s" % (args.page_index, args.pdf_path))
    page = np.array(rendered[0])
    show_mpl(page)

    # page_ = deskew_linebased(page, verbose=True)
    # show_mpl(page_)

    # Only the first element of the returned pair (the corrected page) is used.
    page_corr, _ = deskew_histbased(page, verbose=True)
    show_mpl(page_corr)

    # Save the raw images before any annotation is drawn.
    if save_path:
        _save_rgb(page, derive_output_path(save_path, "_uncorrected"))
        _save_rgb(page_corr, derive_output_path(save_path, "_corrected"))

    # annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index)

    stats = parse_table(page)
    annotated = draw_stats(page, stats)
    show_mpl(annotated)

    stats_corr = parse_table(page_corr)
    annotated_corr = draw_stats(page_corr, stats_corr)
    show_mpl(annotated_corr)

    if save_path:
        _save_rgb(annotated, derive_output_path(save_path, "_uncorrected_annotated"))
        _save_rgb(annotated_corr, derive_output_path(save_path, "_corrected_annotated"))


if __name__ == "__main__":
    main()