"""Run a cv_analysis pipeline over a PDF and save a copy with the found boxes drawn in."""
import argparse
import json
from operator import itemgetter
from pathlib import Path

import fitz

from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline


def parse_args():
    """Parse the command line.

    Returns:
        argparse.Namespace with ``pdf_path``, ``output_folder``, ``type``
        (one of ``table``, ``layout``, ``figure``) and ``verbose`` (bool,
        ``False`` unless ``--verbose`` is given).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pdf_path", help="path of the PDF file to analyse")
    parser.add_argument(
        "output_folder",
        help="folder the annotated PDF is written to (created if missing)",
    )
    parser.add_argument(
        "--type",
        "-t",
        choices=["table", "layout", "figure"],
        required=True,
        help="kind of analysis to run",
    )
    # --verbose and --silent both write to the same ``verbose`` destination;
    # the last one on the command line wins.
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="print the analysis results as JSON",
    )
    parser.add_argument(
        "--silent",
        dest="verbose",
        action="store_false",
        help="do not print the analysis results (default)",
    )
    parser.set_defaults(verbose=False)
    return parser.parse_args()


def analyse_annotate_save(pdf, analysis_type, output_path, verbose):
    """Analyse a PDF, draw every reported bounding box and save the result.

    Args:
        pdf: Raw content of the PDF file (the caller in this file passes the
            bytes read from disk).
        analysis_type: Name handed to ``get_analysis_fn`` to select the
            analysis.
        output_path: Destination of the annotated PDF.
        verbose: When true, the pipeline results are printed as indented JSON
            before annotating.
    """
    pipe = make_analysis_pipeline(get_analysis_fn(analysis_type))
    # Materialise the results: they are both printed and iterated below.
    results = list(pipe(pdf))

    if verbose:
        print(json.dumps(results, indent=2))

    get_corners = itemgetter("x1", "y1", "x2", "y2")

    # Name the file type explicitly: the stream carries no file name to infer
    # it from, and not every PyMuPDF release accepts a stream without one.
    with fitz.open(stream=pdf, filetype="pdf") as pdf_handle:
        for result in results:
            # Each result names its page via "index" and lists its boxes
            # under "bboxes" as mappings with x1/y1/x2/y2 entries.
            page = pdf_handle[result["index"]]
            for rect in result["bboxes"]:
                x1, y1, x2, y2 = get_corners(rect)
                page.draw_rect((x1, y1, x2, y2), color=(0.5, 0.7, 0.2), width=2)
        pdf_handle.save(output_path)


def main():
    """Script entry point: read the PDF, run the analysis, write the annotated copy."""
    args = parse_args()

    pdf_path = Path(args.pdf_path)
    pdf_bytes = pdf_path.read_bytes()

    output_folder = Path(args.output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)
    output_path = output_folder / f"{pdf_path.stem}_annotated_{args.type}.pdf"

    # Hand over a plain string so the path type matches what the function
    # received before.
    analyse_annotate_save(pdf_bytes, args.type, str(output_path), args.verbose)


if __name__ == "__main__":
    main()