diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index c2b4bb0..0a01287 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -14,6 +14,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("input", help="pdf file or directory") + parser.add_argument("--metadata", help="optional figure detection metadata") parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False) parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int) @@ -22,10 +23,14 @@ def parse_args(): return args -def process_pdf(pipeline, pdf_path, page_range=None): +def process_pdf(pipeline, pdf_path, metadata=None, page_range=None): + if metadata: + with open(metadata) as f: + metadata = json.load(f) + with open(pdf_path, "rb") as f: logger.info(f"Processing {pdf_path}") - predictions = list(pipeline(f.read(), page_range=page_range)) + predictions = list(pipeline(f.read(), page_range=page_range, metadata_per_image=metadata)) annotate_pdf( pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf"))) @@ -42,9 +47,10 @@ def main(args): else: pdf_paths = glob(os.path.join(args.input, "*.pdf")) page_range = range(*args.page_interval) if args.page_interval else None + metadata = args.metadata if args.metadata else None for pdf_path in pdf_paths: - predictions = process_pdf(pipeline, pdf_path, page_range=page_range) + predictions = process_pdf(pipeline, pdf_path, metadata, page_range=page_range) if args.print: print(pdf_path) print(json.dumps(predictions, indent=2))