update pipeline script to also work with figure detection metadata
This commit is contained in:
parent
2b2da1b60c
commit
f43795cee0
@ -14,6 +14,7 @@ def parse_args():
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
parser.add_argument("input", help="pdf file or directory")
|
parser.add_argument("input", help="pdf file or directory")
|
||||||
|
parser.add_argument("--metadata", help="optional figure detection metadata")
|
||||||
parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False)
|
parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False)
|
||||||
parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int)
|
parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int)
|
||||||
|
|
||||||
@ -22,10 +23,14 @@ def parse_args():
|
|||||||
return args
|
return args
|
||||||
|
|
||||||
|
|
||||||
def process_pdf(pipeline, pdf_path, page_range=None):
|
def process_pdf(pipeline, pdf_path, metadata=None, page_range=None):
|
||||||
|
if metadata:
|
||||||
|
with open(metadata) as f:
|
||||||
|
metadata = json.load(f)
|
||||||
|
|
||||||
with open(pdf_path, "rb") as f:
|
with open(pdf_path, "rb") as f:
|
||||||
logger.info(f"Processing {pdf_path}")
|
logger.info(f"Processing {pdf_path}")
|
||||||
predictions = list(pipeline(f.read(), page_range=page_range))
|
predictions = list(pipeline(f.read(), page_range=page_range, metadata_per_image=metadata))
|
||||||
|
|
||||||
annotate_pdf(
|
annotate_pdf(
|
||||||
pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf")))
|
pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf")))
|
||||||
@ -42,9 +47,10 @@ def main(args):
|
|||||||
else:
|
else:
|
||||||
pdf_paths = glob(os.path.join(args.input, "*.pdf"))
|
pdf_paths = glob(os.path.join(args.input, "*.pdf"))
|
||||||
page_range = range(*args.page_interval) if args.page_interval else None
|
page_range = range(*args.page_interval) if args.page_interval else None
|
||||||
|
metadata = args.metadata if args.metadata else None
|
||||||
|
|
||||||
for pdf_path in pdf_paths:
|
for pdf_path in pdf_paths:
|
||||||
predictions = process_pdf(pipeline, pdf_path, page_range=page_range)
|
predictions = process_pdf(pipeline, pdf_path, metadata, page_range=page_range)
|
||||||
if args.print:
|
if args.print:
|
||||||
print(pdf_path)
|
print(pdf_path)
|
||||||
print(json.dumps(predictions, indent=2))
|
print(json.dumps(predictions, indent=2))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user