Merge in RR/image-prediction from integrate-image-extraction-new-pyinfra to master
Squashed commit of the following:
commit 8470c065c71ea2a985aadfc399fb32c693e3a90f
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Aug 18 09:19:52 2022 +0200
add key script
commit 8f6eb1e79083fb32fb7bedac640c10b6fd411899
Merge: 27fd7de c1b9629
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Aug 18 09:17:50 2022 +0200
Merge branch 'master' of ssh://git.iqser.com:2222/rr/image-prediction into integrate-image-extraction-new-pyinfra
commit 27fd7de39a59d0d88fbddb471dd7797b61223ece
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 17 13:15:09 2022 +0200
update pyinfra
commit ca58f85642598dc15e286074982e7cedae9a1355
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Aug 16 16:16:10 2022 +0200
update pdf2image-service
commit f43795cee0e211e14ac5f9296b01d440ae759c55
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Aug 15 10:32:02 2022 +0200
update pipeline script to also work with figure detection metadata
commit 2b2da1b60ce56fb006cf2f6b65aeda9774391b2a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Aug 12 13:37:48 2022 +0200
add new pyinfra, add optional image classification under key dataCV if figure metadata is present on storage
commit bae25bedbd3a262a9d00e18a1b19f4ee6f1eb924
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 13:27:41 2022 +0200
tidy-up
commit 287b0ebc8a952e506185d13508eaa386d0420704
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 12:57:35 2022 +0200
update server logic for new pyinfra, add extraction from scanned PDF with figure detection logic
commit 3225cefaa25e4559b105397bc06c867a22806ba8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 10 10:37:31 2022 +0200
integrate new pyinfra logic
commit 46926078342b0680a7416560bb69bec037cf8038
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 3 13:15:27 2022 +0200
add image extraction for scanned PDFs WIP
commit 1b3b11b6f9044d44cb9a822a78197a2ebc6f306a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Aug 3 09:41:06 2022 +0200
add pyinfra and pdf2image as git submodule
63 lines
1.9 KiB
Python
63 lines
1.9 KiB
Python
import argparse
|
|
import json
|
|
import os
|
|
from glob import glob
|
|
from operator import truth
|
|
|
|
from image_prediction.pipeline import load_pipeline
|
|
from image_prediction.utils import get_logger
|
|
from image_prediction.utils.pdf_annotation import annotate_pdf
|
|
|
|
# Module-level logger obtained from the project's get_logger helper.
logger = get_logger()
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line arguments of this script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, so existing ``parse_args()``
            callers behave exactly as before. Passing a list allows the
            parser to be driven programmatically.

    Returns:
        argparse.Namespace with the attributes ``input`` (str), ``metadata``
        (str or None), ``print`` (bool) and ``page_interval`` (list of two
        ints or None).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("input", help="pdf file or directory")
    parser.add_argument("--metadata", help="optional figure detection metadata")
    parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False)
    parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int)

    return parser.parse_args(argv)
|
|
|
|
|
|
def process_pdf(pipeline, pdf_path, metadata=None, page_range=None):
    """Run the prediction pipeline on one PDF and write an annotated copy.

    Args:
        pipeline: Callable invoked as ``pipeline(pdf_bytes, page_range=...,
            metadata_per_image=...)``; its result is materialized with
            ``list``.
        pdf_path: Path of the PDF file to process.
        metadata: Optional path to a JSON file. When truthy, the file is
            loaded and the parsed content is handed to the pipeline as
            ``metadata_per_image``; otherwise the value is passed through
            unchanged.
        page_range: Optional page range forwarded unchanged to the pipeline.

    Returns:
        The list of predictions produced by the pipeline.
    """
    if metadata:
        with open(metadata) as f:
            metadata = json.load(f)

    with open(pdf_path, "rb") as f:
        logger.info(f"Processing {pdf_path}")
        pdf_bytes = f.read()

    predictions = list(pipeline(pdf_bytes, page_range=page_range, metadata_per_image=metadata))

    # Derive the output name from the file stem instead of str.replace(".pdf", ...):
    # replace() rewrites every ".pdf" occurrence in the path and leaves names such
    # as "scan.PDF" untouched, which would make the annotated file share the
    # input's basename (and overwrite an input that already lives in /tmp).
    stem, _ = os.path.splitext(os.path.basename(pdf_path))
    # The True/False tag records whether non-empty metadata was used for this run.
    annotated_path = os.path.join("/tmp", f"{stem}_{bool(metadata)}_annotated.pdf")
    annotate_pdf(pdf_path, predictions, annotated_path)

    return predictions
|
|
|
|
|
|
def main(args):
    """Process the PDF(s) named by the parsed command-line arguments.

    Args:
        args: Namespace providing ``input`` (a PDF file or a directory that is
            searched for ``*.pdf``), ``metadata`` (optional path passed on to
            ``process_pdf``), ``page_interval`` (optional pair ``[i, j)``
            turned into a ``range``) and ``print`` (whether to print the
            predictions as JSON).
    """
    if os.path.isfile(args.input):
        pdf_paths = [args.input]
    else:
        # Sorted so that the processing order does not depend on the
        # arbitrary order in which glob returns directory entries.
        pdf_paths = sorted(glob(os.path.join(args.input, "*.pdf")))

    if not pdf_paths:
        # Previously a wrong path or an empty directory ended silently after
        # loading the pipeline; report it and skip the load instead.
        # NOTE(review): assumes the object returned by get_logger() offers
        # .warning like a standard logging.Logger — confirm.
        logger.warning(f"No PDF files found at {args.input}")
        return

    pipeline = load_pipeline(verbose=True, tolerance=3)
    page_range = range(*args.page_interval) if args.page_interval else None
    metadata = args.metadata or None

    for pdf_path in pdf_paths:
        predictions = process_pdf(pipeline, pdf_path, metadata, page_range=page_range)
        if args.print:
            print(pdf_path)
            print(json.dumps(predictions, indent=2))
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line and hand the result to main.
    main(parse_args())
|