cv-analysis-service/scripts/annotate_pdf.py
Julius Unverfehrt 59a0a61708 Pull request #25: Pdf2image
Merge in RR/cv-analysis from pdf2image to master

Squashed commit of the following:

commit 1353f54d2dceb0a79b1f81bfa2c035f5a454275a
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Aug 10 09:07:31 2022 +0200

    add deRotation and transformation via rectanglePlus

commit 51459dbf57a86e3eac66ec0da02de40dc1b68796
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 9 08:53:50 2022 +0200

    add derotation and to pdf coords transformation to cv-analysis output

commit 733991e2f5a4664205b2f7cc756cebcbc9ee3930
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Aug 8 15:15:13 2022 +0200

    update pipeline with derotation logic WIP
2022-08-10 09:17:59 +02:00

53 lines
1.7 KiB
Python

import argparse
import json
from operator import itemgetter
from pathlib import Path
import fitz
from funcy import lmap
from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline, get_analysis_pipeline
from cv_analysis.utils.display import show_image_mpl
from pdf2img.extraction import extract_images
from pdf2img.get_rectangles import parse_to_image_rectangles
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line arguments of the annotation script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what a plain ``parse_args()``
            call did before this parameter existed.

    Returns:
        Namespace with ``pdf_path``, ``output_folder``, ``type`` (one of
        ``"table"``, ``"layout"``, ``"figure"``) and ``verbose`` (bool).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pdf_path")
    parser.add_argument("output_folder")
    parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True)
    # --verbose and --silent write to the same destination; whichever comes
    # last on the command line wins, and the default is silent.
    parser.add_argument("--verbose", action="store_true")
    parser.add_argument("--silent", dest="verbose", action="store_false")
    parser.set_defaults(verbose=False)
    return parser.parse_args(argv)
def analyse_annotate_save(pdf, analysis_type, output_path, verbose):
    """Analyse a PDF, draw every result's bounding box and save the copy.

    Args:
        pdf: PDF content handed both to the analysis pipeline and to
            ``fitz.open(stream=...)`` (the script passes the raw file bytes).
        analysis_type: Identifier forwarded to ``get_analysis_pipeline``.
        output_path: Where the annotated document is saved.
        verbose: When true, the results are printed as indented JSON
            before anything is drawn.
    """
    pipeline = get_analysis_pipeline(analysis_type)
    # Materialise the results: they may be printed and are iterated below.
    results = list(pipeline(pdf))
    if verbose:
        print(json.dumps(results, indent=2))

    # Pulls the corner coordinates out of a bounding-box dict in Rect order.
    corners = itemgetter("x0", "y0", "x1", "y1")
    with fitz.open(stream=pdf) as pdf_handle:
        for result in results:
            # NOTE(review): presumably pageInfo.number is the index fitz
            # expects for page lookup - confirm against the pipeline output.
            page = pdf_handle[result["pageInfo"]["number"]]
            rect = fitz.Rect(*corners(result["boundingBoxScreen"]))
            page.draw_rect(rect, color=(0.5, 0.7, 0.2), width=2)
        # Save once, after all rectangles are drawn and while the document
        # is still open.
        pdf_handle.save(output_path)
if __name__ == "__main__":
    # Script entry point: read the input PDF, make sure the output folder
    # exists, then analyse, annotate and save the result next to the
    # requested analysis type.
    args = parse_args()
    pdf_bytes = Path(args.pdf_path).read_bytes()
    Path(args.output_folder).mkdir(parents=True, exist_ok=True)
    # Output file name: <input stem>_annotated_<type>.pdf inside output_folder.
    output_path = f"{args.output_folder}/{Path(args.pdf_path).stem}_annotated_{args.type}.pdf"
    analyse_annotate_save(pdf_bytes, args.type, output_path, args.verbose)