cv-analysis-service/scripts/annotate_pdf.py
2024-04-29 12:09:44 +02:00

63 lines
2.1 KiB
Python

import argparse
import timeit
from itertools import starmap
from pathlib import Path
from funcy import lmap
from pdf2img.conversion import convert_pages_to_images
from PIL import Image
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.draw import draw_rectangles
def parse_args():
    """Parse the command-line arguments of the annotation script.

    Recognised arguments:
        pdf_path: positional, path of the PDF to annotate.
        --output_folder: folder for the annotated PDF (default "/tmp").
        --type / -t: required, one of "table", "layout" or "figure".

    Returns:
        The ``argparse.Namespace`` built from ``sys.argv``.
    """
    # Each entry is (flags, keyword options) handed to add_argument as-is.
    argument_specs = (
        (("pdf_path",), {}),
        (("--output_folder",), {"default": "/tmp"}),
        (("--type", "-t"), {"choices": ["table", "layout", "figure"], "required": True}),
    )
    cli = argparse.ArgumentParser()
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
def analyse_and_annotate(images, analysis_fn):
    """Run an analysis on every page image and draw its result onto the page.

    Args:
        images: Iterable of page images. Each item must provide an
            ``asarray()`` method (presumably returning a numpy array --
            TODO confirm against pdf2img).
        analysis_fn: Callable that takes one page array and returns whatever
            ``draw_rectangles`` accepts as its second argument.

    Returns:
        A list with one annotated page per input image, in input order.
    """
    arrays = [image.asarray() for image in images]
    # Evaluate eagerly. A lazy map/starmap chain would defer all analysis and
    # drawing until the caller iterates, so timing around this call would
    # measure nothing and analysis errors would surface in unrelated code.
    return [draw_rectangles(array, analysis_fn(array)) for array in arrays]
def save_as_pdf(images, output_folder, file_name, operation):
    """Write the annotated pages into one multi-page PDF.

    The file is written to
    ``<output_folder>/<file_name>_annotated_<operation>.pdf``; the output
    folder (including missing parents) is created if it does not exist.

    Args:
        images: Iterable of page arrays accepted by ``PIL.Image.fromarray``.
        output_folder: Destination folder.
        file_name: Base name of the output file, without extension.
        operation: Label of the analysis that was run; becomes the file
            name suffix.

    Raises:
        IndexError: If ``images`` is empty.
    """
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    pages = [Image.fromarray(image) for image in images]
    first_page, remaining_pages = pages[0], pages[1:]
    # append_images holds the pages that FOLLOW the image being saved.
    # Passing the full list would write the first page twice.
    first_page.save(
        f"{output_folder}/{file_name}_annotated_{operation}.pdf",
        save_all=True,
        append_images=remaining_pages,
    )
def get_analysis_fn(analysis_type):
    """Return the analysis function for the requested analysis type.

    Args:
        analysis_type: One of "table", "layout" or "figure".

    Returns:
        ``parse_tables``, ``parse_layout`` or ``detect_figures`` respectively.

    Raises:
        ValueError: If ``analysis_type`` is not one of the supported values.
    """
    if analysis_type == "table":
        return parse_tables
    if analysis_type == "layout":
        return parse_layout
    if analysis_type == "figure":
        return detect_figures
    # A bare `raise` has no active exception to re-raise here and would only
    # produce an opaque RuntimeError; name the rejected value instead.
    raise ValueError(
        f"Unknown analysis type {analysis_type!r}; expected 'table', 'layout' or 'figure'"
    )
if __name__ == "__main__":
    # Script entry point: convert the PDF pages to images, run the selected
    # analysis, write the annotated PDF and print how long each stage took.
    args = parse_args()
    pdf_path = Path(args.pdf_path)

    t0 = timeit.default_timer()
    pdf_bytes = pdf_path.read_bytes()
    # Materialise each stage before reading the clock so that every timer
    # covers the work of its own stage rather than the creation of a lazy
    # iterator.
    # NOTE(review): convert_pages_to_images may already return a list, in
    # which case list() is only a shallow copy -- confirm against pdf2img.
    images = list(convert_pages_to_images(pdf_bytes))
    t1 = timeit.default_timer()
    annotated_pages = list(analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type)))
    t2 = timeit.default_timer()
    save_as_pdf(annotated_pages, args.output_folder, pdf_path.stem, args.type)
    t3 = timeit.default_timer()

    print("[s] opening file and convert pdf pages to images: ", t1 - t0)
    print("[s] analyse and annotate images: ", t2 - t1)
    print("[s] save images as pdf: ", t3 - t2)