readd annotate_pdf script
This commit is contained in:
parent
010e029d9b
commit
b14a341cfc
53
scripts/annotate_pdf.py
Normal file
53
scripts/annotate_pdf.py
Normal file
@ -0,0 +1,53 @@
|
||||
import argparse
|
||||
from itertools import starmap
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
from funcy import lmap
|
||||
|
||||
from cv_analysis.figure_detection.figure_detection import detect_figures
|
||||
from cv_analysis.layout_parsing import parse_layout
|
||||
from cv_analysis.table_parsing import parse_tables
|
||||
from cv_analysis.utils.draw import draw_rectangles
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command-line arguments of the annotation script.

    Returns:
        argparse.Namespace: carries ``pdf_path`` (positional),
        ``output_folder`` (optional, defaults to ``/tmp``) and ``type``
        (required; one of ``table``, ``layout`` or ``figure``, also
        available as ``-t``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("pdf_path")
    cli.add_argument("--output_folder", default="/tmp")
    analysis_choices = ["table", "layout", "figure"]
    cli.add_argument("--type", "-t", choices=analysis_choices, required=True)
    namespace = cli.parse_args()
    return namespace
|
||||
|
||||
|
||||
def analyse_and_annotate(images, analysis_fn):
    """Run an analysis on every page and draw its result onto the page.

    Args:
        images: Iterable of page objects. Each one must provide an
            ``asarray()`` method (NOTE(review): this is assumed to be the
            page type returned by ``convert_pages_to_images`` - confirm).
        analysis_fn: Callable applied to each page array; its return value
            is handed to ``draw_rectangles`` together with that array.

    Returns:
        A lazy iterator yielding ``draw_rectangles(array, analysis_fn(array))``
        for each page, in input order. The analysis and drawing only run
        when the iterator is consumed.
    """
    # The conversion to arrays is eager; everything after it is lazy.
    page_arrays = [page.asarray() for page in images]
    return (
        draw_rectangles(page_array, analysis_fn(page_array))
        for page_array in page_arrays
    )
|
||||
|
||||
|
||||
def save_as_pdf(images, output_folder, file_name, operation):
    """Write the annotated pages into one multi-page PDF file.

    The file is saved as
    ``<output_folder>/<file_name>_annotated_<operation>.pdf``; the output
    folder (including missing parents) is created if it does not exist.

    Args:
        images: Iterable of page arrays, each convertible with
            ``PIL.Image.fromarray``. May be a lazy iterator; it is
            consumed exactly once.
        output_folder: Directory the PDF is written to.
        file_name: Base name (without extension) used for the output file.
        operation: Name of the analysis, embedded in the output file name.

    Raises:
        ValueError: If ``images`` yields no pages, since a PDF cannot be
            written without at least one page.
    """
    pages = [Image.fromarray(page) for page in images]
    if not pages:
        raise ValueError("cannot write a PDF without pages: `images` is empty")

    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # `append_images` must only hold the pages that follow the one `save` is
    # called on; passing the full list would write the first page twice.
    first_page, *remaining_pages = pages
    first_page.save(
        f"{output_folder}/{file_name}_annotated_{operation}.pdf",
        save_all=True,
        append_images=remaining_pages,
    )
|
||||
|
||||
|
||||
def get_analysis_fn(analysis_type):
    """Return the analysis function selected by ``analysis_type``.

    Args:
        analysis_type: One of ``"table"``, ``"layout"`` or ``"figure"``.

    Returns:
        ``parse_tables`` for ``"table"``, ``parse_layout`` for ``"layout"``
        and ``detect_figures`` for ``"figure"``.

    Raises:
        ValueError: If ``analysis_type`` is none of the supported values.
    """
    if analysis_type == "table":
        return parse_tables
    if analysis_type == "layout":
        return parse_layout
    if analysis_type == "figure":
        return detect_figures
    # A bare `raise` outside an `except` block only produces
    # "RuntimeError: No active exception to reraise", which hides the actual
    # problem; report the offending value explicitly instead.
    raise ValueError(
        f"unknown analysis type {analysis_type!r}; expected 'table', 'layout' or 'figure'"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the PDF, convert its pages to images, run the
    # selected analysis on every page and store the annotated pages as a PDF.
    cli_args = parse_args()
    source_pdf = Path(cli_args.pdf_path)
    page_images = convert_pages_to_images(source_pdf.read_bytes())
    selected_analysis = get_analysis_fn(cli_args.type)
    annotated_pages = analyse_and_annotate(images=page_images, analysis_fn=selected_analysis)
    save_as_pdf(annotated_pages, cli_args.output_folder, source_pdf.stem, cli_args.type)
|
||||
Loading…
x
Reference in New Issue
Block a user