"""Annotate the pages of a PDF with detected tables, layout elements or figures and save the result as a PDF."""
import argparse
|
|
import timeit
|
|
from itertools import starmap
|
|
from pathlib import Path
|
|
|
|
from funcy import lmap
|
|
from pdf2img.conversion import convert_pages_to_images
|
|
from PIL import Image
|
|
|
|
from cv_analysis.figure_detection.figure_detection import detect_figures
|
|
from cv_analysis.layout_parsing import parse_layout
|
|
from cv_analysis.table_parsing import parse_tables
|
|
from cv_analysis.utils.draw import draw_rectangles
|
|
|
|
|
|
def parse_args():
    """Parse the command line options of the annotation script.

    Accepted arguments:
        pdf_path: Positional path of the PDF to process.
        --output_folder: Folder for the result, ``/tmp`` when omitted.
        --type / -t: Required analysis kind, one of ``table``, ``layout`` or ``figure``.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    # (flags, keyword options) for every argument, in registration order.
    argument_specs = (
        (("pdf_path",), {}),
        (("--output_folder",), {"default": "/tmp"}),
        (("--type", "-t"), {"choices": ["table", "layout", "figure"], "required": True}),
    )

    cli = argparse.ArgumentParser()
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
|
|
|
|
|
|
def analyse_and_annotate(images, analysis_fn):
    """Run ``analysis_fn`` on every page and draw its results onto the page.

    Each image is converted with its ``asarray()`` method up front; the analysis
    and the drawing are deferred until the returned iterator is consumed.

    Args:
        images: Iterable of objects exposing an ``asarray()`` method.
        analysis_fn: Callable applied to each converted page; its result is
            passed to ``draw_rectangles`` together with that page.

    Returns:
        A lazy iterator over the results of ``draw_rectangles``, one per page.
    """
    page_arrays = lmap(lambda page: page.asarray(), images)
    # Lazy on purpose: nothing is analysed or drawn until a consumer iterates.
    return (draw_rectangles(page_array, analysis_fn(page_array)) for page_array in page_arrays)
|
|
|
|
|
|
def save_as_pdf(images, output_folder, file_name, operation):
    """Write the annotated pages into a single multi-page PDF.

    The file is stored as ``<output_folder>/<file_name>_annotated_<operation>.pdf``;
    the output folder is created if it does not exist yet.

    Args:
        images: Iterable of page images accepted by ``Image.fromarray``.
        output_folder: Directory the PDF is written to.
        file_name: Base name used for the output file (without extension).
        operation: Name of the analysis that produced the annotations; it
            becomes part of the output file name.

    Raises:
        IndexError: If ``images`` is empty.
    """
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    pages = lmap(Image.fromarray, images)
    first_page, remaining_pages = pages[0], pages[1:]
    # Pillow writes the image `save` is called on as the first page and then
    # every entry of `append_images`; passing the full list would duplicate
    # the first page in the output.
    first_page.save(
        f"{output_folder}/{file_name}_annotated_{operation}.pdf",
        save_all=True,
        append_images=remaining_pages,
    )
|
|
|
|
|
|
def get_analysis_fn(analysis_type):
    """Return the analysis function that belongs to ``analysis_type``.

    Args:
        analysis_type: One of ``"table"``, ``"layout"`` or ``"figure"``.

    Returns:
        ``parse_tables``, ``parse_layout`` or ``detect_figures`` respectively.

    Raises:
        ValueError: If ``analysis_type`` is not one of the supported names.
    """
    if analysis_type == "table":
        return parse_tables
    if analysis_type == "layout":
        return parse_layout
    if analysis_type == "figure":
        return detect_figures
    # A bare `raise` has no active exception to re-raise here and would only
    # surface as an unrelated RuntimeError, so name the problem explicitly.
    raise ValueError(f"Unknown analysis type {analysis_type!r}; expected 'table', 'layout' or 'figure'.")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command line entry point: convert the PDF to images, analyse and annotate
    # every page, save the result as a PDF and report the duration of each step.
    args = parse_args()

    t0 = timeit.default_timer()
    pdf_bytes = Path(args.pdf_path).read_bytes()
    images = convert_pages_to_images(pdf_bytes)
    t1 = timeit.default_timer()

    # analyse_and_annotate returns a lazy iterator; consume it here so that the
    # analysis time is measured in this step instead of being attributed to
    # the saving step below.
    annotated_pages = list(analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type)))
    t2 = timeit.default_timer()

    save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type)
    t3 = timeit.default_timer()

    print("[s] opening file and convert pdf pages to images: ", t1 - t0)
    print("[s] analyse and annotate images: ", t2 - t1)
    print("[s] save images as pdf: ", t3 - t2)
|