cv-analysis-service/scripts/measure_runtimes.py
Julius Unverfehrt a871fa3bd3 Pull request #19: Refactor evaluate
Merge in RR/cv-analysis from refactor-evaluate to master

Squashed commit of the following:

commit cde03a492452610322f8b7d3eb804a51afb76d81
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 12:37:36 2022 +0200

    add optional show analysis metadata dict

commit fb8bb9e2afa7767f2560f865516295be65f97f20
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 12:13:18 2022 +0200

    add script to evaluate runtime per page for all cv-analysis operations for multiple PDFs

commit 721e823e2ec38aae3fea51d01e2135fc8f228d94
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 10:30:31 2022 +0200

    refactor

commit a453753cfa477e162e5902ce191ded61cb678337
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 10:19:24 2022 +0200

    add logic to transform result coordinates according to page rotation, update annotation script to use this logic

commit 71c09758d0fb763a2c38c6871e1d9bf51f2e7c41
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Jul 21 15:57:49 2022 +0200

    introduce pipeline for image conversion, analysis and result formatting

commit aef252a41b9658dd0c4f55aa2d9f84de933586e0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Jul 21 15:57:38 2022 +0200

    introduce pipeline for image conversion, analysis and result formatting
2022-07-22 15:11:40 +02:00

97 lines
3.0 KiB
Python

import argparse
import time
from functools import partial
from pathlib import Path
import fitz
import numpy as np
from funcy import lmap
from matplotlib import pyplot as plt
from cv_analysis.server.pipeline import make_analysis_pipeline, get_analysis_fn
def parse_args(argv=None):
    """Parse the command-line arguments for the runtime-measurement script.

    Args:
        argv: Optional list of argument strings. Defaults to ``sys.argv[1:]``
            (via ``parse_args(None)``), so existing callers are unaffected.

    Returns:
        argparse.Namespace with ``pdf_folder``, ``output_folder`` and
        ``n_runs`` (as int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pdf_folder", help="Path to folder with PDFs to evaluate")
    parser.add_argument("output_folder", help="Path to folder where the Runtime plot should be stored")
    # Convert here so callers do not need to repeat str -> int conversion;
    # int(args.n_runs) downstream still works on an int.
    parser.add_argument("n_runs", type=int, help="Number of runs per test")
    return parser.parse_args(argv)
def measure(fn, n_runs):
    """Wrap *fn* so that calling the wrapper returns runtime statistics.

    The returned callable executes ``fn(*args, **kwargs)`` ``n_runs`` times,
    fully consuming any generator the call returns, and reports the mean and
    standard deviation of the elapsed wall-clock time in seconds.

    Args:
        fn: Callable to benchmark; may return a lazy iterable.
        n_runs: Number of timed repetitions.

    Returns:
        A callable with the same parameters as *fn* that returns
        ``(mean_seconds, std_seconds)``.
    """
    def run(*args, **kwargs):
        def _run():
            # perf_counter is monotonic and higher-resolution than
            # time.time(), so it is the correct clock for benchmarking.
            start = time.perf_counter()
            list(fn(*args, **kwargs))  # Force evaluation of lazy generators.
            return time.perf_counter() - start

        runtimes = [_run() for _ in range(n_runs)]
        return np.mean(runtimes), np.std(runtimes)

    return run
def run_tests(pdf, test_cases, n_runs):
    """Time every cv-analysis pipeline in *test_cases* against *pdf*.

    Args:
        pdf: Raw PDF bytes fed into each analysis pipeline.
        test_cases: Names of the cv-analysis operations to benchmark.
        n_runs: Number of timed repetitions per operation.

    Returns:
        A list of ``(mean_runtime, runtime_std)`` tuples, one per test case.
    """
    def timed_case(case):
        pipeline = make_analysis_pipeline(get_analysis_fn(case))
        return measure(pipeline, n_runs)(pdf)

    return [timed_case(case) for case in test_cases]
def to_ms_per_page(runtime, page_count):
    """Convert a total runtime in seconds to milliseconds per page.

    Args:
        runtime: Total elapsed time in seconds.
        page_count: Number of pages processed in that time.

    Returns:
        Milliseconds per page as a float rounded to zero decimals.
    """
    return round(runtime / page_count * 1000, 0)
def measure_pdf(pdf_path, n_runs, test_cases=None):
    """Measure cv-analysis runtimes (ms/page) for a single PDF.

    Args:
        pdf_path: Path to the PDF file to benchmark.
        n_runs: Number of timed repetitions per test case.
        test_cases: Operations to benchmark. Defaults to the module-level
            ``test_cases`` global (set in the ``__main__`` block) so existing
            callers keep working; new callers should pass it explicitly.

    Returns:
        ``(means, stds)``: two lists of rounded ms/page values, one entry per
        test case.
    """
    if test_cases is None:
        # BUGFIX: the original read the global `test_cases` implicitly, which
        # only exists when run as a script. The parameter makes the
        # dependency explicit while staying backward-compatible.
        test_cases = globals()["test_cases"]
    with open(pdf_path, "rb") as f:
        pdf = f.read()
    page_count = fitz.open(stream=pdf).page_count
    # Normalize every measurement to ms/page so PDFs of different lengths
    # are comparable.
    format_fn = partial(to_ms_per_page, page_count=page_count)
    means, stds = zip(*run_tests(pdf, test_cases, n_runs=n_runs))
    return lmap(format_fn, means), lmap(format_fn, stds)
def plot_results_and_save(results, labels, n_runs, test_pdf_paths, output_folder=None):
    """Render a grouped bar chart of ms/page per operation and save it as PNG.

    Args:
        results: Per-PDF ``(means, stds)`` tuples as returned by measure_pdf.
        labels: X-axis labels, one per cv-analysis operation.
        n_runs: Number of runs (shown in the title and output filename).
        test_pdf_paths: Paths of the measured PDFs; stems become legend labels.
        output_folder: Destination directory. Defaults to the global CLI
            ``args.output_folder`` for backward compatibility with the
            original implicit-global behavior.
    """
    if output_folder is None:
        # BUGFIX: the original read the global `args` implicitly, which only
        # exists when run as a script. The parameter makes the dependency
        # explicit while staying backward-compatible.
        output_folder = args.output_folder
    _, ax = plt.subplots()
    width = 0.2
    x_labels = np.arange(len(labels))
    plt.xticks(ticks=x_labels, labels=labels, rotation=90)
    plt.grid(linestyle="dotted")
    for idx, ((means, std), test_pdf_path) in enumerate(zip(results, test_pdf_paths)):
        # Offset each PDF's bars by idx*width so groups sit side by side.
        x = x_labels + idx * width
        bars = ax.bar(x, means, width, yerr=std, label=f"{test_pdf_path.stem}")
        ax.bar_label(bars)
    ax.set_ylabel("ms/page")
    ax.set_xlabel("Cv-analysis operation")
    ax.set_title(f"Cv-analysis runtime estimation {n_runs=}")
    ax.legend(loc=0)
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    output_path = f"{output_folder}/cv_analysis_runtime_{n_runs=}.png"
    plt.savefig(output_path, dpi=200, bbox_inches="tight", pad_inches=0.5)
    plt.close()
def measure_and_save_plot(args, test_cases):
    """Benchmark every PDF in ``args.pdf_folder`` and save a runtime bar chart.

    Args:
        args: Parsed CLI namespace with pdf_folder, output_folder, n_runs.
        test_cases: Names of the cv-analysis operations to benchmark.
    """
    n_runs = int(args.n_runs)
    pdf_paths = list(Path(args.pdf_folder).glob("*.pdf"))
    results = [measure_pdf(pdf_path, n_runs=n_runs) for pdf_path in pdf_paths]
    plot_results_and_save(results, test_cases, n_runs, pdf_paths)
if __name__ == "__main__":
    # NOTE(review): `test_cases` and `args` are module-level globals that
    # measure_pdf and plot_results_and_save read directly, so these
    # assignments must stay at module scope under this guard.
    test_cases = ["table", "layout", "figure"]
    args = parse_args()
    measure_and_save_plot(args, test_cases)