Pull request #19: Refactor evaluate

Merge in RR/cv-analysis from refactor-evaluate to master

Squashed commit of the following:

commit cde03a492452610322f8b7d3eb804a51afb76d81
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 12:37:36 2022 +0200

    add optional show analysis metadata dict

commit fb8bb9e2afa7767f2560f865516295be65f97f20
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 12:13:18 2022 +0200

    add script to evaluate runtime per page for all cv-analysis operations for multiple PDFs

commit 721e823e2ec38aae3fea51d01e2135fc8f228d94
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 10:30:31 2022 +0200

    refactor

commit a453753cfa477e162e5902ce191ded61cb678337
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jul 22 10:19:24 2022 +0200

    add logic to transform result coordinates according to page rotation, update annotation script to use this logic

commit 71c09758d0fb763a2c38c6871e1d9bf51f2e7c41
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Jul 21 15:57:49 2022 +0200

    introduce pipeline for image conversion, analysis and result formatting

commit aef252a41b9658dd0c4f55aa2d9f84de933586e0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Jul 21 15:57:38 2022 +0200

    introduce pipeline for image conversion, analysis and result formatting
This commit is contained in:
Julius Unverfehrt 2022-07-22 15:11:40 +02:00
parent e7b28f5bda
commit a871fa3bd3
7 changed files with 221 additions and 108 deletions

View File

@ -0,0 +1,64 @@
from functools import partial
from typing import Callable
from funcy import lmap
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.server.rotate import rotate_rectangle
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.logging import get_logger
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
from cv_analysis.utils.structures import Rectangle
logger = get_logger()
def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
    """Build an end-to-end pipeline that analyses a PDF with the given analysis function.

    The returned pipeline produces a lazy stream of dicts, one per page, holding the
    page metadata plus the analysis results under the key "bboxes". Pages for which
    the analysis found nothing are dropped from the stream.

    Steps:
        Convert the PDF to page images paired with page metadata
        Analyse each page, get the rectangles found on it (e.g. table cells)
        Rescale pixel coordinates by 72 / dpi
        Rotate the results if the page is rotated
        Format the results as a stream of dictionaries
    """

    def page_to_result(image_metadata_pair):
        # The analysis itself runs right here; rescaling and rotation stay lazy
        # until the bboxes list is built below.
        rectangles = (pixel_rect_to_inches_rect(rect, dpi=dpi) for rect in analysis_fn(image_metadata_pair.image))
        metadata = image_metadata_pair.metadata
        if metadata["rotation"] != 0:
            rectangles = (rotate_rectangle(rect, metadata=metadata) for rect in rectangles)
        return {**metadata, "bboxes": [rect.json_xyxy() for rect in rectangles]}

    def has_bboxes(result):
        return result["bboxes"]

    def pipeline(pdf: bytes, index=None):
        pages = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi)
        return filter(has_bboxes, map(page_to_result, pages))

    return pipeline
def get_analysis_fn(analysis_type):
    """Return the analysis function that belongs to ``analysis_type``.

    Args:
        analysis_type: One of "table", "layout" or "figure".

    Returns:
        ``parse_tables`` for "table", ``parse_layout`` for "layout", or the result
        of ``make_figure_detection_pipeline()`` for "figure".

    Raises:
        ValueError: If ``analysis_type`` is none of the supported names.
    """
    if analysis_type == "table":
        return parse_tables
    if analysis_type == "layout":
        return parse_layout
    if analysis_type == "figure":
        # Built on request, so the figure pipeline is only constructed when needed.
        return make_figure_detection_pipeline()
    # A bare `raise` without an active exception only yields an unrelated
    # "No active exception to re-raise" RuntimeError; name the offending value instead.
    raise ValueError(f"Unknown analysis type {analysis_type!r}; expected 'table', 'layout' or 'figure'.")
def pixel_rect_to_inches_rect(rect, dpi):
    """Rescale the corner coordinates of ``rect`` from pixels at ``dpi`` by 72 / dpi.

    Note: despite the function name, each value is divided by ``dpi`` and then
    multiplied by 72, so the result is expressed in units of 1/72 inch.

    Returns a new non-discrete Rectangle built from the rescaled (x1, y1, x2, y2).
    """
    corners = (rect.x1, rect.y1, rect.x2, rect.y2)
    rescaled = tuple(value / dpi * 72 for value in corners)
    return Rectangle.from_xyxy(rescaled, discrete=False)

View File

@ -1,35 +1,25 @@
from _operator import itemgetter from _operator import itemgetter
from functools import partial
import numpy as np import numpy as np
from cv_analysis.utils.structures import Rectangle from cv_analysis.utils.structures import Rectangle
def make_formatter(dpi, page_size, rotation): def rotate_rectangle(rectangle, metadata):
width, height, rotation = itemgetter("width", "height", "rotation")(metadata)
rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation
def format_(key2pixel): if rotation in [1, 3]:
convert = partial(convert_pixel_to_inch, dpi=dpi) width, height = height, width
x, y, w, h = map(convert, itemgetter("x", "y", "width", "height")(key2pixel))
x1, y1 = x + w, y + h
matrix = np.vstack([[x, y], [x1, y1]]).T
new_matrix = rotate_and_shift(matrix, rotation, page_size)
x1, x2 = sorted(new_matrix[0, :])
y1, y2 = sorted(new_matrix[1, :])
return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False).json_xywh()
return format_ x1, y1, x2, y2 = rectangle.xyxy()
matrix = np.vstack([[x1, y1], [x2, y2]]).T
new_matrix = rotate_and_shift(matrix, rotation, (width, height))
x1, x2 = sorted(new_matrix[0, :])
y1, y2 = sorted(new_matrix[1, :])
def convert_pixel_to_inch(pixel, dpi): return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False)
return pixel / dpi * 72
def rotate(input_matrix, radians):
rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]])
return np.dot(rotation_matrix, input_matrix)
def rotate_and_shift(matrix, rotation, size, debug=False): def rotate_and_shift(matrix, rotation, size, debug=False):
@ -109,3 +99,9 @@ def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_sh
axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue") axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue")
plt.show() plt.show()
def rotate(input_matrix, radians):
rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]])
return np.dot(rotation_matrix, input_matrix)

View File

@ -1,43 +0,0 @@
import gzip
from operator import itemgetter
from typing import Callable
from funcy import lmap
from pyinfra.server.utils import make_streamable_and_wrap_in_packing_logic
from cv_analysis.server.format import make_formatter
from cv_analysis.utils.logging import get_logger
from cv_analysis.utils.open_pdf import open_pdf
logger = get_logger()
def make_streamable_analysis_fn(analysis_fn: Callable):
    """Wrap a cv-analysis function so that the pyinfra server logic can stream it.

    The wrapped function takes data and metadata and returns a tuple (or a generator
    of tuples) of data and metadata. See the PyInfra documentation for details about
    the server logic.

    Args:
        analysis_fn: cv-analysis function

    Returns:
        wrapped function
    """

    def analyse(data: bytes, metadata: dict):
        # The payload is a gzip-compressed PDF; only the first opened page is analysed.
        image = open_pdf(gzip.decompress(data))[0]
        dpi = metadata["image_info"]["dpi"]
        page_info = metadata["page_info"]
        page_size = (page_info["width"], page_info["height"])
        formatter = make_formatter(dpi, page_size, page_info["rotation"])

        cells = [formatter(rectangle.json_xywh()) for rectangle in analysis_fn(image)]

        logger.debug(f"Page {metadata['page_info'].get('index', '')}: Found {len(cells)} cells.")

        # No binary payload is returned; the results travel in the metadata.
        return b"", {**metadata, "cells": cells}

    return make_streamable_and_wrap_in_packing_logic(analyse, batched=False)

View File

@ -1,3 +1,4 @@
from dataclasses import dataclass
from functools import partial from functools import partial
from typing import Iterator, Tuple from typing import Iterator, Tuple
@ -5,20 +6,25 @@ import fitz
import numpy as np import numpy as np
def pdf_to_array_and_metadata(pdf: bytes, index=None, dpi=200) -> Iterator[Tuple[np.ndarray, dict]]: @dataclass
"""Stream the pages of a PDF as Tuples of page as matrix representation and page metadata. class ImageMetadataPair:
Note: If Index is not given or evaluates to None, the whole PDF will be processed. image: np.ndarray
""" metadata: dict
convert_fn = partial(page_to_array_and_metadata, dpi=dpi)
def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
"""Streams PDF as pairs of image (matrix) and metadata.
Note: If Index is not given or evaluates to None, the whole PDF will be processed."""
convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
yield from map(convert_fn, stream_pages(pdf, index)) yield from map(convert_fn, stream_pages(pdf, index))
def page_to_array_and_metadata(page: fitz.Page, dpi): def page_to_image_metadata_pair(page: fitz.Page, dpi):
metadata = get_page_info(page, dpi) metadata = get_page_info(page)
pixmap = page.get_pixmap(dpi=dpi) pixmap = page.get_pixmap(dpi=dpi)
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
return array, metadata return ImageMetadataPair(array, metadata)
def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
@ -30,11 +36,10 @@ def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]:
yield pdf_handle[i] yield pdf_handle[i]
def get_page_info(page, dpi): def get_page_info(page):
return { return {
"index": page.number, "index": page.number,
"rotation": page.rotation, "rotation": page.rotation,
"width": page.rect.width, # rotated page width in inches "width": page.rect.width, # rotated page width in inches
"height": page.rect.height, # rotated page height in inches "height": page.rect.height, # rotated page height in inches
"dpi": dpi,
} }

View File

@ -1,15 +1,11 @@
import argparse import argparse
from itertools import starmap import json
from operator import itemgetter
from pathlib import Path from pathlib import Path
from PIL import Image import fitz
from funcy import lmap
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata
def parse_args(): def parse_args():
@ -17,36 +13,35 @@ def parse_args():
parser.add_argument("pdf_path") parser.add_argument("pdf_path")
parser.add_argument("output_folder") parser.add_argument("output_folder")
parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True) parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True)
parser.add_argument("--verbose", action="store_true")
parser.add_argument("--silent", dest="verbose", action="store_false")
parser.set_defaults(verbose=False)
return parser.parse_args() return parser.parse_args()
def analyse_and_annotate(images, analysis_fn): def analyse_annotate_save(pdf, analysis_type, output_path, verbose):
result = map(analysis_fn, images) pipe = make_analysis_pipeline(get_analysis_fn(analysis_type))
annotated_images = starmap(draw_rectangles, zip(images, result)) results = list(pipe(pdf))
return annotated_images
if verbose:
print(json.dumps(results, indent=2))
def save_as_pdf(images, output_folder, file_name, operation): with fitz.open(stream=pdf) as pdf_handle:
Path(output_folder).mkdir(parents=True, exist_ok=True) for result in results:
images = lmap(Image.fromarray, images) page = pdf_handle[result["index"]]
images[0].save(f"{output_folder}/{file_name}_annotated_{operation}.pdf", save_all=True, append_images=images) for rect in result["bboxes"]:
x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(rect)
page.draw_rect((x1, y1, x2, y2), color=(0.5, 0.7, 0.2), width=2)
def get_analysis_fn(analysis_type): pdf_handle.save(output_path)
if analysis_type == "table":
return parse_tables
elif analysis_type == "layout":
return parse_layout
elif analysis_type == "figure":
return make_figure_detection_pipeline()
else:
raise
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
with open(args.pdf_path, "rb") as f: with open(args.pdf_path, "rb") as f:
pdf_bytes = f.read() pdf_bytes = f.read()
images, _ = zip(*pdf_to_array_and_metadata(pdf_bytes))
annotated_pages = analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type)) Path(args.output_folder).mkdir(parents=True, exist_ok=True)
save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type) output_path = f"{args.output_folder}/{Path(args.pdf_path).stem}_annotated_{args.type}.pdf"
analyse_annotate_save(pdf_bytes, args.type, output_path, args.verbose)

View File

@ -0,0 +1,96 @@
import argparse
import time
from functools import partial
from pathlib import Path
import fitz
import numpy as np
from funcy import lmap
from matplotlib import pyplot as plt
from cv_analysis.server.pipeline import make_analysis_pipeline, get_analysis_fn
def parse_args():
    """Parse the command line: PDF folder, output folder and number of runs.

    All three arguments are positional and are returned as strings.
    """
    positional_arguments = (
        ("pdf_folder", "Path to folder with PDFs to evaluate"),
        ("output_folder", "Path to folder where the Runtime plot should be stored"),
        ("n_runs", "Number of runs per test"),
    )
    parser = argparse.ArgumentParser()
    for name, help_text in positional_arguments:
        parser.add_argument(name, help=help_text)
    return parser.parse_args()
def measure(fn, n_runs):
    """Wrap ``fn`` so that calling it reports timing statistics instead of results.

    Args:
        fn: Callable returning an iterable; the iterable is fully consumed on
            every run so that lazy generators are actually evaluated.
        n_runs: Number of timed repetitions per call of the wrapped function.

    Returns:
        A function accepting the same arguments as ``fn`` and returning the tuple
        ``(mean, std)`` of the measured run times in seconds.
    """

    def run(*args, **kwargs):
        def _run():
            # perf_counter is monotonic and high resolution; the wall clock
            # (time.time) can jump when the system clock is adjusted.
            start = time.perf_counter()
            list(fn(*args, **kwargs))  # Evaluate generators
            return time.perf_counter() - start

        runtimes = [_run() for _ in range(n_runs)]
        return np.mean(runtimes), np.std(runtimes)

    return run
def run_tests(pdf, test_cases, n_runs):
    """Time the analysis pipeline of every test case on the given PDF.

    For each entry of ``test_cases`` a fresh analysis pipeline is built, wrapped
    with ``measure`` and executed on ``pdf``. Returns the list of measurement
    results in the order of ``test_cases``.
    """
    timings = []
    for test_case in test_cases:
        analysis_pipe = make_analysis_pipeline(get_analysis_fn(test_case))
        timings.append(measure(analysis_pipe, n_runs)(pdf))
    return timings
def to_ms_per_page(runtime, page_count):
    """Return ``runtime`` divided by ``page_count`` and scaled by 1000, rounded to zero decimals.

    With a runtime given in seconds this is the time per page in milliseconds.
    """
    return round(runtime / page_count * 1000, 0)
def measure_pdf(pdf_path, n_runs):
    """Measure the per-page runtime of all test cases for a single PDF file.

    Note: the test cases are read from the module-level name ``test_cases``, which
    is not a parameter of this function and must exist when it is called.

    Args:
        pdf_path: Path of the PDF file to read.
        n_runs: Number of timed runs per test case.

    Returns:
        Tuple ``(means, std)`` of two lists with one entry per test case, each
        converted with ``to_ms_per_page`` using the page count of the PDF.
    """
    with open(pdf_path, "rb") as f:
        pdf = f.read()

    # Only the page count is needed; close the document right away instead of
    # leaving the handle open.
    with fitz.open(stream=pdf) as pdf_handle:
        page_count = pdf_handle.page_count

    format_fn = partial(to_ms_per_page, page_count=page_count)

    means, std = zip(*run_tests(pdf, test_cases, n_runs=n_runs))
    means, std = lmap(format_fn, means), lmap(format_fn, std)
    return means, std
def plot_results_and_save(results, labels, n_runs, test_pdf_paths, output_folder=None):
    """Draw a grouped bar chart of the measured runtimes and save it as a PNG file.

    Args:
        results: One ``(means, std)`` pair per PDF, each with one value per label.
        labels: Names of the measured operations, used as x tick labels.
        n_runs: Number of runs per test; shown in the title and the file name.
        test_pdf_paths: Paths of the measured PDFs, in the order of ``results``;
            their stems label the bar groups in the legend.
        output_folder: Folder the plot is written to. If omitted, the
            ``output_folder`` of the module-level ``args`` is used, as before.
    """
    if output_folder is None:
        # Backward-compatible fallback to the command line arguments parsed by the
        # script; pass output_folder explicitly when calling from other code.
        output_folder = args.output_folder

    fig, ax = plt.subplots()
    width = 0.2
    x_labels = np.arange(len(labels))
    plt.xticks(ticks=x_labels, labels=labels, rotation=90)
    plt.grid(linestyle="dotted")

    for idx, (result, test_pdf_path) in enumerate(zip(results, test_pdf_paths)):
        # Shift the bars of each PDF by one bar width so they sit side by side.
        x = x_labels + idx * width
        means, std = result
        bars = ax.bar(x, means, width, yerr=std, label=f"{test_pdf_path.stem}")
        ax.bar_label(bars)

    ax.set_ylabel("ms/page")
    ax.set_xlabel("Cv-analysis operation")
    ax.set_title(f"Cv-analysis runtime estimation {n_runs=}")
    ax.legend(loc=0)

    Path(output_folder).mkdir(parents=True, exist_ok=True)
    output_path = f"{output_folder}/cv_analysis_runtime_{n_runs=}.png"
    plt.savefig(output_path, dpi=200, bbox_inches="tight", pad_inches=0.5)
    plt.close(fig)
def measure_and_save_plot(args, test_cases):
    """Measure every PDF found in ``args.pdf_folder`` and save the runtime plot.

    ``args.n_runs`` is converted to an integer, each ``*.pdf`` file directly inside
    the folder is measured with ``measure_pdf``, and the collected results are
    passed to ``plot_results_and_save`` with ``test_cases`` as labels.
    """
    n_runs = int(args.n_runs)
    test_pdf_paths = [pdf_path for pdf_path in Path(args.pdf_folder).glob("*.pdf")]
    results = [measure_pdf(pdf_path, n_runs=n_runs) for pdf_path in test_pdf_paths]
    plot_results_and_save(results, test_cases, n_runs, test_pdf_paths)
if __name__ == "__main__":
    # Script entry point: measure all cv-analysis operations for every PDF in the
    # given folder and store the runtime plot.
    # NOTE: `test_cases` and `args` are deliberately module-level names here;
    # measure_pdf reads `test_cases` and plot_results_and_save reads `args` as
    # globals, so renaming or moving them into a function would break those calls.
    test_cases = ["table", "layout", "figure"]
    args = parse_args()
    measure_and_save_plot(args, test_cases)

View File

@ -2,7 +2,7 @@ import fitz
import numpy as np import numpy as np
import pytest import pytest
from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
@pytest.fixture @pytest.fixture
@ -17,8 +17,8 @@ def pdf(n_pages):
@pytest.mark.parametrize("n_pages", [1]) @pytest.mark.parametrize("n_pages", [1])
def test_pdf_to_array_and_metadata(pdf): def test_pdf_to_array_and_metadata(pdf):
for array, metadata in pdf_to_array_and_metadata(pdf): for image_metadata_pair in pdf_to_image_metadata_pairs(pdf):
assert isinstance(array, np.ndarray) assert isinstance(image_metadata_pair.image, np.ndarray)
assert array.shape == (2339, 1653, 3) # Height, Width, Color channels assert image_metadata_pair.image.shape == (2339, 1653, 3) # Height, Width, Color channels
assert isinstance(metadata, dict) assert isinstance(image_metadata_pair.metadata, dict)