From a2451b910328be44ab651fe97c23141b98174112 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 20 Jul 2022 11:01:55 +0200 Subject: [PATCH 1/3] Pull request #17: Add pdf2array func Merge in RR/cv-analysis from add-pdf2array-func to master Squashed commit of the following: commit 6e6e9a509ede0abf28fb93a2042960efcc9453bd Author: Julius Unverfehrt Date: Wed Jul 20 09:12:01 2022 +0200 update script with layout parsing, refactor pdf2array commit 191bc71f58aa5c07b0cadbdb7067cd72c3d8858b Author: Julius Unverfehrt Date: Wed Jul 20 09:10:06 2022 +0200 update script with layout parsing, refactor pdf2array commit 25201bbb4151a23784193181272d379232877d2f Author: Julius Unverfehrt Date: Wed Jul 20 08:33:20 2022 +0200 add pdf2array functionality --- cv_analysis/utils/pdf2array.py | 40 ++++++++++++++++++++++++ requirements.txt | 5 ++- scripts/annotate_figures.py | 38 ---------------------- scripts/annotate_pdf.py | 52 +++++++++++++++++++++++++++++++ test/unit_tests/pdf2array_test.py | 24 ++++++++++++++ 5 files changed, 120 insertions(+), 39 deletions(-) create mode 100644 cv_analysis/utils/pdf2array.py delete mode 100644 scripts/annotate_figures.py create mode 100644 scripts/annotate_pdf.py create mode 100644 test/unit_tests/pdf2array_test.py diff --git a/cv_analysis/utils/pdf2array.py b/cv_analysis/utils/pdf2array.py new file mode 100644 index 0000000..8ce1ea6 --- /dev/null +++ b/cv_analysis/utils/pdf2array.py @@ -0,0 +1,40 @@ +from functools import partial +from typing import Iterator, Tuple + +import fitz +import numpy as np + + +def pdf_to_array_and_metadata(pdf: bytes, index=None, dpi=200) -> Iterator[Tuple[np.ndarray, dict]]: + """Stream the pages of a PDF as Tuples of page as matrix representation and page metadata. + Note: If Index is not given or evaluates to None, the whole PDF will be processed. 
+ """ + convert_fn = partial(page_to_array_and_metadata, dpi=dpi) + yield from map(convert_fn, stream_pages(pdf, index)) + + +def page_to_array_and_metadata(page: fitz.Page, dpi): + metadata = get_page_info(page, dpi) + pixmap = page.get_pixmap(dpi=dpi) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + + return array, metadata + + +def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: + with fitz.open(stream=pdf) as pdf_handle: + if not index: + yield from pdf_handle + else: + for i in index: + yield pdf_handle[i] + + +def get_page_info(page, dpi): + return { + "index": page.number, + "rotation": page.rotation, + "width": page.rect.width, # rotated page width in inches + "height": page.rect.height, # rotated page height in inches + "dpi": dpi, + } diff --git a/requirements.txt b/requirements.txt index dc2ae9d..de53279 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,7 @@ coverage~=5.5 dependency-check~=0.6.0 prometheus-client~=0.13.1 prometheus_flask_exporter~=0.19.0 -lorem-text==2.1 \ No newline at end of file +lorem-text==2.1 + +# pdf2array +PyMuPDF==1.19.6 \ No newline at end of file diff --git a/scripts/annotate_figures.py b/scripts/annotate_figures.py deleted file mode 100644 index cdc72f4..0000000 --- a/scripts/annotate_figures.py +++ /dev/null @@ -1,38 +0,0 @@ -import argparse -from itertools import starmap -from pathlib import Path - -import numpy as np -import pdf2image -from PIL import Image -from funcy import lmap - -from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline -from cv_analysis.utils.draw import draw_rectangles - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--pdf_path", "-p", required=True) - parser.add_argument("--output_folder", "-o", required=True) - return parser.parse_args() - - -def annotate_figures(images): - pipeline = make_figure_detection_pipeline() - result = map(pipeline, images) - 
annotated_images = starmap(draw_rectangles, zip(images, result)) - return annotated_images - - -def save_as_pdf(images, output_folder, file_name): - Path(output_folder).mkdir(parents=True, exist_ok=True) - images = lmap(Image.fromarray, images) - images[0].save(f"{output_folder}/{file_name}_annotated_figures.pdf", save_all=True, append_images=images) - - -if __name__ == "__main__": - args = parse_args() - pages = lmap(np.array, pdf2image.convert_from_path(args.pdf_path)) - annotated_pages = annotate_figures(images=pages) - save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem) diff --git a/scripts/annotate_pdf.py b/scripts/annotate_pdf.py new file mode 100644 index 0000000..aead8f6 --- /dev/null +++ b/scripts/annotate_pdf.py @@ -0,0 +1,52 @@ +import argparse +from itertools import starmap +from pathlib import Path + +from PIL import Image +from funcy import lmap + +from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline +from cv_analysis.layout_parsing import parse_layout +from cv_analysis.table_parsing import parse_tables +from cv_analysis.utils.draw import draw_rectangles +from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("pdf_path") + parser.add_argument("output_folder") + parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True) + return parser.parse_args() + + +def analyse_and_annotate(images, analysis_fn): + result = map(analysis_fn, images) + annotated_images = starmap(draw_rectangles, zip(images, result)) + return annotated_images + + +def save_as_pdf(images, output_folder, file_name, operation): + Path(output_folder).mkdir(parents=True, exist_ok=True) + images = lmap(Image.fromarray, images) + images[0].save(f"{output_folder}/{file_name}_annotated_{operation}.pdf", save_all=True, append_images=images) + + +def get_analysis_fn(analysis_type): + if analysis_type == 
"table": + return parse_tables + elif analysis_type == "layout": + return parse_layout + elif analysis_type == "figure": + return make_figure_detection_pipeline() + else: + raise + + +if __name__ == "__main__": + args = parse_args() + with open(args.pdf_path, "rb") as f: + pdf_bytes = f.read() + images, _ = zip(*pdf_to_array_and_metadata(pdf_bytes)) + annotated_pages = analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type)) + save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type) diff --git a/test/unit_tests/pdf2array_test.py b/test/unit_tests/pdf2array_test.py new file mode 100644 index 0000000..1820ed8 --- /dev/null +++ b/test/unit_tests/pdf2array_test.py @@ -0,0 +1,24 @@ +import fitz +import numpy as np +import pytest + +from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata + + +@pytest.fixture +def pdf(n_pages): + doc = fitz.open() + for n in range(n_pages): + page = doc.new_page() + where = fitz.Point(50, 100) + page.insert_text(where, "De gustibus non est disputandum.", fontsize=30) + return doc.write() + + +@pytest.mark.parametrize("n_pages", [1]) +def test_pdf_to_array_and_metadata(pdf): + for array, metadata in pdf_to_array_and_metadata(pdf): + assert isinstance(array, np.ndarray) + assert array.shape == (2339, 1653, 3) # Height, Width, Color channels + + assert isinstance(metadata, dict) From e7b28f5bda92e1b495d20d398afeb3a678df503a Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 21 Jul 2022 13:25:00 +0200 Subject: [PATCH 2/3] Pull request #18: Remove pil Merge in RR/cv-analysis from remove_pil to master Squashed commit of the following: commit 83c8d88f3d48404251470176c70979ee75ae068b Author: Julius Unverfehrt Date: Thu Jul 21 10:51:51 2022 +0200 remove deprecated server tests commit cebc03b5399ac257a74036b41997201f882f5b74 Author: Julius Unverfehrt Date: Thu Jul 21 10:51:08 2022 +0200 remove deprecated server tests commit ce2845b0c51f001b7b5b8b195d6bf7e034ec4e39 Author: Julius 
Unverfehrt Date: Wed Jul 20 17:05:00 2022 +0200 repair tests to work without pillow WIP commit 023fdab8322f28359a24c63e32635a3d0deccbe4 Author: Isaac Riley Date: Wed Jul 20 16:40:36 2022 +0200 fixed typo commit 33850ca83a175f74789ae6b9bebd057ed84b7fb3 Author: Isaac Riley Date: Wed Jul 20 16:38:37 2022 +0200 fixed import from refactored open_img.py commit dbc6d345f074e538948e2c4f94ebed8a5ef520bc Author: Isaac Riley Date: Wed Jul 20 16:32:42 2022 +0200 removed PIL from production code, now only in scripts --- .../figure_detection_pipeline.py | 4 +- cv_analysis/layout_parsing.py | 26 +----- cv_analysis/redaction_detection.py | 23 +---- cv_analysis/server/stream.py | 4 +- cv_analysis/table_parsing.py | 14 +-- cv_analysis/utils/deskew.py | 87 ------------------- cv_analysis/utils/display.py | 36 +++++--- cv_analysis/utils/logging.py | 4 +- cv_analysis/utils/open_pdf.py | 27 ++++++ cv_analysis/utils/post_processing.py | 18 +--- cv_analysis/utils/preprocessing.py | 46 ++++------ cv_analysis/utils/test_metrics.py | 6 +- cv_analysis/utils/visual_logging.py | 8 +- incl/pyinfra | 2 +- scripts/annotate.py | 7 +- scripts/deskew_demo.py | 50 ----------- scripts/pyinfra_mock.py | 11 +-- test/conftest.py | 1 - test/fixtures/figure_detection.py | 28 +++--- test/fixtures/server.py | 4 +- test/fixtures/table_parsing.py | 14 +-- .../figure_detection_pipeline_test.py | 2 +- test/unit_tests/figure_detection/text_test.py | 28 ++---- test/unit_tests/server/__init__.py | 0 .../server/formatted_stream_fn_test.py | 15 ---- 25 files changed, 116 insertions(+), 349 deletions(-) delete mode 100644 cv_analysis/utils/deskew.py create mode 100644 cv_analysis/utils/open_pdf.py delete mode 100644 scripts/deskew_demo.py delete mode 100644 test/unit_tests/server/__init__.py delete mode 100644 test/unit_tests/server/formatted_stream_fn_test.py diff --git a/cv_analysis/figure_detection/figure_detection_pipeline.py b/cv_analysis/figure_detection/figure_detection_pipeline.py index f0a3b35..1a374f1 100644
--- a/cv_analysis/figure_detection/figure_detection_pipeline.py +++ b/cv_analysis/figure_detection/figure_detection_pipeline.py @@ -17,9 +17,7 @@ from cv_analysis.utils.structures import Rectangle def make_figure_detection_pipeline(min_area=5000, max_width_to_height_ratio=6): def pipeline(image: np.array): max_area = image.shape[0] * image.shape[1] * 0.99 - filter_cnts = make_filter_likely_figures( - min_area, max_area, max_width_to_height_ratio - ) + filter_cnts = make_filter_likely_figures(min_area, max_area, max_width_to_height_ratio) image = remove_primary_text_regions(image) cnts = detect_large_coherent_structures(image) diff --git a/cv_analysis/layout_parsing.py b/cv_analysis/layout_parsing.py index 3ffeecf..1e6171e 100644 --- a/cv_analysis/layout_parsing.py +++ b/cv_analysis/layout_parsing.py @@ -5,10 +5,6 @@ from operator import __and__ import cv2 import numpy as np -# from pdf2image import pdf2image - -# from cv_analysis.utils.display import show_mpl -# from cv_analysis.utils.draw import draw_rectangles from cv_analysis.utils.structures import Rectangle from cv_analysis.utils.post_processing import ( remove_overlapping, @@ -23,9 +19,7 @@ def is_likely_segment(rect, min_area=100): def find_segments(image): - contours, hierarchies = cv2.findContours( - image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE - ) + contours, hierarchies = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask1 = map(is_likely_segment, contours) mask2 = map(has_no_parent, hierarchies[0]) @@ -81,21 +75,3 @@ def parse_layout(image: np.array): rects = remove_overlapping(rects) return list(map(Rectangle.from_xywh, rects)) - - -# def annotate_layout_in_pdf(page, return_rects=False, show=False): - -# #page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] -# #page = np.array(page) - -# rects = parse_layout(page) - -# if return_rects: -# return rects, page -# elif show: -# page = draw_rectangles(page, rects) -# 
vizlogger.debug(page, "layout10_output.png") -# show_mpl(page) -# else: -# page = draw_rectangles(page, rects) -# return page diff --git a/cv_analysis/redaction_detection.py b/cv_analysis/redaction_detection.py index 3c5bf5f..b9d40d8 100644 --- a/cv_analysis/redaction_detection.py +++ b/cv_analysis/redaction_detection.py @@ -5,16 +5,12 @@ import numpy as np import pdf2image from iteration_utilities import starfilter, first -from cv_analysis.utils.display import show_mpl -from cv_analysis.utils.draw import draw_contours from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy from cv_analysis.utils.visual_logging import vizlogger def is_likely_redaction(contour, hierarchy, min_area): - return ( - is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area) - ) + return is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area) def find_redactions(image: np.array, min_normalized_area=200000): @@ -31,9 +27,7 @@ def find_redactions(image: np.array, min_normalized_area=200000): thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1] vizlogger.debug(blurred, "redactions04_threshold.png") - contours, hierarchies = cv2.findContours( - thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE - ) + contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) try: contours = map( @@ -46,16 +40,3 @@ def find_redactions(image: np.array, min_normalized_area=200000): return list(contours) except: return [] - - -# def annotate_redactions_in_pdf(page, show=False): - -# #page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] -# #page = np.array(page) - -# redaction_contours = find_redactions(page) -# page = draw_contours(page, redaction_contours) -# vizlogger.debug(page, "redactions05_output.png") - -# if show: -# show_mpl(page) diff --git a/cv_analysis/server/stream.py b/cv_analysis/server/stream.py index ae66475..a73ae05 100644 --- 
a/cv_analysis/server/stream.py +++ b/cv_analysis/server/stream.py @@ -7,7 +7,7 @@ from pyinfra.server.utils import make_streamable_and_wrap_in_packing_logic from cv_analysis.server.format import make_formatter from cv_analysis.utils.logging import get_logger -from cv_analysis.utils.preprocessing import open_img_from_bytes +from cv_analysis.utils.open_pdf import open_pdf logger = get_logger() @@ -26,7 +26,7 @@ def make_streamable_analysis_fn(analysis_fn: Callable): def analyse(data: bytes, metadata: dict): - image = open_img_from_bytes(gzip.decompress(data)) + image = open_pdf(gzip.decompress(data))[0] dpi = metadata["image_info"]["dpi"] width, height, rotation = itemgetter("width", "height", "rotation")(metadata["page_info"]) diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 52d5292..9375a0f 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -15,9 +15,7 @@ from cv_analysis.layout_parsing import parse_layout def add_external_contours(image, image_h_w_lines_only): - contours, _ = cv2.findContours( - image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE - ) + contours, _ = cv2.findContours(image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1) @@ -82,9 +80,7 @@ def isolate_vertical_and_horizontal_components(img_bin): img_bin_extended = img_bin_h | img_bin_v th1, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY) - img_bin_final = cv2.dilate( - img_bin_extended, np.ones((1, 1), np.uint8), iterations=1 - ) + img_bin_final = cv2.dilate(img_bin_extended, np.ones((1, 1), np.uint8), iterations=1) # add contours before lines are extended by blurring img_bin_final = add_external_contours(img_bin_final, img_lines_raw) @@ -137,9 +133,7 @@ def turn_connected_components_into_rects(image): x1, y1, w, h, area = stat return area > 2000 and w > 35 and h > 25 - _, _, stats, 
_ = cv2.connectedComponentsWithStats( - ~image, connectivity=8, ltype=cv2.CV_32S - ) + _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S) stats = np.vstack(list(filter(is_large_enough, stats))) return stats[:, :-1][2:] @@ -149,7 +143,7 @@ def parse_tables(image: np.array, show=False): """Runs the full table parsing process. Args: - image (np.array): single PDF page, opened as PIL.Image object and converted to a numpy array + image (np.array): single PDF page, converted to a numpy array Returns: list: list of rectangles corresponding to table cells diff --git a/cv_analysis/utils/deskew.py b/cv_analysis/utils/deskew.py deleted file mode 100644 index 98f3de3..0000000 --- a/cv_analysis/utils/deskew.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -from scipy.ndimage import rotate as rotate_ -import cv2 - -from cv_analysis.config import CONFIG - - -def rotate_straight(im: np.array, skew_angle: int) -> np.array: - h, w = im.shape[:2] - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, skew_angle, 1.0) - rotated = cv2.warpAffine( - im, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE - ) - return rotated - - -def find_score(arr, angle): - data = rotate_(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode) - hist = np.sum(data, axis=1) - score = np.sum((hist[1:] - hist[:-1]) ** 2) - return score - - -def find_best_angle(page): - lim = CONFIG.deskew.max_abs_angle - delta = CONFIG.deskew.delta - angles = np.arange(-lim, lim + delta, delta) - scores = [find_score(page, angle) for angle in angles] - best_angle = angles[scores.index(max(scores))] - return best_angle - - -def preprocess(arr: np.array): - if len(arr.shape) > 2: - arr = cv2.cvtColor(arr, cv2.COLOR_BGR2GRAY) - arr = cv2.fastNlMeansDenoising(arr, h=CONFIG.deskew.filter_strength_h) - return arr - - -def rotate(page, angle): - rotated = rotate_(page, angle, reshape=False, order=0, mode="nearest") - return rotated - - -def 
deskew_histbased(page: np.array): - page = preprocess(page) - best_angle = round(find_best_angle(page), 3) - - if CONFIG.deskew.verbose: - print("Skew angle from pixel histogram: {}".format(best_angle)) - - rotated = rotate(page, best_angle) - return (rotated, best_angle) - - -def needs_deskew(page: np.array) -> bool: - """ - Makes use of 'row-wise mean difference' - the difference between neighboring - on left and right halves - """ - - def split_rowmean_diff(page): - width = page.shape[1] - cutpoint = int(width / 2) - left = page[:, :cutpoint] - right = page[:, cutpoint:] - leftmeans = np.mean(left, axis=1) - rightmeans = np.mean(right, axis=1) - return rightmeans - leftmeans - - unrotated_score = np.mean(np.abs(split_rowmean_diff(page))) - angles = [-CONFIG.deskew.test_delta, CONFIG.deskew.test_delta] - scores = [ - np.mean(np.abs(split_rowmean_diff(rotate(page, angle)))) for angle in angles - ] - print(unrotated_score, scores) - return unrotated_score > min(scores) - - -if CONFIG.deskew.function == "hist": - deskew = lambda page: deskew_histbased(page) if needs_deskew(page) else (page, 0) -elif CONFIG.deskew.function == "identity": - deskew = lambda page: (page, None) -else: - raise ValueError( - "'{CONFIG.deskew.function}' is not a valid parameter value for CONFIG.deskew.function" - ) diff --git a/cv_analysis/utils/display.py b/cv_analysis/utils/display.py index 999c9a2..f5d9285 100644 --- a/cv_analysis/utils/display.py +++ b/cv_analysis/utils/display.py @@ -1,26 +1,34 @@ +from numpy import resize import cv2 from matplotlib import pyplot as plt -def show_mpl(image): +def show_image_cv2(image, maxdim=700): + h, w, c = image.shape + maxhw = max(h, w) + if maxhw > maxdim: + ratio = maxdim / maxhw + h = int(h * ratio) + w = int(w * ratio) + img = cv2.resize(image, (h, w)) + cv2.imshow("", img) + cv2.waitKey(0) + cv2.destroyAllWindows() + + +def show_image_mpl(image): fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(image, cmap="gray") plt.show() 
-def save_mpl(image, path): - # fig, ax = plt.subplots(1, 1) - # figure = plt.gcf() - # figure.set_size_inches(16,12) - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) - ax.imshow(image, cmap="gray") - # plt.close() - plt.savefig(path) - plt.close() +def show_image(image, backend="m"): + if backend.startswith("m"): + show_image_mpl(image) + else: + show_image_cv2(image) -def show_cv2(image): - cv2.imshow("", image) - cv2.waitKey(0) +def save_image(image, path): + cv2.imwrite(path, image) diff --git a/cv_analysis/utils/logging.py b/cv_analysis/utils/logging.py index 6fc280f..51be0fb 100644 --- a/cv_analysis/utils/logging.py +++ b/cv_analysis/utils/logging.py @@ -8,9 +8,7 @@ from cv_analysis.config import CONFIG def make_logger_getter(): logger = logging.getLogger(__name__) logger.setLevel(logging.getLevelName(CONFIG.service.logging_level)) - formatter = logging.Formatter( - fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S" - ) + formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S") ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.getLevelName(CONFIG.service.logging_level)) diff --git a/cv_analysis/utils/open_pdf.py b/cv_analysis/utils/open_pdf.py new file mode 100644 index 0000000..d704ba4 --- /dev/null +++ b/cv_analysis/utils/open_pdf.py @@ -0,0 +1,27 @@ +from numpy import array, ndarray +import pdf2image +from PIL import Image + +from cv_analysis.utils.preprocessing import preprocess_page_array + + +def open_pdf(pdf, first_page=0, last_page=None): + + first_page += 1 + last_page = None if last_page is None else last_page + 1 + + if type(pdf) == str: + if pdf.lower().endswith((".png", ".jpg", ".jpeg")): + pages = [Image.open(pdf)] + elif pdf.lower().endswith(".pdf"): + pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page) + else: + raise IOError("Invalid file extension. 
Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf") + elif type(pdf) == bytes: + pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page) + elif type(pdf) in {list, ndarray}: + return pdf + + pages = [preprocess_page_array(array(p)) for p in pages] + + return pages diff --git a/cv_analysis/utils/post_processing.py b/cv_analysis/utils/post_processing.py index 46da1dc..1749f2d 100644 --- a/cv_analysis/utils/post_processing.py +++ b/cv_analysis/utils/post_processing.py @@ -18,21 +18,11 @@ def remove_overlapping(rectangles): def remove_included(rectangles): def included(a, b): - return ( - b.xmin >= a.xmin - and b.ymin >= a.ymin - and b.xmax <= a.xmax - and b.ymax <= a.ymax - ) + return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax def includes(a, b, tol=3): """does a include b?""" - return ( - b.xmin + tol >= a.xmin - and b.ymin + tol >= a.ymin - and b.xmax - tol <= a.xmax - and b.ymax - tol <= a.ymax - ) + return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax def is_not_included(rect, rectangles): return not any(includes(r2, rect) for r2 in rectangles if not rect == r2) @@ -110,9 +100,7 @@ def __remove_isolated_sorted(rectangles): def remove_isolated(rectangles, input_sorted=False): - return (__remove_isolated_sorted if input_sorted else __remove_isolated_unsorted)( - rectangles - ) + return (__remove_isolated_sorted if input_sorted else __remove_isolated_unsorted)(rectangles) Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") diff --git a/cv_analysis/utils/preprocessing.py b/cv_analysis/utils/preprocessing.py index d51139a..c3269d4 100644 --- a/cv_analysis/utils/preprocessing.py +++ b/cv_analysis/utils/preprocessing.py @@ -1,41 +1,29 @@ -from io import BytesIO -from numpy import array, ndarray -import pdf2image -from PIL import Image +from numpy import frombuffer, ndarray import cv2 -def preprocess_pdf_image(page): +def 
preprocess_page_array(page): if len(page.shape) > 2: page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) page = cv2.fastNlMeansDenoising(page, h=3) return page -def open_pdf(pdf, first_page=0, last_page=None): +def page2image(page): - first_page += 1 - last_page = None if last_page is None else last_page + 1 - - if type(pdf) == str: - if pdf.lower().endswith((".png", ".jpg", ".jpeg")): - pages = [Image.open(pdf)] - else: # assume pdf as default file type for a path argument - pages = pdf2image.convert_from_path( - pdf, first_page=first_page, last_page=last_page + if type(page) == bytes: + page = frombuffer(page) + elif type(page) == ndarray: + page = page + elif type(page) == str: + if page.lower().endswith((".png", ".jpg", ".jpeg")): + page = cv2.imread(page) + else: + raise IOError( + "PDFs are not a valid input type for cv-analysis." + " Use PNGs for tests and NumPy arrays for deployment." ) - elif type(pdf) == bytes: - pages = pdf2image.convert_from_bytes( - pdf, first_page=first_page, last_page=last_page - ) - elif type(pdf) in {list, ndarray}: - return pdf + else: + raise TypeError("Incompatible datatype. 
Expected bytes, numpy.ndarray, or path to an image file.") - pages = [preprocess_pdf_image(array(p)) for p in pages] - - return pages - - -def open_img_from_bytes(bytes_obj: bytes): - page = Image.open(BytesIO(bytes_obj)) - return preprocess_pdf_image(array(page)) + return preprocess_page_array(page) diff --git a/cv_analysis/utils/test_metrics.py b/cv_analysis/utils/test_metrics.py index fd0eca6..8df3d00 100644 --- a/cv_analysis/utils/test_metrics.py +++ b/cv_analysis/utils/test_metrics.py @@ -75,11 +75,7 @@ def compute_document_score(results_dict, annotation_dict): scores = [] for i in range(len(annotation_dict["pages"])): - scores.append( - compute_page_iou( - results_dict["pages"][i]["cells"], annotation_dict["pages"][i]["cells"] - ) - ) + scores.append(compute_page_iou(results_dict["pages"][i]["cells"], annotation_dict["pages"][i]["cells"])) scores = np.array(scores) doc_score = np.average(scores, weights=page_weights) diff --git a/cv_analysis/utils/visual_logging.py b/cv_analysis/utils/visual_logging.py index 983b546..e088dbe 100644 --- a/cv_analysis/utils/visual_logging.py +++ b/cv_analysis/utils/visual_logging.py @@ -1,6 +1,6 @@ import os from cv_analysis.config import CONFIG -from cv_analysis.utils.display import save_mpl +from cv_analysis.utils.display import save_image class VisualLogger: @@ -12,7 +12,7 @@ class VisualLogger: def _save(self, img, name): output_path = os.path.join(self.output_folder, name) - save_mpl(img, output_path) + save_image(img, output_path) def info(self, img, name): if self._level_is_info(): @@ -36,6 +36,4 @@ class VisualLogger: return self.level == "ALL" -vizlogger = VisualLogger( - CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder -) +vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder) diff --git a/incl/pyinfra b/incl/pyinfra index 7e948a4..1e70d49 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 7e948a4cf05a3ef59fcc7e8719fcf910adc73864 +Subproject 
commit 1e70d49531e89613c70903be49290b94ee014f65 diff --git a/scripts/annotate.py b/scripts/annotate.py index cac9b45..e899ec8 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -8,9 +8,9 @@ python scripts/annotate.py /home/iriley/Documents/pdf/scanned/10.pdf 5 --type fi import argparse -from cv_analysis.utils.display import show_mpl +from cv_analysis.utils.display import show_image from cv_analysis.utils.draw import draw_contours, draw_rectangles -from cv_analysis.utils.preprocessing import open_pdf +from cv_analysis.utils.open_pdf import open_pdf from cv_analysis.utils.visual_logging import vizlogger @@ -28,7 +28,7 @@ def annotate_page(page_image, analysis_function, drawing_function, name="tmp.png result = analysis_function(page_image) page_image = drawing_function(page_image, result) vizlogger.debug(page_image, "redactions05_output.png") - show_mpl(page_image) + show_image(page_image) if __name__ == "__main__": @@ -46,5 +46,6 @@ if __name__ == "__main__": from cv_analysis.layout_parsing import parse_layout as analyze elif args.type == "figure": from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline + analyze = make_figure_detection_pipeline() annotate_page(page, analyze, draw, name=name, show=args.show) diff --git a/scripts/deskew_demo.py b/scripts/deskew_demo.py deleted file mode 100644 index b09a342..0000000 --- a/scripts/deskew_demo.py +++ /dev/null @@ -1,50 +0,0 @@ -# sample usage: python3 scripts/deskew_demo.py /path/to/crooked.pdf 0 -import argparse -import numpy as np -import pdf2image -from PIL import Image - -from cv_analysis.utils.deskew import deskew_histbased # , deskew_linebased -from cv_analysis.utils.display import show_mpl -from cv_analysis.utils.draw import draw_stats -from cv_analysis.table_parsing import parse_tables - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("pdf_path") - parser.add_argument("page_index", type=int) - parser.add_argument("--save_path") 
- - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - args = parse_args() - page = pdf2image.convert_from_path(args.pdf_path, first_page=args.page_index + 1, last_page=args.page_index + 1)[0] - page = np.array(page) - - show_mpl(page) - # page_ = deskew_linebased(page, verbose=True) - # show_mpl(page_) - page_corr, _ = deskew_histbased(page, verbose=True) - show_mpl(page_corr) - if args.save_path: - page_ = Image.fromarray(page).convert("RGB") - page_.save(args.save_path.replace(".pdf", "_uncorrected.pdf")) - page_corr_ = Image.fromarray(page_corr).convert("RGB") - page_corr_.save(args.save_path.replace(".pdf", "_corrected.pdf")) - # annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) - stats = parse_tables(page) - page = draw_stats(page, stats) - show_mpl(page) - stats_corr = parse_tables(page_corr) - page_corr = draw_stats(page_corr, stats_corr) - show_mpl(page_corr) - if args.save_path: - page = Image.fromarray(page).convert("RGB") - page.save(args.save_path.replace(".pdf", "_uncorrected_annotated.pdf")) - page_corr = Image.fromarray(page_corr).convert("RGB") - page_corr.save(args.save_path.replace(".pdf", "_corrected_annotated.pdf")) diff --git a/scripts/pyinfra_mock.py b/scripts/pyinfra_mock.py index 1717521..6d45b4d 100644 --- a/scripts/pyinfra_mock.py +++ b/scripts/pyinfra_mock.py @@ -1,16 +1,11 @@ import argparse -import base64 import gzip -import io -import json from operator import itemgetter from typing import List import fitz import pdf2image -from PIL import Image from funcy import lmap, compose, pluck -from funcy import lpluck from pyinfra.default_objects import get_component_factory @@ -45,13 +40,13 @@ def draw_cells_on_page(cells: List[dict], page): def annotate_results_on_pdf(results, pdf_path, result_path): - open_pdf = fitz.open(pdf_path) + opened_pdf = fitz.open(pdf_path) metadata_per_page = pluck("metadata", results) - for page, metadata in zip(open_pdf, metadata_per_page): + for page, metadata in 
zip(opened_pdf, metadata_per_page): if metadata: draw_cells_on_page(metadata["cells"], page) - open_pdf.save(result_path) + opened_pdf.save(result_path) def main(args): diff --git a/test/conftest.py b/test/conftest.py index 8ccc497..6c15491 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,6 +1,5 @@ pytest_plugins = [ "test.fixtures.table_parsing", - "test.fixtures.server", "test.fixtures.figure_detection", ] diff --git a/test/fixtures/figure_detection.py b/test/fixtures/figure_detection.py index 386e4a7..f36ae92 100644 --- a/test/fixtures/figure_detection.py +++ b/test/fixtures/figure_detection.py @@ -3,21 +3,18 @@ import textwrap import cv2 import numpy as np import pytest -from PIL import Image from lorem_text import lorem -from funcy import first + from cv_analysis.figure_detection.figure_detection_pipeline import ( make_figure_detection_pipeline, ) -from cv_analysis.utils.display import show_mpl @pytest.fixture def page_with_images(random_image, n_images, background): - page_image = Image.fromarray(background.astype("uint8")).convert("RGB") - page_image = paste_image(page_image, random_image, (200, 200)) - if n_images == 2: - page_image = paste_image(page_image, random_image, (1000, 2600)) + page_image = paste_image(background, random_image, (200, 200)) + # if n_images == 2: # TODO: Adjust image paste position, might be out of bounds + # page_image = paste_image(page_image, random_image, (1000, 2600)) return np.array(page_image) @@ -32,14 +29,10 @@ def page_with_text(background, font_scale, font_style, text_types): cursor = (image.shape[1] // 2, 70) image = paste_text(image, cursor, font_scale, font_style, y_stop=body_height) cursor = (50, body_height + 70) - image = paste_text( - image, cursor, font_scale, font_style, y_stop=body_height * 2 - ) + image = paste_text(image, cursor, font_scale, font_style, y_stop=body_height * 2) if "caption" in text_types: cursor = (image.shape[1] // 2, image.shape[0] - 100) - image = paste_text( - image, cursor, 
font_scale, font_style, y_stop=body_height * 3 - ) + image = paste_text(image, cursor, font_scale, font_style, y_stop=body_height * 3) return image @@ -67,9 +60,7 @@ def paste_text(image: np.ndarray, cursor, font_scale, font_style, y_stop): def paste_text_at_cursor(x_start, y_start, y_stop): # TODO: adjust incorrect right margin text = lorem.paragraphs(1) * 200 - (dx, dy), base = cv2.getTextSize( - text, fontFace=font_style, fontScale=font_scale, thickness=1 - ) + (dx, dy), base = cv2.getTextSize(text, fontFace=font_style, fontScale=font_scale, thickness=1) dy += base # char_width = dx // len(text) text = textwrap.fill(text=text, width=(dx // page_width)) @@ -95,6 +86,7 @@ def paste_text(image: np.ndarray, cursor, font_scale, font_style, y_stop): def paste_image(page_image, image, coords): - image = Image.fromarray(image.astype("uint8")).convert("RGBA") - page_image.paste(image, coords) + h, w = image.shape[:2] + x, y = coords + page_image[x : x + h, y : y + w] = image return page_image diff --git a/test/fixtures/server.py b/test/fixtures/server.py index 982b89a..0ecec7e 100644 --- a/test/fixtures/server.py +++ b/test/fixtures/server.py @@ -2,8 +2,8 @@ import gzip import io import numpy as np +import cv2 import pytest -from PIL import Image from funcy import first from cv_analysis.utils.structures import Rectangle @@ -12,7 +12,7 @@ from incl.pyinfra.pyinfra.server.packing import bytes_to_string @pytest.fixture def random_image_as_bytes_and_compressed(random_image): - image = Image.fromarray(random_image.astype("uint8")).convert("RGBA") + image = cv2.cvtColor(random_image.astype("uint8"), cv2.COLOR_RGB2RGBA) img_byte_arr = io.BytesIO() image.save(img_byte_arr, format="PNG") return gzip.compress(img_byte_arr.getvalue()) diff --git a/test/fixtures/table_parsing.py b/test/fixtures/table_parsing.py index 94e8cb0..ccd5207 100644 --- a/test/fixtures/table_parsing.py +++ b/test/fixtures/table_parsing.py @@ -6,7 +6,7 @@ from funcy import first from cv_analysis.locations 
import TEST_DATA_DIR from cv_analysis.utils.draw import draw_rectangles -from cv_analysis.utils.preprocessing import open_pdf +from cv_analysis.utils.open_pdf import open_pdf from test.fixtures.figure_detection import paste_text @@ -24,9 +24,7 @@ def expected_table_annotation(test_file_index): @pytest.fixture -def page_with_table( - background, table_shape, table_style, n_tables, line_thickness, line_type -): +def page_with_table(background, table_shape, table_style, n_tables, line_thickness, line_type): page = draw_table( background, (100, 100), @@ -36,9 +34,7 @@ def page_with_table( line_type=line_type, ) if n_tables == 2: - page = draw_table( - page, (200, 2000), table_shape, table_style, line_thickness, line_type - ) + page = draw_table(page, (200, 2000), table_shape, table_style, line_thickness, line_type) return page @@ -205,9 +201,7 @@ def expected_gold_page_with_table(page_with_table, n_tables): return result -def draw_table( - page, table_position, table_shape, table_style, line_thickness, line_type -): +def draw_table(page, table_position, table_shape, table_style, line_thickness, line_type): bbox_table = (*table_position, 1500, 1000) page = draw_grid_lines( page, diff --git a/test/unit_tests/figure_detection/figure_detection_pipeline_test.py b/test/unit_tests/figure_detection/figure_detection_pipeline_test.py index b51e7a4..95747ad 100644 --- a/test/unit_tests/figure_detection/figure_detection_pipeline_test.py +++ b/test/unit_tests/figure_detection/figure_detection_pipeline_test.py @@ -14,7 +14,7 @@ class TestFindPrimaryTextRegions: assert not list(results) @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) - @pytest.mark.parametrize("n_images", [1, 2]) + @pytest.mark.parametrize("n_images", [1]) def test_page_without_text_yields_figures(self, figure_detection_pipeline, page_with_images, image_size): results = figure_detection_pipeline(page_with_images) result_figures_size = map(lambda x: (x.w, x.h), results) diff --git 
a/test/unit_tests/figure_detection/text_test.py b/test/unit_tests/figure_detection/text_test.py index 6983d79..794763b 100644 --- a/test/unit_tests/figure_detection/text_test.py +++ b/test/unit_tests/figure_detection/text_test.py @@ -6,7 +6,7 @@ from cv_analysis.figure_detection.text import ( remove_primary_text_regions, apply_threshold_to_image, ) -from cv_analysis.utils.display import show_mpl +from cv_analysis.utils.display import show_image from test.utils.utils import powerset @@ -25,33 +25,19 @@ class TestFindPrimaryTextRegions: np.testing.assert_equal(result_page, apply_threshold_to_image(page_with_images)) @pytest.mark.parametrize("font_scale", [1, 1.5, 2]) - @pytest.mark.parametrize( - "font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX] - ) + @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX]) @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"])) - def test_page_with_only_text_gets_text_removed( - self, page_with_text, error_tolerance - ): + def test_page_with_only_text_gets_text_removed(self, page_with_text, error_tolerance): result_page = remove_primary_text_regions(page_with_text) - relative_error = ( - np.sum(result_page != apply_threshold_to_image(page_with_text)) - / result_page.size - ) + relative_error = np.sum(result_page != apply_threshold_to_image(page_with_text)) / result_page.size assert relative_error <= error_tolerance @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) @pytest.mark.parametrize("n_images", [1, 2]) @pytest.mark.parametrize("font_scale", [1, 1.5, 2]) - @pytest.mark.parametrize( - "font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX] - ) + @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX]) @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"])) - def test_page_with_images_and_text_keeps_images( - self, page_with_images_and_text, error_tolerance 
- ): + def test_page_with_images_and_text_keeps_images(self, page_with_images_and_text, error_tolerance): result_page = remove_primary_text_regions(page_with_images_and_text) - relative_error = ( - np.sum(result_page != apply_threshold_to_image(page_with_images_and_text)) - / result_page.size - ) + relative_error = np.sum(result_page != apply_threshold_to_image(page_with_images_and_text)) / result_page.size assert relative_error <= error_tolerance diff --git a/test/unit_tests/server/__init__.py b/test/unit_tests/server/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/unit_tests/server/formatted_stream_fn_test.py b/test/unit_tests/server/formatted_stream_fn_test.py deleted file mode 100644 index 2cf1e3a..0000000 --- a/test/unit_tests/server/formatted_stream_fn_test.py +++ /dev/null @@ -1,15 +0,0 @@ -import pytest -from funcy import first - -from cv_analysis.server.stream import make_streamable_analysis_fn - - -@pytest.mark.parametrize("operation", ["mock"]) -@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) -def test_make_analysis_fn( - analysis_fn_mock, random_image_metadata_package, expected_analyse_metadata -): - analyse = make_streamable_analysis_fn(analysis_fn_mock) - results = first(analyse(random_image_metadata_package)) - - assert results["metadata"] == expected_analyse_metadata From a871fa3bd327e7f39305dd0626421eaa81871845 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 22 Jul 2022 15:11:40 +0200 Subject: [PATCH 3/3] Pull request #19: Refactor evaluate Merge in RR/cv-analysis from refactor-evaluate to master Squashed commit of the following: commit cde03a492452610322f8b7d3eb804a51afb76d81 Author: Julius Unverfehrt Date: Fri Jul 22 12:37:36 2022 +0200 add optional show analysis metadata dict commit fb8bb9e2afa7767f2560f865516295be65f97f20 Author: Julius Unverfehrt Date: Fri Jul 22 12:13:18 2022 +0200 add script to evaluate runtime per page for all cv-analysis operations for multiple PDFs 
commit 721e823e2ec38aae3fea51d01e2135fc8f228d94 Author: Julius Unverfehrt Date: Fri Jul 22 10:30:31 2022 +0200 refactor commit a453753cfa477e162e5902ce191ded61cb678337 Author: Julius Unverfehrt Date: Fri Jul 22 10:19:24 2022 +0200 add logic to transform result coordinates accordingly to page rotation, update annotation script to use this logic commit 71c09758d0fb763a2c38c6871e1d9bf51f2e7c41 Author: Julius Unverfehrt Date: Thu Jul 21 15:57:49 2022 +0200 introduce pipeline for image conversion, analysis and result formatting commit aef252a41b9658dd0c4f55aa2d9f84de933586e0 Author: Julius Unverfehrt Date: Thu Jul 21 15:57:38 2022 +0200 introduce pipeline for image conversion, analysis and result formatting --- cv_analysis/server/pipeline.py | 64 +++++++++++++ cv_analysis/server/{format.py => rotate.py} | 36 ++++--- cv_analysis/server/stream.py | 43 --------- .../utils/{pdf2array.py => pdf2image.py} | 25 +++-- scripts/annotate_pdf.py | 55 +++++------ scripts/measure_runtimes.py | 96 +++++++++++++++++++ .../{pdf2array_test.py => pdf2image_test.py} | 10 +- 7 files changed, 221 insertions(+), 108 deletions(-) create mode 100644 cv_analysis/server/pipeline.py rename cv_analysis/server/{format.py => rotate.py} (83%) delete mode 100644 cv_analysis/server/stream.py rename cv_analysis/utils/{pdf2array.py => pdf2image.py} (61%) create mode 100644 scripts/measure_runtimes.py rename test/unit_tests/{pdf2array_test.py => pdf2image_test.py} (51%) diff --git a/cv_analysis/server/pipeline.py b/cv_analysis/server/pipeline.py new file mode 100644 index 0000000..ef890f0 --- /dev/null +++ b/cv_analysis/server/pipeline.py @@ -0,0 +1,64 @@ +from functools import partial +from typing import Callable + +from funcy import lmap + +from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline +from cv_analysis.layout_parsing import parse_layout +from cv_analysis.server.rotate import rotate_rectangle +from cv_analysis.table_parsing import parse_tables +from 
cv_analysis.utils.logging import get_logger +from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs +from cv_analysis.utils.structures import Rectangle + +logger = get_logger() + + +def make_analysis_pipeline(analysis_fn: Callable, dpi=200): + """Make end-to-end pipeline to analyse a PDF with given analysis function. + The pipeline returns a Generator of dicts containing page information and the analysis results. + + Steps: + Convert PDF to Arrays and page information + Analyse pages, get list of bboxes per page (e.g. table cells) + Convert pixel values to inches + Rotate results if page is rotated + Format results to stream of dictionaries + """ + + def pipeline(pdf: bytes, index=None): + image_metadata_pairs = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi) + results = map(image_metadata_pair_to_results, image_metadata_pairs) + results_filtered = filter(lambda x: x["bboxes"], results) + return results_filtered + + def image_metadata_pair_to_results(image_metadata_pair): + rectangles = analysis_fn(image_metadata_pair.image) + rectangles = map(partial(pixel_rect_to_inches_rect, dpi=dpi), rectangles) + if image_metadata_pair.metadata["rotation"] != 0: + rotate_rectangle_fn = partial(rotate_rectangle, metadata=image_metadata_pair.metadata) + rectangles = map(rotate_rectangle_fn, rectangles) + bboxes = lmap(lambda x: x.json_xyxy(), rectangles) + return {**image_metadata_pair.metadata, "bboxes": bboxes} + + return pipeline + + +def get_analysis_fn(analysis_type): + if analysis_type == "table": + return parse_tables + elif analysis_type == "layout": + return parse_layout + elif analysis_type == "figure": + return make_figure_detection_pipeline() + else: + raise + + +def pixel_rect_to_inches_rect(rect, dpi): + def convert_pixel_to_inch(pixel): + return pixel / dpi * 72 + + bbox = rect.x1, rect.y1, rect.x2, rect.y2 + bbox_inches = tuple(map(convert_pixel_to_inch, bbox)) + return Rectangle.from_xyxy(bbox_inches, discrete=False) diff --git 
a/cv_analysis/server/format.py b/cv_analysis/server/rotate.py similarity index 83% rename from cv_analysis/server/format.py rename to cv_analysis/server/rotate.py index 6e00991..ec9a867 100644 --- a/cv_analysis/server/format.py +++ b/cv_analysis/server/rotate.py @@ -1,35 +1,25 @@ from _operator import itemgetter -from functools import partial import numpy as np from cv_analysis.utils.structures import Rectangle -def make_formatter(dpi, page_size, rotation): +def rotate_rectangle(rectangle, metadata): + width, height, rotation = itemgetter("width", "height", "rotation")(metadata) rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation - def format_(key2pixel): - convert = partial(convert_pixel_to_inch, dpi=dpi) - x, y, w, h = map(convert, itemgetter("x", "y", "width", "height")(key2pixel)) - x1, y1 = x + w, y + h - matrix = np.vstack([[x, y], [x1, y1]]).T - new_matrix = rotate_and_shift(matrix, rotation, page_size) - x1, x2 = sorted(new_matrix[0, :]) - y1, y2 = sorted(new_matrix[1, :]) - return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False).json_xywh() + if rotation in [1, 3]: + width, height = height, width - return format_ + x1, y1, x2, y2 = rectangle.xyxy() + matrix = np.vstack([[x1, y1], [x2, y2]]).T + new_matrix = rotate_and_shift(matrix, rotation, (width, height)) + x1, x2 = sorted(new_matrix[0, :]) + y1, y2 = sorted(new_matrix[1, :]) -def convert_pixel_to_inch(pixel, dpi): - return pixel / dpi * 72 - - -def rotate(input_matrix, radians): - rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]]) - - return np.dot(rotation_matrix, input_matrix) + return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False) def rotate_and_shift(matrix, rotation, size, debug=False): @@ -109,3 +99,9 @@ def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_sh axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue") plt.show() + + +def 
rotate(input_matrix, radians): + rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]]) + + return np.dot(rotation_matrix, input_matrix) diff --git a/cv_analysis/server/stream.py b/cv_analysis/server/stream.py deleted file mode 100644 index a73ae05..0000000 --- a/cv_analysis/server/stream.py +++ /dev/null @@ -1,43 +0,0 @@ -import gzip -from operator import itemgetter -from typing import Callable - -from funcy import lmap -from pyinfra.server.utils import make_streamable_and_wrap_in_packing_logic - -from cv_analysis.server.format import make_formatter -from cv_analysis.utils.logging import get_logger -from cv_analysis.utils.open_pdf import open_pdf - -logger = get_logger() - - -def make_streamable_analysis_fn(analysis_fn: Callable): - """Makes an analysis function streamable for pyinfra server logic. The wrapped function then - works with data and metadata and returns a tuple or generator of tuples with data and metadata. - For more information about the server logic, see the PyInfra documentation. 
- - Args: - analysis_fn: cv-analysis function - - Returns: - wrapped function - """ - - def analyse(data: bytes, metadata: dict): - - image = open_pdf(gzip.decompress(data))[0] - - dpi = metadata["image_info"]["dpi"] - width, height, rotation = itemgetter("width", "height", "rotation")(metadata["page_info"]) - - formatter = make_formatter(dpi, (width, height), rotation) - - results = map(lambda x: x.json_xywh(), analysis_fn(image)) - results = {"cells": (lmap(formatter, results))} - - logger.debug(f"Page {metadata['page_info'].get('index', '')}: Found {len(results['cells'])} cells.") - - return b"", {**metadata, **results} - - return make_streamable_and_wrap_in_packing_logic(analyse, batched=False) diff --git a/cv_analysis/utils/pdf2array.py b/cv_analysis/utils/pdf2image.py similarity index 61% rename from cv_analysis/utils/pdf2array.py rename to cv_analysis/utils/pdf2image.py index 8ce1ea6..b5da78d 100644 --- a/cv_analysis/utils/pdf2array.py +++ b/cv_analysis/utils/pdf2image.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from functools import partial from typing import Iterator, Tuple @@ -5,20 +6,25 @@ import fitz import numpy as np -def pdf_to_array_and_metadata(pdf: bytes, index=None, dpi=200) -> Iterator[Tuple[np.ndarray, dict]]: - """Stream the pages of a PDF as Tuples of page as matrix representation and page metadata. - Note: If Index is not given or evaluates to None, the whole PDF will be processed. - """ - convert_fn = partial(page_to_array_and_metadata, dpi=dpi) +@dataclass +class ImageMetadataPair: + image: np.ndarray + metadata: dict + + +def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]: + """Streams PDF as pairs of image (matrix) and metadata. 
+ Note: If Index is not given or evaluates to None, the whole PDF will be processed.""" + convert_fn = partial(page_to_image_metadata_pair, dpi=dpi) yield from map(convert_fn, stream_pages(pdf, index)) -def page_to_array_and_metadata(page: fitz.Page, dpi): - metadata = get_page_info(page, dpi) +def page_to_image_metadata_pair(page: fitz.Page, dpi): + metadata = get_page_info(page) pixmap = page.get_pixmap(dpi=dpi) array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - return array, metadata + return ImageMetadataPair(array, metadata) def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: @@ -30,11 +36,10 @@ def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: yield pdf_handle[i] -def get_page_info(page, dpi): +def get_page_info(page): return { "index": page.number, "rotation": page.rotation, "width": page.rect.width, # rotated page width in inches "height": page.rect.height, # rotated page height in inches - "dpi": dpi, } diff --git a/scripts/annotate_pdf.py b/scripts/annotate_pdf.py index aead8f6..44c48e7 100644 --- a/scripts/annotate_pdf.py +++ b/scripts/annotate_pdf.py @@ -1,15 +1,11 @@ import argparse -from itertools import starmap +import json +from operator import itemgetter from pathlib import Path -from PIL import Image -from funcy import lmap +import fitz -from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline -from cv_analysis.layout_parsing import parse_layout -from cv_analysis.table_parsing import parse_tables -from cv_analysis.utils.draw import draw_rectangles -from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata +from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline def parse_args(): @@ -17,36 +13,35 @@ def parse_args(): parser.add_argument("pdf_path") parser.add_argument("output_folder") parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True) + parser.add_argument("--verbose", 
action="store_true") + parser.add_argument("--silent", dest="verbose", action="store_false") + parser.set_defaults(verbose=False) return parser.parse_args() -def analyse_and_annotate(images, analysis_fn): - result = map(analysis_fn, images) - annotated_images = starmap(draw_rectangles, zip(images, result)) - return annotated_images +def analyse_annotate_save(pdf, analysis_type, output_path, verbose): + pipe = make_analysis_pipeline(get_analysis_fn(analysis_type)) + results = list(pipe(pdf)) + if verbose: + print(json.dumps(results, indent=2)) -def save_as_pdf(images, output_folder, file_name, operation): - Path(output_folder).mkdir(parents=True, exist_ok=True) - images = lmap(Image.fromarray, images) - images[0].save(f"{output_folder}/{file_name}_annotated_{operation}.pdf", save_all=True, append_images=images) - - -def get_analysis_fn(analysis_type): - if analysis_type == "table": - return parse_tables - elif analysis_type == "layout": - return parse_layout - elif analysis_type == "figure": - return make_figure_detection_pipeline() - else: - raise + with fitz.open(stream=pdf) as pdf_handle: + for result in results: + page = pdf_handle[result["index"]] + for rect in result["bboxes"]: + x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(rect) + page.draw_rect((x1, y1, x2, y2), color=(0.5, 0.7, 0.2), width=2) + pdf_handle.save(output_path) if __name__ == "__main__": args = parse_args() + with open(args.pdf_path, "rb") as f: pdf_bytes = f.read() - images, _ = zip(*pdf_to_array_and_metadata(pdf_bytes)) - annotated_pages = analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type)) - save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type) + + Path(args.output_folder).mkdir(parents=True, exist_ok=True) + output_path = f"{args.output_folder}/{Path(args.pdf_path).stem}_annotated_{args.type}.pdf" + + analyse_annotate_save(pdf_bytes, args.type, output_path, args.verbose) diff --git a/scripts/measure_runtimes.py 
b/scripts/measure_runtimes.py new file mode 100644 index 0000000..656dfcb --- /dev/null +++ b/scripts/measure_runtimes.py @@ -0,0 +1,96 @@ +import argparse +import time +from functools import partial +from pathlib import Path + +import fitz +import numpy as np +from funcy import lmap +from matplotlib import pyplot as plt + +from cv_analysis.server.pipeline import make_analysis_pipeline, get_analysis_fn + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("pdf_folder", help="Path to folder with PDFs to evaluate") + parser.add_argument("output_folder", help="Path to folder where the Runtime plot should be stored") + parser.add_argument("n_runs", help="Number of runs per test") + return parser.parse_args() + + +def measure(fn, n_runs): + def run(*args, **kwargs): + def _run(): + start = time.time() + results = list(fn(*args, **kwargs)) # Evaluate generators + end = time.time() + return end - start + + runtimes = [_run() for _ in range(n_runs)] + return np.mean(runtimes), np.std(runtimes) + + return run + + +def run_tests(pdf, test_cases, n_runs): + def measure_analysis_pipe(test_case): + timed_analysis_pipe = measure(make_analysis_pipeline(get_analysis_fn(test_case)), n_runs) + return timed_analysis_pipe(pdf) + + return lmap(measure_analysis_pipe, test_cases) + + +def to_ms_per_page(runtime, page_count): + ms_per_page = runtime / page_count * 1000 + return round(ms_per_page, 0) + + +def measure_pdf(pdf_path, n_runs): + with open(pdf_path, "rb") as f: + pdf = f.read() + page_count = fitz.open(stream=pdf).page_count + format_fn = partial(to_ms_per_page, page_count=page_count) + + means, std = zip(*run_tests(pdf, test_cases, n_runs=n_runs)) + means, std = lmap(format_fn, means), lmap(format_fn, std) + return means, std + + +def plot_results_and_save(results, labels, n_runs, test_pdf_paths): + fig, ax = plt.subplots() + width = 0.2 + x_labels = np.arange(len(labels)) + plt.xticks(ticks=x_labels, labels=labels, rotation=90) + 
plt.grid(linestyle="dotted") + + for idx, (result, test_pdf_path) in enumerate(zip(results, test_pdf_paths)): + x = x_labels + idx * width + means, std = result + bars = ax.bar(x, means, width, yerr=std, label=f"{test_pdf_path.stem}") + ax.bar_label(bars) + ax.set_ylabel("ms/page") + ax.set_xlabel("Cv-analysis operation") + ax.set_title(f"Cv-analysis runtime estimation {n_runs=}") + ax.legend(loc=0) + + Path(args.output_folder).mkdir(parents=True, exist_ok=True) + output_path = f"{args.output_folder}/cv_analysis_runtime_{n_runs=}.png" + plt.savefig(output_path, dpi=200, bbox_inches="tight", pad_inches=0.5) + plt.close() + + +def measure_and_save_plot(args, test_cases): + n_runs = int(args.n_runs) + measure_pdf_fn = partial(measure_pdf, n_runs=n_runs) + test_pdf_paths = list(Path(args.pdf_folder).glob("*.pdf")) + results = lmap(measure_pdf_fn, test_pdf_paths) + plot_results_and_save(results, test_cases, n_runs, test_pdf_paths) + + +if __name__ == "__main__": + + test_cases = ["table", "layout", "figure"] + + args = parse_args() + measure_and_save_plot(args, test_cases) diff --git a/test/unit_tests/pdf2array_test.py b/test/unit_tests/pdf2image_test.py similarity index 51% rename from test/unit_tests/pdf2array_test.py rename to test/unit_tests/pdf2image_test.py index 1820ed8..4a44a26 100644 --- a/test/unit_tests/pdf2array_test.py +++ b/test/unit_tests/pdf2image_test.py @@ -2,7 +2,7 @@ import fitz import numpy as np import pytest -from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata +from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs @pytest.fixture @@ -17,8 +17,8 @@ def pdf(n_pages): @pytest.mark.parametrize("n_pages", [1]) def test_pdf_to_array_and_metadata(pdf): - for array, metadata in pdf_to_array_and_metadata(pdf): - assert isinstance(array, np.ndarray) - assert array.shape == (2339, 1653, 3) # Height, Width, Color channels + for image_metadata_pair in pdf_to_image_metadata_pairs(pdf): + assert isinstance(image_metadata_pair.image, 
np.ndarray) + assert image_metadata_pair.image.shape == (2339, 1653, 3) # Height, Width, Color channels - assert isinstance(metadata, dict) + assert isinstance(image_metadata_pair.metadata, dict)