From a2451b910328be44ab651fe97c23141b98174112 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Wed, 20 Jul 2022 11:01:55 +0200 Subject: [PATCH 1/3] Pull request #17: Add pdf2array func Merge in RR/cv-analysis from add-pdf2array-func to master Squashed commit of the following: commit 6e6e9a509ede0abf28fb93a2042960efcc9453bd Author: Julius Unverfehrt Date: Wed Jul 20 09:12:01 2022 +0200 update script with layout parsing, refactor pdf2array commit 191bc71f58aa5c07b0cadbdb7067cd72c3d8858b Author: Julius Unverfehrt Date: Wed Jul 20 09:10:06 2022 +0200 update script with layout parsing, refactor pdf2array commit 25201bbb4151a23784193181272d379232877d2f Author: Julius Unverfehrt Date: Wed Jul 20 08:33:20 2022 +0200 add pdf2array functionality --- cv_analysis/utils/pdf2array.py | 40 ++++++++++++++++++++++++ requirements.txt | 5 ++- scripts/annotate_figures.py | 38 ---------------------- scripts/annotate_pdf.py | 52 +++++++++++++++++++++++++++++++ test/unit_tests/pdf2array_test.py | 24 ++++++++++++++ 5 files changed, 120 insertions(+), 39 deletions(-) create mode 100644 cv_analysis/utils/pdf2array.py delete mode 100644 scripts/annotate_figures.py create mode 100644 scripts/annotate_pdf.py create mode 100644 test/unit_tests/pdf2array_test.py diff --git a/cv_analysis/utils/pdf2array.py b/cv_analysis/utils/pdf2array.py new file mode 100644 index 0000000..8ce1ea6 --- /dev/null +++ b/cv_analysis/utils/pdf2array.py @@ -0,0 +1,40 @@ +from functools import partial +from typing import Iterator, Tuple + +import fitz +import numpy as np + + +def pdf_to_array_and_metadata(pdf: bytes, index=None, dpi=200) -> Iterator[Tuple[np.ndarray, dict]]: + """Stream the pages of a PDF as Tuples of page as matrix representation and page metadata. + Note: If Index is not given or evaluates to None, the whole PDF will be processed. 
+ """ + convert_fn = partial(page_to_array_and_metadata, dpi=dpi) + yield from map(convert_fn, stream_pages(pdf, index)) + + +def page_to_array_and_metadata(page: fitz.Page, dpi): + metadata = get_page_info(page, dpi) + pixmap = page.get_pixmap(dpi=dpi) + array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) + + return array, metadata + + +def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: + with fitz.open(stream=pdf) as pdf_handle: + if not index: + yield from pdf_handle + else: + for i in index: + yield pdf_handle[i] + + +def get_page_info(page, dpi): + return { + "index": page.number, + "rotation": page.rotation, + "width": page.rect.width, # rotated page width in inches + "height": page.rect.height, # rotated page height in inches + "dpi": dpi, + } diff --git a/requirements.txt b/requirements.txt index dc2ae9d..de53279 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,7 @@ coverage~=5.5 dependency-check~=0.6.0 prometheus-client~=0.13.1 prometheus_flask_exporter~=0.19.0 -lorem-text==2.1 \ No newline at end of file +lorem-text==2.1 + +# pdf2array +PyMuPDF==1.19.6 \ No newline at end of file diff --git a/scripts/annotate_figures.py b/scripts/annotate_figures.py deleted file mode 100644 index cdc72f4..0000000 --- a/scripts/annotate_figures.py +++ /dev/null @@ -1,38 +0,0 @@ -import argparse -from itertools import starmap -from pathlib import Path - -import numpy as np -import pdf2image -from PIL import Image -from funcy import lmap - -from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline -from cv_analysis.utils.draw import draw_rectangles - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--pdf_path", "-p", required=True) - parser.add_argument("--output_folder", "-o", required=True) - return parser.parse_args() - - -def annotate_figures(images): - pipeline = make_figure_detection_pipeline() - result = map(pipeline, images) - 
annotated_images = starmap(draw_rectangles, zip(images, result)) - return annotated_images - - -def save_as_pdf(images, output_folder, file_name): - Path(output_folder).mkdir(parents=True, exist_ok=True) - images = lmap(Image.fromarray, images) - images[0].save(f"{output_folder}/{file_name}_annotated_figures.pdf", save_all=True, append_images=images) - - -if __name__ == "__main__": - args = parse_args() - pages = lmap(np.array, pdf2image.convert_from_path(args.pdf_path)) - annotated_pages = annotate_figures(images=pages) - save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem) diff --git a/scripts/annotate_pdf.py b/scripts/annotate_pdf.py new file mode 100644 index 0000000..aead8f6 --- /dev/null +++ b/scripts/annotate_pdf.py @@ -0,0 +1,52 @@ +import argparse +from itertools import starmap +from pathlib import Path + +from PIL import Image +from funcy import lmap + +from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline +from cv_analysis.layout_parsing import parse_layout +from cv_analysis.table_parsing import parse_tables +from cv_analysis.utils.draw import draw_rectangles +from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("pdf_path") + parser.add_argument("output_folder") + parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True) + return parser.parse_args() + + +def analyse_and_annotate(images, analysis_fn): + result = map(analysis_fn, images) + annotated_images = starmap(draw_rectangles, zip(images, result)) + return annotated_images + + +def save_as_pdf(images, output_folder, file_name, operation): + Path(output_folder).mkdir(parents=True, exist_ok=True) + images = lmap(Image.fromarray, images) + images[0].save(f"{output_folder}/{file_name}_annotated_{operation}.pdf", save_all=True, append_images=images) + + +def get_analysis_fn(analysis_type): + if analysis_type == 
"table": + return parse_tables + elif analysis_type == "layout": + return parse_layout + elif analysis_type == "figure": + return make_figure_detection_pipeline() + else: + raise + + +if __name__ == "__main__": + args = parse_args() + with open(args.pdf_path, "rb") as f: + pdf_bytes = f.read() + images, _ = zip(*pdf_to_array_and_metadata(pdf_bytes)) + annotated_pages = analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type)) + save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type) diff --git a/test/unit_tests/pdf2array_test.py b/test/unit_tests/pdf2array_test.py new file mode 100644 index 0000000..1820ed8 --- /dev/null +++ b/test/unit_tests/pdf2array_test.py @@ -0,0 +1,24 @@ +import fitz +import numpy as np +import pytest + +from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata + + +@pytest.fixture +def pdf(n_pages): + doc = fitz.open() + for n in range(n_pages): + page = doc.new_page() + where = fitz.Point(50, 100) + page.insert_text(where, "De gustibus non est disputandum.", fontsize=30) + return doc.write() + + +@pytest.mark.parametrize("n_pages", [1]) +def test_pdf_to_array_and_metadata(pdf): + for array, metadata in pdf_to_array_and_metadata(pdf): + assert isinstance(array, np.ndarray) + assert array.shape == (2339, 1653, 3) # Height, Width, Color channels + + assert isinstance(metadata, dict) From e7b28f5bda92e1b495d20d398afeb3a678df503a Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Thu, 21 Jul 2022 13:25:00 +0200 Subject: [PATCH 2/3] Pull request #18: Remove pil Merge in RR/cv-analysis from remove_pil to master Squashed commit of the following: commit 83c8d88f3d48404251470176c70979ee75ae068b Author: Julius Unverfehrt Date: Thu Jul 21 10:51:51 2022 +0200 remove deprecated server tests commit cebc03b5399ac257a74036b41997201f882f5b74 Author: Julius Unverfehrt Date: Thu Jul 21 10:51:08 2022 +0200 remove deprecated server tests commit ce2845b0c51f001b7b5b8b195d6bf7e034ec4e39 Author: Julius 
Unverfehrt Date: Wed Jul 20 17:05:00 2022 +0200 repair tests to work without pillow WIP commit 023fdab8322f28359a24c63e32635a3d0deccbe4 Author: Isaac Riley Date: Wed Jul 20 16:40:36 2022 +0200 fixed typo commit 33850ca83a175f74789ae6b9bebd057ed84b7fb3 Author: Isaac Riley Date: Wed Jul 20 16:38:37 2022 +0200 fixed import from refactored open_img.py commit dbc6d345f074e538948e2c4f94ebed8a5ef520bc Author: Isaac Riley Date: Wed Jul 20 16:32:42 2022 +0200 removed PIL from production code, now only in scripts --- .../figure_detection_pipeline.py | 4 +- cv_analysis/layout_parsing.py | 26 +----- cv_analysis/redaction_detection.py | 23 +---- cv_analysis/server/stream.py | 4 +- cv_analysis/table_parsing.py | 14 +-- cv_analysis/utils/deskew.py | 87 ------------------- cv_analysis/utils/display.py | 36 +++++--- cv_analysis/utils/logging.py | 4 +- cv_analysis/utils/open_pdf.py | 27 ++++++ cv_analysis/utils/post_processing.py | 18 +--- cv_analysis/utils/preprocessing.py | 46 ++++------ cv_analysis/utils/test_metrics.py | 6 +- cv_analysis/utils/visual_logging.py | 8 +- incl/pyinfra | 2 +- scripts/annotate.py | 7 +- scripts/deskew_demo.py | 50 ----------- scripts/pyinfra_mock.py | 11 +-- test/conftest.py | 1 - test/fixtures/figure_detection.py | 28 +++--- test/fixtures/server.py | 4 +- test/fixtures/table_parsing.py | 14 +-- .../figure_detection_pipeline_test.py | 2 +- test/unit_tests/figure_detection/text_test.py | 28 ++---- test/unit_tests/server/__init__.py | 0 .../server/formatted_stream_fn_test.py | 15 ---- 25 files changed, 116 insertions(+), 349 deletions(-) delete mode 100644 cv_analysis/utils/deskew.py create mode 100644 cv_analysis/utils/open_pdf.py delete mode 100644 scripts/deskew_demo.py delete mode 100644 test/unit_tests/server/__init__.py delete mode 100644 test/unit_tests/server/formatted_stream_fn_test.py diff --git a/cv_analysis/figure_detection/figure_detection_pipeline.py b/cv_analysis/figure_detection/figure_detection_pipeline.py index f0a3b35..1a374f1 100644
--- a/cv_analysis/figure_detection/figure_detection_pipeline.py +++ b/cv_analysis/figure_detection/figure_detection_pipeline.py @@ -17,9 +17,7 @@ from cv_analysis.utils.structures import Rectangle def make_figure_detection_pipeline(min_area=5000, max_width_to_height_ratio=6): def pipeline(image: np.array): max_area = image.shape[0] * image.shape[1] * 0.99 - filter_cnts = make_filter_likely_figures( - min_area, max_area, max_width_to_height_ratio - ) + filter_cnts = make_filter_likely_figures(min_area, max_area, max_width_to_height_ratio) image = remove_primary_text_regions(image) cnts = detect_large_coherent_structures(image) diff --git a/cv_analysis/layout_parsing.py b/cv_analysis/layout_parsing.py index 3ffeecf..1e6171e 100644 --- a/cv_analysis/layout_parsing.py +++ b/cv_analysis/layout_parsing.py @@ -5,10 +5,6 @@ from operator import __and__ import cv2 import numpy as np -# from pdf2image import pdf2image - -# from cv_analysis.utils.display import show_mpl -# from cv_analysis.utils.draw import draw_rectangles from cv_analysis.utils.structures import Rectangle from cv_analysis.utils.post_processing import ( remove_overlapping, @@ -23,9 +19,7 @@ def is_likely_segment(rect, min_area=100): def find_segments(image): - contours, hierarchies = cv2.findContours( - image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE - ) + contours, hierarchies = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) mask1 = map(is_likely_segment, contours) mask2 = map(has_no_parent, hierarchies[0]) @@ -81,21 +75,3 @@ def parse_layout(image: np.array): rects = remove_overlapping(rects) return list(map(Rectangle.from_xywh, rects)) - - -# def annotate_layout_in_pdf(page, return_rects=False, show=False): - -# #page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] -# #page = np.array(page) - -# rects = parse_layout(page) - -# if return_rects: -# return rects, page -# elif show: -# page = draw_rectangles(page, rects) -# 
vizlogger.debug(page, "layout10_output.png") -# show_mpl(page) -# else: -# page = draw_rectangles(page, rects) -# return page diff --git a/cv_analysis/redaction_detection.py b/cv_analysis/redaction_detection.py index 3c5bf5f..b9d40d8 100644 --- a/cv_analysis/redaction_detection.py +++ b/cv_analysis/redaction_detection.py @@ -5,16 +5,12 @@ import numpy as np import pdf2image from iteration_utilities import starfilter, first -from cv_analysis.utils.display import show_mpl -from cv_analysis.utils.draw import draw_contours from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy from cv_analysis.utils.visual_logging import vizlogger def is_likely_redaction(contour, hierarchy, min_area): - return ( - is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area) - ) + return is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area) def find_redactions(image: np.array, min_normalized_area=200000): @@ -31,9 +27,7 @@ def find_redactions(image: np.array, min_normalized_area=200000): thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1] vizlogger.debug(blurred, "redactions04_threshold.png") - contours, hierarchies = cv2.findContours( - thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE - ) + contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) try: contours = map( @@ -46,16 +40,3 @@ def find_redactions(image: np.array, min_normalized_area=200000): return list(contours) except: return [] - - -# def annotate_redactions_in_pdf(page, show=False): - -# #page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] -# #page = np.array(page) - -# redaction_contours = find_redactions(page) -# page = draw_contours(page, redaction_contours) -# vizlogger.debug(page, "redactions05_output.png") - -# if show: -# show_mpl(page) diff --git a/cv_analysis/server/stream.py b/cv_analysis/server/stream.py index ae66475..a73ae05 100644 --- 
a/cv_analysis/server/stream.py +++ b/cv_analysis/server/stream.py @@ -7,7 +7,7 @@ from pyinfra.server.utils import make_streamable_and_wrap_in_packing_logic from cv_analysis.server.format import make_formatter from cv_analysis.utils.logging import get_logger -from cv_analysis.utils.preprocessing import open_img_from_bytes +from cv_analysis.utils.open_pdf import open_pdf logger = get_logger() @@ -26,7 +26,7 @@ def make_streamable_analysis_fn(analysis_fn: Callable): def analyse(data: bytes, metadata: dict): - image = open_img_from_bytes(gzip.decompress(data)) + image = open_pdf(gzip.decompress(data))[0] dpi = metadata["image_info"]["dpi"] width, height, rotation = itemgetter("width", "height", "rotation")(metadata["page_info"]) diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 52d5292..9375a0f 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -15,9 +15,7 @@ from cv_analysis.layout_parsing import parse_layout def add_external_contours(image, image_h_w_lines_only): - contours, _ = cv2.findContours( - image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE - ) + contours, _ = cv2.findContours(image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1) @@ -82,9 +80,7 @@ def isolate_vertical_and_horizontal_components(img_bin): img_bin_extended = img_bin_h | img_bin_v th1, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY) - img_bin_final = cv2.dilate( - img_bin_extended, np.ones((1, 1), np.uint8), iterations=1 - ) + img_bin_final = cv2.dilate(img_bin_extended, np.ones((1, 1), np.uint8), iterations=1) # add contours before lines are extended by blurring img_bin_final = add_external_contours(img_bin_final, img_lines_raw) @@ -137,9 +133,7 @@ def turn_connected_components_into_rects(image): x1, y1, w, h, area = stat return area > 2000 and w > 35 and h > 25 - _, _, stats, 
_ = cv2.connectedComponentsWithStats( - ~image, connectivity=8, ltype=cv2.CV_32S - ) + _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S) stats = np.vstack(list(filter(is_large_enough, stats))) return stats[:, :-1][2:] @@ -149,7 +143,7 @@ def parse_tables(image: np.array, show=False): """Runs the full table parsing process. Args: - image (np.array): single PDF page, opened as PIL.Image object and converted to a numpy array + image (np.array): single PDF page, converted to a numpy array Returns: list: list of rectangles corresponding to table cells diff --git a/cv_analysis/utils/deskew.py b/cv_analysis/utils/deskew.py deleted file mode 100644 index 98f3de3..0000000 --- a/cv_analysis/utils/deskew.py +++ /dev/null @@ -1,87 +0,0 @@ -import numpy as np -from scipy.ndimage import rotate as rotate_ -import cv2 - -from cv_analysis.config import CONFIG - - -def rotate_straight(im: np.array, skew_angle: int) -> np.array: - h, w = im.shape[:2] - center = (w // 2, h // 2) - M = cv2.getRotationMatrix2D(center, skew_angle, 1.0) - rotated = cv2.warpAffine( - im, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE - ) - return rotated - - -def find_score(arr, angle): - data = rotate_(arr, angle, reshape=False, order=0, mode=CONFIG.deskew.mode) - hist = np.sum(data, axis=1) - score = np.sum((hist[1:] - hist[:-1]) ** 2) - return score - - -def find_best_angle(page): - lim = CONFIG.deskew.max_abs_angle - delta = CONFIG.deskew.delta - angles = np.arange(-lim, lim + delta, delta) - scores = [find_score(page, angle) for angle in angles] - best_angle = angles[scores.index(max(scores))] - return best_angle - - -def preprocess(arr: np.array): - if len(arr.shape) > 2: - arr = cv2.cvtColor(arr, cv2.COLOR_BGR2GRAY) - arr = cv2.fastNlMeansDenoising(arr, h=CONFIG.deskew.filter_strength_h) - return arr - - -def rotate(page, angle): - rotated = rotate_(page, angle, reshape=False, order=0, mode="nearest") - return rotated - - -def 
deskew_histbased(page: np.array): - page = preprocess(page) - best_angle = round(find_best_angle(page), 3) - - if CONFIG.deskew.verbose: - print("Skew angle from pixel histogram: {}".format(best_angle)) - - rotated = rotate(page, best_angle) - return (rotated, best_angle) - - -def needs_deskew(page: np.array) -> bool: - """ - Makes use of 'row-wise mean difference' - the difference between neighboring - on left and right halves - """ - - def split_rowmean_diff(page): - width = page.shape[1] - cutpoint = int(width / 2) - left = page[:, :cutpoint] - right = page[:, cutpoint:] - leftmeans = np.mean(left, axis=1) - rightmeans = np.mean(right, axis=1) - return rightmeans - leftmeans - - unrotated_score = np.mean(np.abs(split_rowmean_diff(page))) - angles = [-CONFIG.deskew.test_delta, CONFIG.deskew.test_delta] - scores = [ - np.mean(np.abs(split_rowmean_diff(rotate(page, angle)))) for angle in angles - ] - print(unrotated_score, scores) - return unrotated_score > min(scores) - - -if CONFIG.deskew.function == "hist": - deskew = lambda page: deskew_histbased(page) if needs_deskew(page) else (page, 0) -elif CONFIG.deskew.function == "identity": - deskew = lambda page: (page, None) -else: - raise ValueError( - "'{CONFIG.deskew.function}' is not a valid parameter value for CONFIG.deskew.function" - ) diff --git a/cv_analysis/utils/display.py b/cv_analysis/utils/display.py index 999c9a2..f5d9285 100644 --- a/cv_analysis/utils/display.py +++ b/cv_analysis/utils/display.py @@ -1,26 +1,34 @@ +from numpy import resize import cv2 from matplotlib import pyplot as plt -def show_mpl(image): +def show_image_cv2(image, maxdim=700): + h, w, c = image.shape + maxhw = max(h, w) + if maxhw > maxdim: + ratio = maxdim / maxhw + h = int(h * ratio) + w = int(w * ratio) + img = cv2.resize(image, (h, w)) + cv2.imshow("", img) + cv2.waitKey(0) + cv2.destroyAllWindows() + + +def show_image_mpl(image): fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(image, cmap="gray") plt.show() 
-def save_mpl(image, path): - # fig, ax = plt.subplots(1, 1) - # figure = plt.gcf() - # figure.set_size_inches(16,12) - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) - ax.imshow(image, cmap="gray") - # plt.close() - plt.savefig(path) - plt.close() +def show_image(image, backend="m"): + if backend.startswith("m"): + show_image_mpl(image) + else: + show_image_cv2(image) -def show_cv2(image): - cv2.imshow("", image) - cv2.waitKey(0) +def save_image(image, path): + cv2.imwrite(path, image) diff --git a/cv_analysis/utils/logging.py b/cv_analysis/utils/logging.py index 6fc280f..51be0fb 100644 --- a/cv_analysis/utils/logging.py +++ b/cv_analysis/utils/logging.py @@ -8,9 +8,7 @@ from cv_analysis.config import CONFIG def make_logger_getter(): logger = logging.getLogger(__name__) logger.setLevel(logging.getLevelName(CONFIG.service.logging_level)) - formatter = logging.Formatter( - fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S" - ) + formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S") ch = logging.StreamHandler(sys.stdout) ch.setLevel(logging.getLevelName(CONFIG.service.logging_level)) diff --git a/cv_analysis/utils/open_pdf.py b/cv_analysis/utils/open_pdf.py new file mode 100644 index 0000000..d704ba4 --- /dev/null +++ b/cv_analysis/utils/open_pdf.py @@ -0,0 +1,27 @@ +from numpy import array, ndarray +import pdf2image +from PIL import Image + +from cv_analysis.utils.preprocessing import preprocess_page_array + + +def open_pdf(pdf, first_page=0, last_page=None): + + first_page += 1 + last_page = None if last_page is None else last_page + 1 + + if type(pdf) == str: + if pdf.lower().endswith((".png", ".jpg", ".jpeg")): + pages = [Image.open(pdf)] + elif pdf.lower().endswith(".pdf"): + pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page) + else: + raise IOError("Invalid file extension. 
Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf") + elif type(pdf) == bytes: + pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page) + elif type(pdf) in {list, ndarray}: + return pdf + + pages = [preprocess_page_array(array(p)) for p in pages] + + return pages diff --git a/cv_analysis/utils/post_processing.py b/cv_analysis/utils/post_processing.py index 46da1dc..1749f2d 100644 --- a/cv_analysis/utils/post_processing.py +++ b/cv_analysis/utils/post_processing.py @@ -18,21 +18,11 @@ def remove_overlapping(rectangles): def remove_included(rectangles): def included(a, b): - return ( - b.xmin >= a.xmin - and b.ymin >= a.ymin - and b.xmax <= a.xmax - and b.ymax <= a.ymax - ) + return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax def includes(a, b, tol=3): """does a include b?""" - return ( - b.xmin + tol >= a.xmin - and b.ymin + tol >= a.ymin - and b.xmax - tol <= a.xmax - and b.ymax - tol <= a.ymax - ) + return b.xmin + tol >= a.xmin and b.ymin + tol >= a.ymin and b.xmax - tol <= a.xmax and b.ymax - tol <= a.ymax def is_not_included(rect, rectangles): return not any(includes(r2, rect) for r2 in rectangles if not rect == r2) @@ -110,9 +100,7 @@ def __remove_isolated_sorted(rectangles): def remove_isolated(rectangles, input_sorted=False): - return (__remove_isolated_sorted if input_sorted else __remove_isolated_unsorted)( - rectangles - ) + return (__remove_isolated_sorted if input_sorted else __remove_isolated_unsorted)(rectangles) Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") diff --git a/cv_analysis/utils/preprocessing.py b/cv_analysis/utils/preprocessing.py index d51139a..c3269d4 100644 --- a/cv_analysis/utils/preprocessing.py +++ b/cv_analysis/utils/preprocessing.py @@ -1,41 +1,29 @@ -from io import BytesIO -from numpy import array, ndarray -import pdf2image -from PIL import Image +from numpy import frombuffer, ndarray import cv2 -def preprocess_pdf_image(page): +def 
preprocess_page_array(page): if len(page.shape) > 2: page = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY) page = cv2.fastNlMeansDenoising(page, h=3) return page -def open_pdf(pdf, first_page=0, last_page=None): +def page2image(page): - first_page += 1 - last_page = None if last_page is None else last_page + 1 - - if type(pdf) == str: - if pdf.lower().endswith((".png", ".jpg", ".jpeg")): - pages = [Image.open(pdf)] - else: # assume pdf as default file type for a path argument - pages = pdf2image.convert_from_path( - pdf, first_page=first_page, last_page=last_page + if type(page) == bytes: + page = frombuffer(page) + elif type(page) == ndarray: + page = page + elif type(page) == str: + if page.lower().endswith((".png", ".jpg", ".jpeg")): + page = cv2.imread(page) + else: + raise IOError( + "PDFs are not a valid input type for cv-analysis." + " Use PNGs for tests and NumPy arrays for deployment." ) - elif type(pdf) == bytes: - pages = pdf2image.convert_from_bytes( - pdf, first_page=first_page, last_page=last_page - ) - elif type(pdf) in {list, ndarray}: - return pdf + else: + raise TypeError("Incompatible datatype. 
Expected bytes, numpy.ndarray, or path to an image file.") - pages = [preprocess_pdf_image(array(p)) for p in pages] - - return pages - - -def open_img_from_bytes(bytes_obj: bytes): - page = Image.open(BytesIO(bytes_obj)) - return preprocess_pdf_image(array(page)) + return preprocess_page_array(page) diff --git a/cv_analysis/utils/test_metrics.py b/cv_analysis/utils/test_metrics.py index fd0eca6..8df3d00 100644 --- a/cv_analysis/utils/test_metrics.py +++ b/cv_analysis/utils/test_metrics.py @@ -75,11 +75,7 @@ def compute_document_score(results_dict, annotation_dict): scores = [] for i in range(len(annotation_dict["pages"])): - scores.append( - compute_page_iou( - results_dict["pages"][i]["cells"], annotation_dict["pages"][i]["cells"] - ) - ) + scores.append(compute_page_iou(results_dict["pages"][i]["cells"], annotation_dict["pages"][i]["cells"])) scores = np.array(scores) doc_score = np.average(scores, weights=page_weights) diff --git a/cv_analysis/utils/visual_logging.py b/cv_analysis/utils/visual_logging.py index 983b546..e088dbe 100644 --- a/cv_analysis/utils/visual_logging.py +++ b/cv_analysis/utils/visual_logging.py @@ -1,6 +1,6 @@ import os from cv_analysis.config import CONFIG -from cv_analysis.utils.display import save_mpl +from cv_analysis.utils.display import save_image class VisualLogger: @@ -12,7 +12,7 @@ class VisualLogger: def _save(self, img, name): output_path = os.path.join(self.output_folder, name) - save_mpl(img, output_path) + save_image(img, output_path) def info(self, img, name): if self._level_is_info(): @@ -36,6 +36,4 @@ class VisualLogger: return self.level == "ALL" -vizlogger = VisualLogger( - CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder -) +vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder) diff --git a/incl/pyinfra b/incl/pyinfra index 7e948a4..1e70d49 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 7e948a4cf05a3ef59fcc7e8719fcf910adc73864 +Subproject 
commit 1e70d49531e89613c70903be49290b94ee014f65 diff --git a/scripts/annotate.py b/scripts/annotate.py index cac9b45..e899ec8 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -8,9 +8,9 @@ python scripts/annotate.py /home/iriley/Documents/pdf/scanned/10.pdf 5 --type fi import argparse -from cv_analysis.utils.display import show_mpl +from cv_analysis.utils.display import show_image from cv_analysis.utils.draw import draw_contours, draw_rectangles -from cv_analysis.utils.preprocessing import open_pdf +from cv_analysis.utils.open_pdf import open_pdf from cv_analysis.utils.visual_logging import vizlogger @@ -28,7 +28,7 @@ def annotate_page(page_image, analysis_function, drawing_function, name="tmp.png result = analysis_function(page_image) page_image = drawing_function(page_image, result) vizlogger.debug(page_image, "redactions05_output.png") - show_mpl(page_image) + show_image(page_image) if __name__ == "__main__": @@ -46,5 +46,6 @@ if __name__ == "__main__": from cv_analysis.layout_parsing import parse_layout as analyze elif args.type == "figure": from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline + analyze = make_figure_detection_pipeline() annotate_page(page, analyze, draw, name=name, show=args.show) diff --git a/scripts/deskew_demo.py b/scripts/deskew_demo.py deleted file mode 100644 index b09a342..0000000 --- a/scripts/deskew_demo.py +++ /dev/null @@ -1,50 +0,0 @@ -# sample usage: python3 scripts/deskew_demo.py /path/to/crooked.pdf 0 -import argparse -import numpy as np -import pdf2image -from PIL import Image - -from cv_analysis.utils.deskew import deskew_histbased # , deskew_linebased -from cv_analysis.utils.display import show_mpl -from cv_analysis.utils.draw import draw_stats -from cv_analysis.table_parsing import parse_tables - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("pdf_path") - parser.add_argument("page_index", type=int) - parser.add_argument("--save_path") 
- - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - args = parse_args() - page = pdf2image.convert_from_path(args.pdf_path, first_page=args.page_index + 1, last_page=args.page_index + 1)[0] - page = np.array(page) - - show_mpl(page) - # page_ = deskew_linebased(page, verbose=True) - # show_mpl(page_) - page_corr, _ = deskew_histbased(page, verbose=True) - show_mpl(page_corr) - if args.save_path: - page_ = Image.fromarray(page).convert("RGB") - page_.save(args.save_path.replace(".pdf", "_uncorrected.pdf")) - page_corr_ = Image.fromarray(page_corr).convert("RGB") - page_corr_.save(args.save_path.replace(".pdf", "_corrected.pdf")) - # annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) - stats = parse_tables(page) - page = draw_stats(page, stats) - show_mpl(page) - stats_corr = parse_tables(page_corr) - page_corr = draw_stats(page_corr, stats_corr) - show_mpl(page_corr) - if args.save_path: - page = Image.fromarray(page).convert("RGB") - page.save(args.save_path.replace(".pdf", "_uncorrected_annotated.pdf")) - page_corr = Image.fromarray(page_corr).convert("RGB") - page_corr.save(args.save_path.replace(".pdf", "_corrected_annotated.pdf")) diff --git a/scripts/pyinfra_mock.py b/scripts/pyinfra_mock.py index 1717521..6d45b4d 100644 --- a/scripts/pyinfra_mock.py +++ b/scripts/pyinfra_mock.py @@ -1,16 +1,11 @@ import argparse -import base64 import gzip -import io -import json from operator import itemgetter from typing import List import fitz import pdf2image -from PIL import Image from funcy import lmap, compose, pluck -from funcy import lpluck from pyinfra.default_objects import get_component_factory @@ -45,13 +40,13 @@ def draw_cells_on_page(cells: List[dict], page): def annotate_results_on_pdf(results, pdf_path, result_path): - open_pdf = fitz.open(pdf_path) + opened_pdf = fitz.open(pdf_path) metadata_per_page = pluck("metadata", results) - for page, metadata in zip(open_pdf, metadata_per_page): + for page, metadata in 
zip(opened_pdf, metadata_per_page): if metadata: draw_cells_on_page(metadata["cells"], page) - open_pdf.save(result_path) + opened_pdf.save(result_path) def main(args): diff --git a/test/conftest.py b/test/conftest.py index 8ccc497..6c15491 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,6 +1,5 @@ pytest_plugins = [ "test.fixtures.table_parsing", - "test.fixtures.server", "test.fixtures.figure_detection", ] diff --git a/test/fixtures/figure_detection.py b/test/fixtures/figure_detection.py index 386e4a7..f36ae92 100644 --- a/test/fixtures/figure_detection.py +++ b/test/fixtures/figure_detection.py @@ -3,21 +3,18 @@ import textwrap import cv2 import numpy as np import pytest -from PIL import Image from lorem_text import lorem -from funcy import first + from cv_analysis.figure_detection.figure_detection_pipeline import ( make_figure_detection_pipeline, ) -from cv_analysis.utils.display import show_mpl @pytest.fixture def page_with_images(random_image, n_images, background): - page_image = Image.fromarray(background.astype("uint8")).convert("RGB") - page_image = paste_image(page_image, random_image, (200, 200)) - if n_images == 2: - page_image = paste_image(page_image, random_image, (1000, 2600)) + page_image = paste_image(background, random_image, (200, 200)) + # if n_images == 2: # TODO: Adjust image paste position, might be out of bounds + # page_image = paste_image(page_image, random_image, (1000, 2600)) return np.array(page_image) @@ -32,14 +29,10 @@ def page_with_text(background, font_scale, font_style, text_types): cursor = (image.shape[1] // 2, 70) image = paste_text(image, cursor, font_scale, font_style, y_stop=body_height) cursor = (50, body_height + 70) - image = paste_text( - image, cursor, font_scale, font_style, y_stop=body_height * 2 - ) + image = paste_text(image, cursor, font_scale, font_style, y_stop=body_height * 2) if "caption" in text_types: cursor = (image.shape[1] // 2, image.shape[0] - 100) - image = paste_text( - image, cursor, 
font_scale, font_style, y_stop=body_height * 3 - ) + image = paste_text(image, cursor, font_scale, font_style, y_stop=body_height * 3) return image @@ -67,9 +60,7 @@ def paste_text(image: np.ndarray, cursor, font_scale, font_style, y_stop): def paste_text_at_cursor(x_start, y_start, y_stop): # TODO: adjust incorrect right margin text = lorem.paragraphs(1) * 200 - (dx, dy), base = cv2.getTextSize( - text, fontFace=font_style, fontScale=font_scale, thickness=1 - ) + (dx, dy), base = cv2.getTextSize(text, fontFace=font_style, fontScale=font_scale, thickness=1) dy += base # char_width = dx // len(text) text = textwrap.fill(text=text, width=(dx // page_width)) @@ -95,6 +86,7 @@ def paste_text(image: np.ndarray, cursor, font_scale, font_style, y_stop): def paste_image(page_image, image, coords): - image = Image.fromarray(image.astype("uint8")).convert("RGBA") - page_image.paste(image, coords) + h, w = image.shape[:2] + x, y = coords + page_image[x : x + h, y : y + w] = image return page_image diff --git a/test/fixtures/server.py b/test/fixtures/server.py index 982b89a..0ecec7e 100644 --- a/test/fixtures/server.py +++ b/test/fixtures/server.py @@ -2,8 +2,8 @@ import gzip import io import numpy as np +import cv2 import pytest -from PIL import Image from funcy import first from cv_analysis.utils.structures import Rectangle @@ -12,7 +12,7 @@ from incl.pyinfra.pyinfra.server.packing import bytes_to_string @pytest.fixture def random_image_as_bytes_and_compressed(random_image): - image = Image.fromarray(random_image.astype("uint8")).convert("RGBA") + image = cv2.cvtColor(random_image.astype("uint8"), cv2.COLOR_RGB2RGBA) img_byte_arr = io.BytesIO() image.save(img_byte_arr, format="PNG") return gzip.compress(img_byte_arr.getvalue()) diff --git a/test/fixtures/table_parsing.py b/test/fixtures/table_parsing.py index 94e8cb0..ccd5207 100644 --- a/test/fixtures/table_parsing.py +++ b/test/fixtures/table_parsing.py @@ -6,7 +6,7 @@ from funcy import first from cv_analysis.locations 
import TEST_DATA_DIR from cv_analysis.utils.draw import draw_rectangles -from cv_analysis.utils.preprocessing import open_pdf +from cv_analysis.utils.open_pdf import open_pdf from test.fixtures.figure_detection import paste_text @@ -24,9 +24,7 @@ def expected_table_annotation(test_file_index): @pytest.fixture -def page_with_table( - background, table_shape, table_style, n_tables, line_thickness, line_type -): +def page_with_table(background, table_shape, table_style, n_tables, line_thickness, line_type): page = draw_table( background, (100, 100), @@ -36,9 +34,7 @@ def page_with_table( line_type=line_type, ) if n_tables == 2: - page = draw_table( - page, (200, 2000), table_shape, table_style, line_thickness, line_type - ) + page = draw_table(page, (200, 2000), table_shape, table_style, line_thickness, line_type) return page @@ -205,9 +201,7 @@ def expected_gold_page_with_table(page_with_table, n_tables): return result -def draw_table( - page, table_position, table_shape, table_style, line_thickness, line_type -): +def draw_table(page, table_position, table_shape, table_style, line_thickness, line_type): bbox_table = (*table_position, 1500, 1000) page = draw_grid_lines( page, diff --git a/test/unit_tests/figure_detection/figure_detection_pipeline_test.py b/test/unit_tests/figure_detection/figure_detection_pipeline_test.py index b51e7a4..95747ad 100644 --- a/test/unit_tests/figure_detection/figure_detection_pipeline_test.py +++ b/test/unit_tests/figure_detection/figure_detection_pipeline_test.py @@ -14,7 +14,7 @@ class TestFindPrimaryTextRegions: assert not list(results) @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) - @pytest.mark.parametrize("n_images", [1, 2]) + @pytest.mark.parametrize("n_images", [1]) def test_page_without_text_yields_figures(self, figure_detection_pipeline, page_with_images, image_size): results = figure_detection_pipeline(page_with_images) result_figures_size = map(lambda x: (x.w, x.h), results) diff --git 
a/test/unit_tests/figure_detection/text_test.py b/test/unit_tests/figure_detection/text_test.py index 6983d79..794763b 100644 --- a/test/unit_tests/figure_detection/text_test.py +++ b/test/unit_tests/figure_detection/text_test.py @@ -6,7 +6,7 @@ from cv_analysis.figure_detection.text import ( remove_primary_text_regions, apply_threshold_to_image, ) -from cv_analysis.utils.display import show_mpl +from cv_analysis.utils.display import show_image from test.utils.utils import powerset @@ -25,33 +25,19 @@ class TestFindPrimaryTextRegions: np.testing.assert_equal(result_page, apply_threshold_to_image(page_with_images)) @pytest.mark.parametrize("font_scale", [1, 1.5, 2]) - @pytest.mark.parametrize( - "font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX] - ) + @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX]) @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"])) - def test_page_with_only_text_gets_text_removed( - self, page_with_text, error_tolerance - ): + def test_page_with_only_text_gets_text_removed(self, page_with_text, error_tolerance): result_page = remove_primary_text_regions(page_with_text) - relative_error = ( - np.sum(result_page != apply_threshold_to_image(page_with_text)) - / result_page.size - ) + relative_error = np.sum(result_page != apply_threshold_to_image(page_with_text)) / result_page.size assert relative_error <= error_tolerance @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) @pytest.mark.parametrize("n_images", [1, 2]) @pytest.mark.parametrize("font_scale", [1, 1.5, 2]) - @pytest.mark.parametrize( - "font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX] - ) + @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX]) @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"])) - def test_page_with_images_and_text_keeps_images( - self, page_with_images_and_text, error_tolerance 
- ): + def test_page_with_images_and_text_keeps_images(self, page_with_images_and_text, error_tolerance): result_page = remove_primary_text_regions(page_with_images_and_text) - relative_error = ( - np.sum(result_page != apply_threshold_to_image(page_with_images_and_text)) - / result_page.size - ) + relative_error = np.sum(result_page != apply_threshold_to_image(page_with_images_and_text)) / result_page.size assert relative_error <= error_tolerance diff --git a/test/unit_tests/server/__init__.py b/test/unit_tests/server/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/test/unit_tests/server/formatted_stream_fn_test.py b/test/unit_tests/server/formatted_stream_fn_test.py deleted file mode 100644 index 2cf1e3a..0000000 --- a/test/unit_tests/server/formatted_stream_fn_test.py +++ /dev/null @@ -1,15 +0,0 @@ -import pytest -from funcy import first - -from cv_analysis.server.stream import make_streamable_analysis_fn - - -@pytest.mark.parametrize("operation", ["mock"]) -@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) -def test_make_analysis_fn( - analysis_fn_mock, random_image_metadata_package, expected_analyse_metadata -): - analyse = make_streamable_analysis_fn(analysis_fn_mock) - results = first(analyse(random_image_metadata_package)) - - assert results["metadata"] == expected_analyse_metadata From a871fa3bd327e7f39305dd0626421eaa81871845 Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 22 Jul 2022 15:11:40 +0200 Subject: [PATCH 3/3] Pull request #19: Refactor evaluate Merge in RR/cv-analysis from refactor-evaluate to master Squashed commit of the following: commit cde03a492452610322f8b7d3eb804a51afb76d81 Author: Julius Unverfehrt Date: Fri Jul 22 12:37:36 2022 +0200 add optional show analysis metadata dict commit fb8bb9e2afa7767f2560f865516295be65f97f20 Author: Julius Unverfehrt Date: Fri Jul 22 12:13:18 2022 +0200 add script to evaluate runtime per page for all cv-analysis operations for multiple PDFs 
commit 721e823e2ec38aae3fea51d01e2135fc8f228d94 Author: Julius Unverfehrt Date: Fri Jul 22 10:30:31 2022 +0200 refactor commit a453753cfa477e162e5902ce191ded61cb678337 Author: Julius Unverfehrt Date: Fri Jul 22 10:19:24 2022 +0200 add logic to transform result coordinates accordingly to page rotation, update annotation script to use this logic commit 71c09758d0fb763a2c38c6871e1d9bf51f2e7c41 Author: Julius Unverfehrt Date: Thu Jul 21 15:57:49 2022 +0200 introduce pipeline for image conversion, analysis and result formatting commit aef252a41b9658dd0c4f55aa2d9f84de933586e0 Author: Julius Unverfehrt Date: Thu Jul 21 15:57:38 2022 +0200 introduce pipeline for image conversion, analysis and result formatting --- cv_analysis/server/pipeline.py | 64 +++++++++++++ cv_analysis/server/{format.py => rotate.py} | 36 ++++--- cv_analysis/server/stream.py | 43 --------- .../utils/{pdf2array.py => pdf2image.py} | 25 +++-- scripts/annotate_pdf.py | 55 +++++------ scripts/measure_runtimes.py | 96 +++++++++++++++++++ .../{pdf2array_test.py => pdf2image_test.py} | 10 +- 7 files changed, 221 insertions(+), 108 deletions(-) create mode 100644 cv_analysis/server/pipeline.py rename cv_analysis/server/{format.py => rotate.py} (83%) delete mode 100644 cv_analysis/server/stream.py rename cv_analysis/utils/{pdf2array.py => pdf2image.py} (61%) create mode 100644 scripts/measure_runtimes.py rename test/unit_tests/{pdf2array_test.py => pdf2image_test.py} (51%) diff --git a/cv_analysis/server/pipeline.py b/cv_analysis/server/pipeline.py new file mode 100644 index 0000000..ef890f0 --- /dev/null +++ b/cv_analysis/server/pipeline.py @@ -0,0 +1,64 @@ +from functools import partial +from typing import Callable + +from funcy import lmap + +from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline +from cv_analysis.layout_parsing import parse_layout +from cv_analysis.server.rotate import rotate_rectangle +from cv_analysis.table_parsing import parse_tables +from 
cv_analysis.utils.logging import get_logger +from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs +from cv_analysis.utils.structures import Rectangle + +logger = get_logger() + + +def make_analysis_pipeline(analysis_fn: Callable, dpi=200): + """Make end-to-end pipeline to analyse a PDF with given analysis function. + The pipeline returns a Generator of dicts containing page information and the analysis results. + + Steps: + Convert PDF to Arrays and page information + Analyse pages, get list of bboxes per page (e.g. table cells) + Convert pixel values to inches + Rotate results if page is rotated + Format results to stream of dictionaries + """ + + def pipeline(pdf: bytes, index=None): + image_metadata_pairs = pdf_to_image_metadata_pairs(pdf, index=index, dpi=dpi) + results = map(image_metadata_pair_to_results, image_metadata_pairs) + results_filtered = filter(lambda x: x["bboxes"], results) + return results_filtered + + def image_metadata_pair_to_results(image_metadata_pair): + rectangles = analysis_fn(image_metadata_pair.image) + rectangles = map(partial(pixel_rect_to_inches_rect, dpi=dpi), rectangles) + if image_metadata_pair.metadata["rotation"] != 0: + rotate_rectangle_fn = partial(rotate_rectangle, metadata=image_metadata_pair.metadata) + rectangles = map(rotate_rectangle_fn, rectangles) + bboxes = lmap(lambda x: x.json_xyxy(), rectangles) + return {**image_metadata_pair.metadata, "bboxes": bboxes} + + return pipeline + + +def get_analysis_fn(analysis_type): + if analysis_type == "table": + return parse_tables + elif analysis_type == "layout": + return parse_layout + elif analysis_type == "figure": + return make_figure_detection_pipeline() + else: + raise + + +def pixel_rect_to_inches_rect(rect, dpi): + def convert_pixel_to_inch(pixel): + return pixel / dpi * 72 + + bbox = rect.x1, rect.y1, rect.x2, rect.y2 + bbox_inches = tuple(map(convert_pixel_to_inch, bbox)) + return Rectangle.from_xyxy(bbox_inches, discrete=False) diff --git 
a/cv_analysis/server/format.py b/cv_analysis/server/rotate.py similarity index 83% rename from cv_analysis/server/format.py rename to cv_analysis/server/rotate.py index 6e00991..ec9a867 100644 --- a/cv_analysis/server/format.py +++ b/cv_analysis/server/rotate.py @@ -1,35 +1,25 @@ from _operator import itemgetter -from functools import partial import numpy as np from cv_analysis.utils.structures import Rectangle -def make_formatter(dpi, page_size, rotation): +def rotate_rectangle(rectangle, metadata): + width, height, rotation = itemgetter("width", "height", "rotation")(metadata) rotation = rotation // 90 if rotation not in [0, 1, 2, 3] else rotation - def format_(key2pixel): - convert = partial(convert_pixel_to_inch, dpi=dpi) - x, y, w, h = map(convert, itemgetter("x", "y", "width", "height")(key2pixel)) - x1, y1 = x + w, y + h - matrix = np.vstack([[x, y], [x1, y1]]).T - new_matrix = rotate_and_shift(matrix, rotation, page_size) - x1, x2 = sorted(new_matrix[0, :]) - y1, y2 = sorted(new_matrix[1, :]) - return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False).json_xywh() + if rotation in [1, 3]: + width, height = height, width - return format_ + x1, y1, x2, y2 = rectangle.xyxy() + matrix = np.vstack([[x1, y1], [x2, y2]]).T + new_matrix = rotate_and_shift(matrix, rotation, (width, height)) + x1, x2 = sorted(new_matrix[0, :]) + y1, y2 = sorted(new_matrix[1, :]) -def convert_pixel_to_inch(pixel, dpi): - return pixel / dpi * 72 - - -def rotate(input_matrix, radians): - rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]]) - - return np.dot(rotation_matrix, input_matrix) + return Rectangle.from_xyxy((x1, y1, x2, y2), discrete=False) def rotate_and_shift(matrix, rotation, size, debug=False): @@ -109,3 +99,9 @@ def __show_matrices(size, radians, matrix, matrix_rotated, matrix_rotated_and_sh axes[1].quiver([0, 0], [0, 0], m3[0, :], m3[1, :], scale=5, scale_units="inches", color="blue") plt.show() + + +def 
rotate(input_matrix, radians): + rotation_matrix = np.vstack([[np.cos(radians), -np.sin(radians)], [np.sin(radians), np.cos(radians)]]) + + return np.dot(rotation_matrix, input_matrix) diff --git a/cv_analysis/server/stream.py b/cv_analysis/server/stream.py deleted file mode 100644 index a73ae05..0000000 --- a/cv_analysis/server/stream.py +++ /dev/null @@ -1,43 +0,0 @@ -import gzip -from operator import itemgetter -from typing import Callable - -from funcy import lmap -from pyinfra.server.utils import make_streamable_and_wrap_in_packing_logic - -from cv_analysis.server.format import make_formatter -from cv_analysis.utils.logging import get_logger -from cv_analysis.utils.open_pdf import open_pdf - -logger = get_logger() - - -def make_streamable_analysis_fn(analysis_fn: Callable): - """Makes an analysis function streamable for pyinfra server logic. The wrapped function then - works with data and metadata and returns a tuple or generator of tuples with data and metadata. - For more information about the server logic, see the PyInfra documentation. 
- - Args: - analysis_fn: cv-analysis function - - Returns: - wrapped function - """ - - def analyse(data: bytes, metadata: dict): - - image = open_pdf(gzip.decompress(data))[0] - - dpi = metadata["image_info"]["dpi"] - width, height, rotation = itemgetter("width", "height", "rotation")(metadata["page_info"]) - - formatter = make_formatter(dpi, (width, height), rotation) - - results = map(lambda x: x.json_xywh(), analysis_fn(image)) - results = {"cells": (lmap(formatter, results))} - - logger.debug(f"Page {metadata['page_info'].get('index', '')}: Found {len(results['cells'])} cells.") - - return b"", {**metadata, **results} - - return make_streamable_and_wrap_in_packing_logic(analyse, batched=False) diff --git a/cv_analysis/utils/pdf2array.py b/cv_analysis/utils/pdf2image.py similarity index 61% rename from cv_analysis/utils/pdf2array.py rename to cv_analysis/utils/pdf2image.py index 8ce1ea6..b5da78d 100644 --- a/cv_analysis/utils/pdf2array.py +++ b/cv_analysis/utils/pdf2image.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from functools import partial from typing import Iterator, Tuple @@ -5,20 +6,25 @@ import fitz import numpy as np -def pdf_to_array_and_metadata(pdf: bytes, index=None, dpi=200) -> Iterator[Tuple[np.ndarray, dict]]: - """Stream the pages of a PDF as Tuples of page as matrix representation and page metadata. - Note: If Index is not given or evaluates to None, the whole PDF will be processed. - """ - convert_fn = partial(page_to_array_and_metadata, dpi=dpi) +@dataclass +class ImageMetadataPair: + image: np.ndarray + metadata: dict + + +def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]: + """Streams PDF as pairs of image (matrix) and metadata. 
+ Note: If Index is not given or evaluates to None, the whole PDF will be processed.""" + convert_fn = partial(page_to_image_metadata_pair, dpi=dpi) yield from map(convert_fn, stream_pages(pdf, index)) -def page_to_array_and_metadata(page: fitz.Page, dpi): - metadata = get_page_info(page, dpi) +def page_to_image_metadata_pair(page: fitz.Page, dpi): + metadata = get_page_info(page) pixmap = page.get_pixmap(dpi=dpi) array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n) - return array, metadata + return ImageMetadataPair(array, metadata) def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: @@ -30,11 +36,10 @@ def stream_pages(pdf: bytes, index=None) -> Iterator[fitz.Page]: yield pdf_handle[i] -def get_page_info(page, dpi): +def get_page_info(page): return { "index": page.number, "rotation": page.rotation, "width": page.rect.width, # rotated page width in inches "height": page.rect.height, # rotated page height in inches - "dpi": dpi, } diff --git a/scripts/annotate_pdf.py b/scripts/annotate_pdf.py index aead8f6..44c48e7 100644 --- a/scripts/annotate_pdf.py +++ b/scripts/annotate_pdf.py @@ -1,15 +1,11 @@ import argparse -from itertools import starmap +import json +from operator import itemgetter from pathlib import Path -from PIL import Image -from funcy import lmap +import fitz -from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline -from cv_analysis.layout_parsing import parse_layout -from cv_analysis.table_parsing import parse_tables -from cv_analysis.utils.draw import draw_rectangles -from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata +from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline def parse_args(): @@ -17,36 +13,35 @@ def parse_args(): parser.add_argument("pdf_path") parser.add_argument("output_folder") parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True) + parser.add_argument("--verbose", 
action="store_true") + parser.add_argument("--silent", dest="verbose", action="store_false") + parser.set_defaults(verbose=False) return parser.parse_args() -def analyse_and_annotate(images, analysis_fn): - result = map(analysis_fn, images) - annotated_images = starmap(draw_rectangles, zip(images, result)) - return annotated_images +def analyse_annotate_save(pdf, analysis_type, output_path, verbose): + pipe = make_analysis_pipeline(get_analysis_fn(analysis_type)) + results = list(pipe(pdf)) + if verbose: + print(json.dumps(results, indent=2)) -def save_as_pdf(images, output_folder, file_name, operation): - Path(output_folder).mkdir(parents=True, exist_ok=True) - images = lmap(Image.fromarray, images) - images[0].save(f"{output_folder}/{file_name}_annotated_{operation}.pdf", save_all=True, append_images=images) - - -def get_analysis_fn(analysis_type): - if analysis_type == "table": - return parse_tables - elif analysis_type == "layout": - return parse_layout - elif analysis_type == "figure": - return make_figure_detection_pipeline() - else: - raise + with fitz.open(stream=pdf) as pdf_handle: + for result in results: + page = pdf_handle[result["index"]] + for rect in result["bboxes"]: + x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(rect) + page.draw_rect((x1, y1, x2, y2), color=(0.5, 0.7, 0.2), width=2) + pdf_handle.save(output_path) if __name__ == "__main__": args = parse_args() + with open(args.pdf_path, "rb") as f: pdf_bytes = f.read() - images, _ = zip(*pdf_to_array_and_metadata(pdf_bytes)) - annotated_pages = analyse_and_annotate(images=images, analysis_fn=get_analysis_fn(args.type)) - save_as_pdf(annotated_pages, args.output_folder, Path(args.pdf_path).stem, args.type) + + Path(args.output_folder).mkdir(parents=True, exist_ok=True) + output_path = f"{args.output_folder}/{Path(args.pdf_path).stem}_annotated_{args.type}.pdf" + + analyse_annotate_save(pdf_bytes, args.type, output_path, args.verbose) diff --git a/scripts/measure_runtimes.py 
b/scripts/measure_runtimes.py new file mode 100644 index 0000000..656dfcb --- /dev/null +++ b/scripts/measure_runtimes.py @@ -0,0 +1,96 @@ +import argparse +import time +from functools import partial +from pathlib import Path + +import fitz +import numpy as np +from funcy import lmap +from matplotlib import pyplot as plt + +from cv_analysis.server.pipeline import make_analysis_pipeline, get_analysis_fn + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("pdf_folder", help="Path to folder with PDFs to evaluate") + parser.add_argument("output_folder", help="Path to folder where the Runtime plot should be stored") + parser.add_argument("n_runs", help="Number of runs per test") + return parser.parse_args() + + +def measure(fn, n_runs): + def run(*args, **kwargs): + def _run(): + start = time.time() + results = list(fn(*args, **kwargs)) # Evaluate generators + end = time.time() + return end - start + + runtimes = [_run() for _ in range(n_runs)] + return np.mean(runtimes), np.std(runtimes) + + return run + + +def run_tests(pdf, test_cases, n_runs): + def measure_analysis_pipe(test_case): + timed_analysis_pipe = measure(make_analysis_pipeline(get_analysis_fn(test_case)), n_runs) + return timed_analysis_pipe(pdf) + + return lmap(measure_analysis_pipe, test_cases) + + +def to_ms_per_page(runtime, page_count): + ms_per_page = runtime / page_count * 1000 + return round(ms_per_page, 0) + + +def measure_pdf(pdf_path, n_runs): + with open(pdf_path, "rb") as f: + pdf = f.read() + page_count = fitz.open(stream=pdf).page_count + format_fn = partial(to_ms_per_page, page_count=page_count) + + means, std = zip(*run_tests(pdf, test_cases, n_runs=n_runs)) + means, std = lmap(format_fn, means), lmap(format_fn, std) + return means, std + + +def plot_results_and_save(results, labels, n_runs, test_pdf_paths): + fig, ax = plt.subplots() + width = 0.2 + x_labels = np.arange(len(labels)) + plt.xticks(ticks=x_labels, labels=labels, rotation=90) + 
plt.grid(linestyle="dotted") + + for idx, (result, test_pdf_path) in enumerate(zip(results, test_pdf_paths)): + x = x_labels + idx * width + means, std = result + bars = ax.bar(x, means, width, yerr=std, label=f"{test_pdf_path.stem}") + ax.bar_label(bars) + ax.set_ylabel("ms/page") + ax.set_xlabel("Cv-analysis operation") + ax.set_title(f"Cv-analysis runtime estimation {n_runs=}") + ax.legend(loc=0) + + Path(args.output_folder).mkdir(parents=True, exist_ok=True) + output_path = f"{args.output_folder}/cv_analysis_runtime_{n_runs=}.png" + plt.savefig(output_path, dpi=200, bbox_inches="tight", pad_inches=0.5) + plt.close() + + +def measure_and_save_plot(args, test_cases): + n_runs = int(args.n_runs) + measure_pdf_fn = partial(measure_pdf, n_runs=n_runs) + test_pdf_paths = list(Path(args.pdf_folder).glob("*.pdf")) + results = lmap(measure_pdf_fn, test_pdf_paths) + plot_results_and_save(results, test_cases, n_runs, test_pdf_paths) + + +if __name__ == "__main__": + + test_cases = ["table", "layout", "figure"] + + args = parse_args() + measure_and_save_plot(args, test_cases) diff --git a/test/unit_tests/pdf2array_test.py b/test/unit_tests/pdf2image_test.py similarity index 51% rename from test/unit_tests/pdf2array_test.py rename to test/unit_tests/pdf2image_test.py index 1820ed8..4a44a26 100644 --- a/test/unit_tests/pdf2array_test.py +++ b/test/unit_tests/pdf2image_test.py @@ -2,7 +2,7 @@ import fitz import numpy as np import pytest -from cv_analysis.utils.pdf2array import pdf_to_array_and_metadata +from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs @pytest.fixture @@ -17,8 +17,8 @@ def pdf(n_pages): @pytest.mark.parametrize("n_pages", [1]) def test_pdf_to_array_and_metadata(pdf): - for array, metadata in pdf_to_array_and_metadata(pdf): - assert isinstance(array, np.ndarray) - assert array.shape == (2339, 1653, 3) # Height, Width, Color channels + for image_metadata_pair in pdf_to_image_metadata_pairs(pdf): + assert isinstance(image_metadata_pair.image, 
np.ndarray) + assert image_metadata_pair.image.shape == (2339, 1653, 3) # Height, Width, Color channels - assert isinstance(metadata, dict) + assert isinstance(image_metadata_pair.metadata, dict)