From c9b2f6bf2924403c084af5a1b7d830cbcbee3395 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Sun, 6 Feb 2022 14:55:38 +0100 Subject: [PATCH] Pull request #9: Refactoring Merge in RR/vidocp from refactoring to master Squashed commit of the following: commit 36a62a13e51148d2420cb12930e84d78629db6b0 Author: Matthias Bisping Date: Sun Feb 6 14:54:53 2022 +0100 refactoring commit e652da1fa88a048f9a5211b4e8c0b96074fb5849 Author: Matthias Bisping Date: Sun Feb 6 14:53:17 2022 +0100 refactoring commit d9567da428c81f9cd7971a657281df0a90166810 Author: Matthias Bisping Date: Sun Feb 6 14:47:18 2022 +0100 refactoring commit 9d30009dceec0357db6499bfaffae8ce97718ee0 Author: Matthias Bisping Date: Sun Feb 6 14:45:53 2022 +0100 refactoring commit e8863d67aaaff138fb088c4e496a91b6354cc059 Author: Matthias Bisping Date: Sun Feb 6 14:42:45 2022 +0100 refactoring commit 89a99d3586db4fbafa743a45bdd02eaf0c1f341f Author: Matthias Bisping Date: Sun Feb 6 14:39:49 2022 +0100 refactoring commit aa66b6865b00b0490b9e7695a6bae386e6f96723 Author: Matthias Bisping Date: Sun Feb 6 14:31:21 2022 +0100 refactoring commit 98d77cb522a08821c3a13ae2cffbe7239c654762 Author: Matthias Bisping Date: Sun Feb 6 14:27:55 2022 +0100 refactoring commit fed3a7e4f1b8b7ca4e14f9e495459c26490fb50b Author: Matthias Bisping Date: Sun Feb 6 14:26:16 2022 +0100 refactoring commit 504cafbd5d4bba183d9943b36c60548aae34e402 Author: Matthias Bisping Date: Sun Feb 6 14:25:44 2022 +0100 renaming commit c9780a57e5a048529d36958ba678eddb11759cef Author: Matthias Bisping Date: Sun Feb 6 14:24:41 2022 +0100 removed obsolete import commit d555e86475e82024f8e1a5fc5b0ac70faa091ee1 Author: Matthias Bisping Date: Sun Feb 6 14:24:04 2022 +0100 refactored figure detection once --- scripts/annotate.py | 8 +- vidocp/figure_detection.py | 54 +++---------- vidocp/layout_parsing.py | 4 +- vidocp/redaction_detection.py | 21 +---- vidocp/table_parsing.py | 5 +- vidocp/utils/__init__.py | 1 + vidocp/utils/detection.py | 23 ++++++ vidocp/utils/display.py | 16 ++++ vidocp/utils/draw.py | 56 +++++++++++++ vidocp/utils/filters.py | 25 ++++++ vidocp/{utils.py => utils/post_processing.py} | 81 ------------------- vidocp/utils/text.py | 57 +++++++++++++ vidocp/utils/utils.py | 12 +++ 13 files changed, 214 insertions(+), 149 deletions(-) create mode 100644 vidocp/utils/__init__.py create mode 100644 vidocp/utils/detection.py create mode 100644 vidocp/utils/display.py create mode 100644 vidocp/utils/draw.py create mode 100644 vidocp/utils/filters.py rename vidocp/{utils.py => utils/post_processing.py} (51%) create mode 100644 vidocp/utils/text.py create mode 100644 vidocp/utils/utils.py diff --git a/scripts/annotate.py b/scripts/annotate.py index 10d40cc..9ef1bce 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -1,9 +1,9 @@ import argparse from vidocp.table_parsing import annotate_tables_in_pdf -from vidocp.redaction_detection import annotate_boxes_in_pdf +from vidocp.redaction_detection import annotate_redactions_in_pdf from vidocp.layout_parsing import annotate_layout_in_pdf -from vidocp.figure_detection import remove_text_in_pdf +from vidocp.figure_detection import detect_figures_in_pdf def parse_args(): @@ -22,8 +22,8 @@ if __name__ == "__main__": if args.type == "table": annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "redaction": - annotate_boxes_in_pdf(args.pdf_path, page_index=args.page_index) + annotate_redactions_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "layout": annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "figure": - remove_text_in_pdf(args.pdf_path, page_index=args.page_index) + detect_figures_in_pdf(args.pdf_path, page_index=args.page_index) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py index e852646..27a8eb2 100644 --- a/vidocp/figure_detection.py +++ b/vidocp/figure_detection.py @@ -2,16 +2,12 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import draw_contours, show_mpl, draw_rectangles, remove_included, remove_overlapping, show_cv2 - - -def is_large_enough(cont, min_area=10000): - return cv2.contourArea(cont, False) > min_area - - -def has_acceptable_format(cont, max_width_to_hight_ratio=6): - _, _, w, h = cv2.boundingRect(cont) - return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio) +from vidocp.utils.detection import detect_large_coherent_structures +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_rectangles +from vidocp.utils.post_processing import remove_included +from vidocp.utils.filters import is_large_enough, has_acceptable_format +from vidocp.utils.text import remove_primary_text_regions def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): @@ -22,47 +18,17 @@ def detect_figures(image: np.array): image = image.copy() - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] - - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) - close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) - - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) - dilate = cv2.dilate(close, dilate_kernel, iterations=1) - - cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) - - def filter_rects(): - for c in cnts: - area = cv2.contourArea(c) - if area > 800 and area < 15000: - yield cv2.boundingRect(c) - - for rect in filter_rects(): - x, y, w, h = rect - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) - - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - - thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] - - dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) - dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) - - close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) - close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) - - cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + image = remove_primary_text_regions(image) + cnts = detect_large_coherent_structures(image) cnts = filter(is_likely_figure, cnts) - rects = [cv2.boundingRect(c) for c in cnts] + rects = map(cv2.boundingRect, cnts) rects = remove_included(rects) return rects -def remove_text_in_pdf(pdf_path, page_index=1): +def detect_figures_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) diff --git a/vidocp/layout_parsing.py b/vidocp/layout_parsing.py index 67cd89e..b5f1c51 100644 --- a/vidocp/layout_parsing.py +++ b/vidocp/layout_parsing.py @@ -6,7 +6,9 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import draw_rectangles, show_mpl, remove_overlapping, remove_included, has_no_parent +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_rectangles +from vidocp.utils.post_processing import remove_overlapping, remove_included, has_no_parent def is_likely_segment(rect, min_area=100): diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index f1b319a..3362dc6 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -5,22 +5,9 @@ import numpy as np import pdf2image from iteration_utilities import starfilter, first -from vidocp.utils import show_mpl, draw_contours - - -def is_filled(hierarchy): - # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv - return hierarchy[3] <= 0 and hierarchy[2] == -1 - - -def is_boxy(contour): - epsilon = 0.01 * cv2.arcLength(contour, True) - approx = cv2.approxPolyDP(contour, epsilon, True) - return len(approx) <= 10 - - -def is_large_enough(contour, min_area): - return cv2.contourArea(contour, False) > min_area +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_contours +from vidocp.utils.filters import is_large_enough, is_filled, is_boxy def is_likely_redaction(contour, hierarchy, min_area): @@ -43,7 +30,7 @@ def find_redactions(image: np.array, min_normalized_area=200000): return contours -def annotate_boxes_in_pdf(pdf_path, page_index=1): +def annotate_redactions_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 765fb1c..c991d43 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -2,7 +2,8 @@ import cv2 import numpy as np from pdf2image import pdf2image -from vidocp.utils import show_cv2, draw_stats +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_stats def add_external_contours(image, img): @@ -52,4 +53,4 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): stats = parse_table(page) page = draw_stats(page, stats) - show_cv2(page) + show_mpl(page) diff --git a/vidocp/utils/__init__.py b/vidocp/utils/__init__.py new file mode 100644 index 0000000..16281fe --- /dev/null +++ b/vidocp/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/vidocp/utils/detection.py b/vidocp/utils/detection.py new file mode 100644 index 0000000..e5d8266 --- /dev/null +++ b/vidocp/utils/detection.py @@ -0,0 +1,23 @@ +import cv2 +import numpy as np + + +def detect_large_coherent_structures(image: np.array): + """Detects large coherent structures on an image. + + References: + https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection + """ + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) + dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) + close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + return cnts diff --git a/vidocp/utils/display.py b/vidocp/utils/display.py new file mode 100644 index 0000000..e0cb8ab --- /dev/null +++ b/vidocp/utils/display.py @@ -0,0 +1,16 @@ +import cv2 +from matplotlib import pyplot as plt + + +def show_mpl(image): + + fig, ax = plt.subplots(1, 1) + fig.set_size_inches(20, 20) + ax.imshow(image) + plt.show() + + +def show_cv2(image): + + cv2.imshow("", image) + cv2.waitKey(0) diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py new file mode 100644 index 0000000..32c66f6 --- /dev/null +++ b/vidocp/utils/draw.py @@ -0,0 +1,56 @@ +import cv2 + +from vidocp.utils import copy_and_normalize_channels + + +def draw_contours(image, contours): + + image = copy_and_normalize_channels(image) + + for cont in contours: + cv2.drawContours(image, cont, -1, (0, 255, 0), 4) + + return image + + +def draw_rectangles(image, rectangles, color=None): + + image = copy_and_normalize_channels(image) + + if not color: + color = (0, 255, 0) + + for rect in rectangles: + x, y, w, h = rect + cv2.rectangle(image, (x, y), (x + w, y + h), color, 2) + + return image + + +def draw_stats(image, stats, annotate=False): + + image = copy_and_normalize_channels(image) + + keys = ["x", "y", "w", "h"] + + def annotate_stat(x, y, w, h): + + for i, (s, v) in enumerate(zip(keys, [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) + + def draw_stat(stat): + + x, y, w, h, area = stat + + cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) + + if annotate: + annotate_stat(x, y, w, h) + + for stat in stats[2:]: + draw_stat(stat) + + return image diff --git a/vidocp/utils/filters.py b/vidocp/utils/filters.py new file mode 100644 index 0000000..274925c --- /dev/null +++ b/vidocp/utils/filters.py @@ -0,0 +1,25 @@ +import cv2 + + +def is_large_enough(cont, min_area): + return cv2.contourArea(cont, False) > min_area + + +def has_acceptable_format(cont, max_width_to_height_ratio): + _, _, w, h = cv2.boundingRect(cont) + return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio) + + +def is_filled(hierarchy): + """Checks whether a hierarchy is filled. + + References: + https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + """ + return hierarchy[3] <= 0 and hierarchy[2] == -1 + + +def is_boxy(contour): + epsilon = 0.01 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + return len(approx) <= 10 diff --git a/vidocp/utils.py b/vidocp/utils/post_processing.py similarity index 51% rename from vidocp/utils.py rename to vidocp/utils/post_processing.py index ee528b4..0cd7f62 100644 --- a/vidocp/utils.py +++ b/vidocp/utils/post_processing.py @@ -1,87 +1,6 @@ from collections import namedtuple from functools import partial -import cv2 -from matplotlib import pyplot as plt - - -def show_mpl(image): - - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) - ax.imshow(image) - plt.show() - - -def show_cv2(image): - - cv2.imshow("", image) - cv2.waitKey(0) - - -def copy_and_normalize_channels(image): - - image = image.copy() - try: - image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) - except cv2.error: - pass - - return image - - -def draw_contours(image, contours): - - image = copy_and_normalize_channels(image) - - for cont in contours: - cv2.drawContours(image, cont, -1, (0, 255, 0), 4) - - return image - - -def draw_rectangles(image, rectangles, color=None): - - image = copy_and_normalize_channels(image) - - if not color: - color = (0, 255, 0) - - for rect in rectangles: - x, y, w, h = rect - cv2.rectangle(image, (x, y), (x + w, y + h), color, 2) - - return image - - -def draw_stats(image, stats, annotate=False): - - image = copy_and_normalize_channels(image) - - keys = ["x", "y", "w", "h"] - - def annotate_stat(x, y, w, h): - - for i, (s, v) in enumerate(zip(keys, [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) - - def draw_stat(stat): - - x, y, w, h, area = stat - - cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) - - if annotate: - annotate_stat(x, y, w, h) - - for stat in stats[2:]: - draw_stat(stat) - - return image - def remove_overlapping(rectangles): def overlap(a, b): diff --git a/vidocp/utils/text.py b/vidocp/utils/text.py new file mode 100644 index 0000000..4189005 --- /dev/null +++ b/vidocp/utils/text.py @@ -0,0 +1,57 @@ +import cv2 + + +def remove_primary_text_regions(image): + """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + """ + + image = image.copy() + + cnts = find_primary_text_regions(image) + + for cnt in cnts: + x, y, w, h = cv2.boundingRect(cnt) + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) + + return image + + +def find_primary_text_regions(image): + """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + + References: + https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background + """ + + def is_likely_primary_text_segments(cnt): + return 800 < cv2.contourArea(cnt) < 15000 + + image = image.copy() + + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) + close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) + dilate = cv2.dilate(close, dilate_kernel, iterations=1) + + cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + cnts = filter(is_likely_primary_text_segments, cnts) + + return cnts diff --git a/vidocp/utils/utils.py b/vidocp/utils/utils.py new file mode 100644 index 0000000..18c8eb2 --- /dev/null +++ b/vidocp/utils/utils.py @@ -0,0 +1,12 @@ +import cv2 + + +def copy_and_normalize_channels(image): + + image = image.copy() + try: + image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + except cv2.error: + pass + + return image