From e8863d67aaaff138fb088c4e496a91b6354cc059 Mon Sep 17 00:00:00 2001
From: Matthias Bisping
Date: Sun, 6 Feb 2022 14:42:45 +0100
Subject: [PATCH] refactoring

---
 vidocp/figure_detection.py | 11 +++++++++--
 vidocp/utils.py            | 37 ++++++++++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py
index 5cb44ca..2bf77db 100644
--- a/vidocp/figure_detection.py
+++ b/vidocp/figure_detection.py
@@ -2,8 +2,15 @@ import cv2
 import numpy as np
 from pdf2image import pdf2image
 
-from vidocp.utils import show_mpl, draw_rectangles, remove_included, remove_primary_text_regions, \
-    __detect_large_coherent_structures, is_large_enough, has_acceptable_format
+from vidocp.utils import (
+    show_mpl,
+    draw_rectangles,
+    remove_included,
+    remove_primary_text_regions,
+    __detect_large_coherent_structures,
+    is_large_enough,
+    has_acceptable_format,
+)
 
 
 def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6):
diff --git a/vidocp/utils.py b/vidocp/utils.py
index a7a50a4..ff61135 100644
--- a/vidocp/utils.py
+++ b/vidocp/utils.py
@@ -144,8 +144,8 @@ def vec_rect_to_xywh(rect):
     return x, y, w, h
 
 
-def remove_primary_text_regions(image):
-    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
+def find_primary_text_regions(image):
+    """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
 
     Args:
         image: Image to remove primary text from.
@@ -174,7 +174,26 @@ def remove_primary_text_regions(image):
 
     cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
 
-    for cnt in filter(is_likely_primary_text_segments, cnts):
+    cnts = filter(is_likely_primary_text_segments, cnts)
+
+    return cnts
+
+
+def remove_primary_text_regions(image):
+    """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs.
+
+    Args:
+        image: Image to remove primary text from.
+
+    Returns:
+        Image with primary text removed.
+    """
+
+    image = image.copy()
+
+    cnts = find_primary_text_regions(image)
+
+    for cnt in cnts:
         x, y, w, h = cv2.boundingRect(cnt)
         cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1)
 
@@ -206,17 +225,21 @@ def is_large_enough(cont, min_area):
     return cv2.contourArea(cont, False) > min_area
 
 
-def has_acceptable_format(cont, max_width_to_hight_ratio):
+def has_acceptable_format(cont, max_width_to_height_ratio):
     _, _, w, h = cv2.boundingRect(cont)
-    return max_width_to_hight_ratio >= w / h >= (1 / max_width_to_hight_ratio)
+    return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio)
 
 
 def is_filled(hierarchy):
-    # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
+    """Checks whether a hierarchy is filled.
+
+    References:
+        https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv
+    """
     return hierarchy[3] <= 0 and hierarchy[2] == -1
 
 
 def is_boxy(contour):
     epsilon = 0.01 * cv2.arcLength(contour, True)
     approx = cv2.approxPolyDP(contour, epsilon, True)
-    return len(approx) <= 10
\ No newline at end of file
+    return len(approx) <= 10