From b00b914caf48e9c471f580b033278ba4f6c76150 Mon Sep 17 00:00:00 2001 From: Matthias Bisping Date: Fri, 4 Feb 2022 12:49:51 +0100 Subject: [PATCH] improved box detection --- box_detection/box_detection.py | 54 ++++++++++++++++++++++++---------- requirements.txt | 1 + scripts/annotate.py | 1 - 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/box_detection/box_detection.py b/box_detection/box_detection.py index 9bbe9dd..28adb2f 100644 --- a/box_detection/box_detection.py +++ b/box_detection/box_detection.py @@ -1,29 +1,51 @@ -from itertools import count +from functools import partial import cv2 -import imutils import numpy as np import pdf2image +from iteration_utilities import starfilter, first from matplotlib import pyplot as plt -def parse(image: np.array): +def is_filled(hierarchy): + # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + return hierarchy[3] <= 0 and hierarchy[2] == -1 + + +def is_boxy(contour): + epsilon = 0.01 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + return len(approx) <= 10 + + +def is_large_enough(contour, min_area): + return cv2.contourArea(contour, False) > min_area + + +def is_likely_redaction(contour, hierarchy, min_area): + return is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area) + + +def find_redactions(image: np.array, min_nomralized_area=200000): + + min_nomralized_area /= 200 # Assumes 200 DPI PDF -> image conversion resolution + gray = ~cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blurred = cv2.GaussianBlur(gray, (5, 5), 1) - thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] - cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - cnts = imutils.grab_contours(cnts) + plt.imshow(blurred) + thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1] - for c in cnts: - peri = cv2.arcLength(c, True) - approx = cv2.approxPolyDP(c, 0.04 * peri, True) - yield cv2.boundingRect(approx) + contours, hierarchies = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) + + contours = map( + first, starfilter(partial(is_likely_redaction, min_area=min_nomralized_area), zip(contours, hierarchies[0])) + ) + return contours -def annotate_boxes(image, rects): - for rect in rects: - (x, y, w, h) = rect - cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) +def annotate_poly(image, conts): + for cont in conts: + cv2.drawContours(image, cont, -1, (0, 255, 0), 2) return image @@ -33,8 +55,8 @@ def annotate_boxes_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) - asd = parse(page) - page = annotate_boxes(page, asd) + asd = find_redactions(page) + page = annotate_poly(page, asd) fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) diff --git a/requirements.txt b/requirements.txt index 2ca75a6..a3f596f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ numpy~=1.22.1 pdf2image~=1.16.0 matplotlib~=3.5.1 imutils==0.5.4 +iteration-utilities==0.11.0 diff --git a/scripts/annotate.py b/scripts/annotate.py index c369db0..74fa570 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -21,4 +21,3 @@ if __name__ == "__main__": annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) elif args.object == "box": annotate_boxes_in_pdf(args.pdf_path, page_index=args.page_index) -