diff --git a/README.md b/README.md index 549a674..f2f2fe2 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,11 @@ dvc pull ### As an API The module provided functions for the individual tasks that all return some kid of collection of points, depending on -the specific task. Example for finding the outlines of previous redactions. +the specific task. + +#### Redaction Detection + +The below snippet shows how to find the outlines of previous redactions. ```python @@ -31,10 +35,10 @@ from vidocp.redaction_detection import find_redactions import pdf2image import numpy as np + pdf_path = ... page_index = ... - page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0] page = np.array(page) @@ -52,13 +56,17 @@ Core API functionalities can be used through a CLI. The tables parsing utility detects and segments tables into individual cells. ```bash -python scripts/annotate.py data/test_pdf.pdf 2 --type redaction +python scripts/annotate.py data/test_pdf.pdf 7 --type table ``` +The below image shows a parsed table, where each table cell has been detected individually. + +![](data/table_parsing.png) + #### Redaction Detection -The redaction detection utility detects previous redactions in PDFs (black filled rectangles). +The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
```bash python scripts/annotate.py 0 --type redaction ``` diff --git a/data/table_parsing.png b/data/table_parsing.png new file mode 100644 index 0000000..c3d4f9a Binary files /dev/null and b/data/table_parsing.png differ diff --git a/scripts/annotate.py b/scripts/annotate.py index 4c6d7b8..4ada4ad 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -1,6 +1,6 @@ import argparse -from vidocp.table_parsig import annotate_tables_in_pdf +from vidocp.table_parsing import annotate_tables_in_pdf from vidocp.redaction_detection import annotate_boxes_in_pdf from vidocp.layout_detection import annotate_layout_in_pdf diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index b071c93..e5908e3 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -4,7 +4,8 @@ import cv2 import numpy as np import pdf2image from iteration_utilities import starfilter, first -from matplotlib import pyplot as plt + +from vidocp.utils import show_mpl, draw_contours def is_filled(hierarchy): @@ -42,22 +43,12 @@ def find_redactions(image: np.array, min_normalized_area=200000): return contours -def annotate_poly(image, contours): - for cont in contours: - cv2.drawContours(image, cont, -1, (0, 255, 0), 4) - - return image - - def annotate_boxes_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) redaction_contours = find_redactions(page) - page = annotate_poly(page, redaction_contours) + page = draw_contours(page, redaction_contours) - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) - ax.imshow(page) - plt.show() + show_mpl(page) diff --git a/vidocp/table_parsig.py b/vidocp/table_parsig.py deleted file mode 100644 index c6d0306..0000000 --- a/vidocp/table_parsig.py +++ /dev/null @@ -1,58 +0,0 @@ -from itertools import count - -import cv2 -import numpy as np -import pdf2image -from matplotlib import pyplot as plt - - -def parse(image: 
np.array): - - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) - img_bin = ~img_bin - - line_min_width = 4 - kernel_h = np.ones((1, line_min_width), np.uint8) - kernel_v = np.ones((line_min_width, 1), np.uint8) - - img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) - img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - - img_bin_final = img_bin_h | img_bin_v - - _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) - - return labels, stats - - -def parse_tables_in_pdf(pages): - return zip(map(parse, pages), count()) - - -def annotate_image(image, stats): - for x, y, w, h, area in stats[2:]: - if w > 10 and h > 10: - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) - - for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) - - return image - - -def annotate_tables_in_pdf(pdf_path, page_index=1): - - page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] - page = np.array(page) - - _, stats = parse(page) - page = annotate_image(page, stats) - - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) - ax.imshow(page) - plt.show() diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py new file mode 100644 index 0000000..765fb1c --- /dev/null +++ b/vidocp/table_parsing.py @@ -0,0 +1,55 @@ +import cv2 +import numpy as np +from pdf2image import pdf2image + +from vidocp.utils import show_cv2, draw_stats + + +def add_external_contours(image, img): + + contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + for cnt in contours: + x, y, w, h = cv2.boundingRect(cnt) + cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1) + + return image + + 
+def isolate_vertical_and_horizontal_components(img_bin):
+    """Keep only the horizontal and vertical strokes of img_bin that are at least 30 px long (morphological opening)."""
+    line_min_width = 30
+    kernel_h = np.ones((1, line_min_width), np.uint8)
+    kernel_v = np.ones((line_min_width, 1), np.uint8)
+
+    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
+    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
+
+    img_bin_final = img_bin_h | img_bin_v
+
+    return img_bin_final
+
+
+def parse_table(image: np.array):
+    """Return the connectedComponentsWithStats rows (x, y, w, h, area) for the regions left between detected lines."""
+    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
+    img_bin = ~img_bin
+
+    img_bin = isolate_vertical_and_horizontal_components(img_bin)
+    img_bin_final = add_external_contours(img_bin, img_bin)
+
+    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
+
+    return stats
+
+
+def annotate_tables_in_pdf(pdf_path, page_index=1):
+    """Render page page_index (zero-based) of pdf_path, box the detected table cells and show the result."""
+    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
+    page = np.array(page)
+
+    stats = parse_table(page)
+    page = draw_stats(page, stats)
+    # pdf2image renders RGB while cv2.imshow reads BGR; convert so red and blue are not swapped on screen.
+    show_cv2(cv2.cvtColor(page, cv2.COLOR_RGB2BGR))
diff --git a/vidocp/utils.py b/vidocp/utils.py
new file mode 100644
index 0000000..b540356
--- /dev/null
+++ b/vidocp/utils.py
@@ -0,0 +1,58 @@
+import cv2
+from matplotlib import pyplot as plt
+
+
+def show_mpl(image):
+    """Show image in a 20 x 20 inch matplotlib figure."""
+    fig, ax = plt.subplots(1, 1)
+    fig.set_size_inches(20, 20)
+    ax.imshow(image)
+    plt.show()
+
+
+def show_cv2(image):
+    """Show image in an unnamed OpenCV window and wait for a key press (cv2.imshow reads colour images as BGR)."""
+    cv2.imshow("", image)
+    cv2.waitKey(0)
+
+
+def draw_contours(image, contours):
+    """Return a copy of image with cv2.drawContours applied to each element of contours (green, thickness 4)."""
+    image = image.copy()
+    # NOTE(review): cv2.drawContours takes a list of contours - confirm passing each cont on its own is intended.
+    for cont in contours:
+        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
+
+    return image
+
+
+def draw_stats(image, stats, annotate=False):
+    """Return a copy of image with a green box drawn for each row of stats.
+
+    stats holds one (x, y, w, h, area) row per connected component; the first two rows are skipped.
+    With annotate=True the x, y, w and h values are also written near the bottom-left corner of each box.
+    """
+    image = image.copy()
+    keys = ["x", "y", "w", "h"]
+
+    def annotate_stat(x, y, w, h):
+        # One text line per value, stacked upwards from the bottom edge of the box in 20 px steps.
+        for i, (s, v) in enumerate(zip(keys, [x, y, w, h])):
+            anno = f"{s} = {v}"
+            xann = int(x + 5)
+            yann = int(y + h - (20 * (i + 1)))
+            cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
+
+    def draw_stat(stat):
+        # Box one component; the fifth column (area) is not needed for drawing.
+        x, y, w, h, _ = stat
+
+        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
+
+        if annotate:
+            annotate_stat(x, y, w, h)
+
+    for stat in stats[2:]:
+        draw_stat(stat)
+
+    return image