diff --git a/README.md b/README.md index 549a674..dca1bc4 100644 --- a/README.md +++ b/README.md @@ -23,18 +23,21 @@ dvc pull ### As an API The module provided functions for the individual tasks that all return some kid of collection of points, depending on -the specific task. Example for finding the outlines of previous redactions. +the specific task. + +#### Redaction Detection + +The below snippet shows how to find the outlines of previous redactions. ```python - from vidocp.redaction_detection import find_redactions import pdf2image import numpy as np + pdf_path = ... page_index = ... - page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0] page = np.array(page) @@ -52,17 +55,45 @@ Core API functionalities can be used through a CLI. The tables parsing utility detects and segments tables into individual cells. ```bash -python scripts/annotate.py data/test_pdf.pdf 2 --type redaction +python scripts/annotate.py data/test_pdf.pdf 7 --type table ``` +The below image shows a parsed table, where each table cell has been detected individually. + +![](data/table_parsing.png) + #### Redaction Detection -The redaction detection utility detects previous redactions in PDFs (black filled rectangles). +The redaction detection utility detects previous redactions in PDFs (filled black rectangles). ```bash -python scripts/annotate.py 0 --type redaction +python scripts/annotate.py data/test_pdf.pdf 2 --type redaction ``` The below image shows the detected redactions with green outlines. ![](data/redaction_detection.png) + + +#### Layout Parsing + +The layout parsing utility detects elements such as paragraphs, tables and figures. +```bash +python scripts/annotate.py data/test_pdf.pdf 7 --type layout +``` + +The below image shows the detected layout elements on a page. 
+ +![](data/layout_parsing.png) + + +#### Figure Detection + +The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility. +```bash +python scripts/annotate.py data/test_pdf.pdf 3 --type figure +``` + +The below image shows the detected figure on a page. + +![](data/figure_detection.png) diff --git a/data/figure_detection.png b/data/figure_detection.png new file mode 100644 index 0000000..7716ade Binary files /dev/null and b/data/figure_detection.png differ diff --git a/data/layout_parsing.png b/data/layout_parsing.png new file mode 100644 index 0000000..6b2a12a Binary files /dev/null and b/data/layout_parsing.png differ diff --git a/data/table_parsing.png b/data/table_parsing.png new file mode 100644 index 0000000..c3d4f9a Binary files /dev/null and b/data/table_parsing.png differ diff --git a/scripts/annotate.py b/scripts/annotate.py index 95de313..9ef1bce 100644 --- a/scripts/annotate.py +++ b/scripts/annotate.py @@ -1,15 +1,16 @@ import argparse -from vidocp.table_parsing_2 import annotate_tables_in_pdf -from vidocp.redaction_detection import annotate_boxes_in_pdf -from vidocp.layout_detection import annotate_layout_in_pdf +from vidocp.table_parsing import annotate_tables_in_pdf +from vidocp.redaction_detection import annotate_redactions_in_pdf +from vidocp.layout_parsing import annotate_layout_in_pdf +from vidocp.figure_detection import detect_figures_in_pdf def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("pdf_path") parser.add_argument("page_index", type=int) - parser.add_argument("--type", choices=["table", "redaction", "layout"], default="table") + parser.add_argument("--type", choices=["table", "redaction", "layout", "figure"]) args = parser.parse_args() @@ -21,6 +22,8 @@ if __name__ == "__main__": if args.type == "table": annotate_tables_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "redaction": - annotate_boxes_in_pdf(args.pdf_path, 
page_index=args.page_index) + annotate_redactions_in_pdf(args.pdf_path, page_index=args.page_index) elif args.type == "layout": annotate_layout_in_pdf(args.pdf_path, page_index=args.page_index) + elif args.type == "figure": + detect_figures_in_pdf(args.pdf_path, page_index=args.page_index) diff --git a/vidocp/figure_detection.py b/vidocp/figure_detection.py new file mode 100644 index 0000000..27a8eb2 --- /dev/null +++ b/vidocp/figure_detection.py @@ -0,0 +1,39 @@ +import cv2 +import numpy as np +from pdf2image import pdf2image + +from vidocp.utils.detection import detect_large_coherent_structures +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_rectangles +from vidocp.utils.post_processing import remove_included +from vidocp.utils.filters import is_large_enough, has_acceptable_format +from vidocp.utils.text import remove_primary_text_regions + + +def is_likely_figure(cont, min_area=5000, max_width_to_hight_ratio=6): + return is_large_enough(cont, min_area) and has_acceptable_format(cont, max_width_to_hight_ratio) + + +def detect_figures(image: np.array): + + image = image.copy() + + image = remove_primary_text_regions(image) + cnts = detect_large_coherent_structures(image) + + cnts = filter(is_likely_figure, cnts) + rects = map(cv2.boundingRect, cnts) + rects = remove_included(rects) + + return rects + + +def detect_figures_in_pdf(pdf_path, page_index=1): + + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + + redaction_contours = detect_figures(page) + page = draw_rectangles(page, redaction_contours) + + show_mpl(page) diff --git a/vidocp/layout_detection.py b/vidocp/layout_detection.py index 40e9e58..d559df0 100644 --- a/vidocp/layout_detection.py +++ b/vidocp/layout_detection.py @@ -1,10 +1,8 @@ -from itertools import count - import cv2 +import imutils import numpy as np import pdf2image from matplotlib import pyplot as plt -import imutils def 
find_layout_boxes(image: np.array): diff --git a/vidocp/layout_parsing.py b/vidocp/layout_parsing.py new file mode 100644 index 0000000..b5f1c51 --- /dev/null +++ b/vidocp/layout_parsing.py @@ -0,0 +1,71 @@ +from itertools import compress +from itertools import starmap +from operator import __and__ + +import cv2 +import numpy as np +from pdf2image import pdf2image + +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_rectangles +from vidocp.utils.post_processing import remove_overlapping, remove_included, has_no_parent + + +def is_likely_segment(rect, min_area=100): + return cv2.contourArea(rect, False) > min_area + + +def find_segments(image): + contours, hierarchies = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + mask1 = map(is_likely_segment, contours) + mask2 = map(has_no_parent, hierarchies[0]) + mask = starmap(__and__, zip(mask1, mask2)) + contours = compress(contours, mask) + + rectangles = (cv2.boundingRect(c) for c in contours) + + return rectangles + + +def parse_layout(image: np.array): + + image = image.copy() + + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + blur = cv2.GaussianBlur(gray, (7, 7), 0) + thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + dilate = cv2.dilate(thresh, kernel, iterations=4) + + rects = list(find_segments(dilate)) + + # -> Run meta detection on the previous detections TODO: refactor + for rect in rects: + x, y, w, h = rect + cv2.rectangle(image, (x, y), (x + w, y + h), (0, 0, 0), -1) + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), 7) + + _, image = cv2.threshold(image, 254, 255, cv2.THRESH_BINARY) + image = ~image + + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + rects = find_segments(image) + # <- End of meta detection + + rects = remove_included(rects) + rects = remove_overlapping(rects) + + return rects + + +def annotate_layout_in_pdf(pdf_path, 
page_index=1): + + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + + rects = parse_layout(page) + page = draw_rectangles(page, rects) + + show_mpl(page) diff --git a/vidocp/redaction_detection.py b/vidocp/redaction_detection.py index b071c93..3362dc6 100644 --- a/vidocp/redaction_detection.py +++ b/vidocp/redaction_detection.py @@ -4,22 +4,10 @@ import cv2 import numpy as np import pdf2image from iteration_utilities import starfilter, first -from matplotlib import pyplot as plt - -def is_filled(hierarchy): - # See https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv - return hierarchy[3] <= 0 and hierarchy[2] == -1 - - -def is_boxy(contour): - epsilon = 0.01 * cv2.arcLength(contour, True) - approx = cv2.approxPolyDP(contour, epsilon, True) - return len(approx) <= 10 - - -def is_large_enough(contour, min_area): - return cv2.contourArea(contour, False) > min_area +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_contours +from vidocp.utils.filters import is_large_enough, is_filled, is_boxy def is_likely_redaction(contour, hierarchy, min_area): @@ -34,7 +22,7 @@ def find_redactions(image: np.array, min_normalized_area=200000): blurred = cv2.GaussianBlur(gray, (5, 5), 1) thresh = cv2.threshold(blurred, 252, 255, cv2.THRESH_BINARY)[1] - contours, hierarchies = cv2.findContours(thresh.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) + contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE) contours = map( first, starfilter(partial(is_likely_redaction, min_area=min_normalized_area), zip(contours, hierarchies[0])) @@ -42,22 +30,12 @@ def find_redactions(image: np.array, min_normalized_area=200000): return contours -def annotate_poly(image, contours): - for cont in contours: - cv2.drawContours(image, cont, -1, (0, 255, 0), 4) - - return image - - -def 
annotate_boxes_in_pdf(pdf_path, page_index=1): +def annotate_redactions_in_pdf(pdf_path, page_index=1): page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) redaction_contours = find_redactions(page) - page = annotate_poly(page, redaction_contours) + page = draw_contours(page, redaction_contours) - fig, ax = plt.subplots(1, 1) - fig.set_size_inches(20, 20) - ax.imshow(page) - plt.show() + show_mpl(page) diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py new file mode 100644 index 0000000..c991d43 --- /dev/null +++ b/vidocp/table_parsing.py @@ -0,0 +1,56 @@ +import cv2 +import numpy as np +from pdf2image import pdf2image + +from vidocp.utils.display import show_mpl +from vidocp.utils.draw import draw_stats + + +def add_external_contours(image, img): + + contours, hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + for cnt in contours: + x, y, w, h = cv2.boundingRect(cnt) + cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1) + + return image + + +def isolate_vertical_and_horizontal_components(img_bin): + + line_min_width = 30 + kernel_h = np.ones((1, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 1), np.uint8) + + img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) + img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + + img_bin_final = img_bin_h | img_bin_v + + return img_bin_final + + +def parse_table(image: np.array): + + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) + img_bin = ~img_bin + + img_bin = isolate_vertical_and_horizontal_components(img_bin) + img_bin_final = add_external_contours(img_bin, img_bin) + + _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + + return stats + + +def annotate_tables_in_pdf(pdf_path, page_index=1): + + page = 
pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] + page = np.array(page) + + stats = parse_table(page) + page = draw_stats(page, stats) + + show_mpl(page) diff --git a/vidocp/utils/__init__.py b/vidocp/utils/__init__.py new file mode 100644 index 0000000..16281fe --- /dev/null +++ b/vidocp/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/vidocp/utils/detection.py b/vidocp/utils/detection.py new file mode 100644 index 0000000..e5d8266 --- /dev/null +++ b/vidocp/utils/detection.py @@ -0,0 +1,23 @@ +import cv2 +import numpy as np + + +def detect_large_coherent_structures(image: np.array): + """Detects large coherent structures on an image. + + References: + https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection + """ + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY)[1] + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5)) + dilate = cv2.dilate(~thresh, dilate_kernel, iterations=4) + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20)) + close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + return cnts diff --git a/vidocp/utils/display.py b/vidocp/utils/display.py new file mode 100644 index 0000000..e0cb8ab --- /dev/null +++ b/vidocp/utils/display.py @@ -0,0 +1,16 @@ +import cv2 +from matplotlib import pyplot as plt + + +def show_mpl(image): + + fig, ax = plt.subplots(1, 1) + fig.set_size_inches(20, 20) + ax.imshow(image) + plt.show() + + +def show_cv2(image): + + cv2.imshow("", image) + cv2.waitKey(0) diff --git a/vidocp/utils/draw.py b/vidocp/utils/draw.py new file mode 100644 index 0000000..32c66f6 --- /dev/null +++ b/vidocp/utils/draw.py @@ -0,0 +1,56 @@ +import cv2 + +from vidocp.utils import copy_and_normalize_channels + + +def 
draw_contours(image, contours): + + image = copy_and_normalize_channels(image) + + for cont in contours: + cv2.drawContours(image, cont, -1, (0, 255, 0), 4) + + return image + + +def draw_rectangles(image, rectangles, color=None): + + image = copy_and_normalize_channels(image) + + if not color: + color = (0, 255, 0) + + for rect in rectangles: + x, y, w, h = rect + cv2.rectangle(image, (x, y), (x + w, y + h), color, 2) + + return image + + +def draw_stats(image, stats, annotate=False): + + image = copy_and_normalize_channels(image) + + keys = ["x", "y", "w", "h"] + + def annotate_stat(x, y, w, h): + + for i, (s, v) in enumerate(zip(keys, [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2) + + def draw_stat(stat): + + x, y, w, h, area = stat + + cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) + + if annotate: + annotate_stat(x, y, w, h) + + for stat in stats[2:]: + draw_stat(stat) + + return image diff --git a/vidocp/utils/filters.py b/vidocp/utils/filters.py new file mode 100644 index 0000000..274925c --- /dev/null +++ b/vidocp/utils/filters.py @@ -0,0 +1,25 @@ +import cv2 + + +def is_large_enough(cont, min_area): + return cv2.contourArea(cont, False) > min_area + + +def has_acceptable_format(cont, max_width_to_height_ratio): + _, _, w, h = cv2.boundingRect(cont) + return max_width_to_height_ratio >= w / h >= (1 / max_width_to_height_ratio) + + +def is_filled(hierarchy): + """Checks whether a hierarchy is filled. 
+ + References: + https://stackoverflow.com/questions/60095520/how-to-distinguish-filled-circle-contour-and-unfilled-circle-contour-in-opencv + """ + return hierarchy[3] <= 0 and hierarchy[2] == -1 + + +def is_boxy(contour): + epsilon = 0.01 * cv2.arcLength(contour, True) + approx = cv2.approxPolyDP(contour, epsilon, True) + return len(approx) <= 10 diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py new file mode 100644 index 0000000..0cd7f62 --- /dev/null +++ b/vidocp/utils/post_processing.py @@ -0,0 +1,62 @@ +from collections import namedtuple +from functools import partial + + +def remove_overlapping(rectangles): + def overlap(a, b): + return compute_intersection(a, b) > 0 + + def does_not_overlap(rect, rectangles): + return not any(overlap(rect, r2) for r2 in rectangles if not rect == r2) + + rectangles = list(map(xywh_to_vec_rect, rectangles)) + rectangles = filter(partial(does_not_overlap, rectangles=rectangles), rectangles) + rectangles = map(vec_rect_to_xywh, rectangles) + return rectangles + + +def remove_included(rectangles): + def included(a, b): + return b.xmin >= a.xmin and b.ymin >= a.ymin and b.xmax <= a.xmax and b.ymax <= a.ymax + + def is_not_included(rect, rectangles): + return not any(included(r2, rect) for r2 in rectangles if not rect == r2) + + rectangles = list(map(xywh_to_vec_rect, rectangles)) + rectangles = filter(partial(is_not_included, rectangles=rectangles), rectangles) + rectangles = map(vec_rect_to_xywh, rectangles) + return rectangles + + +Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax") + + +def make_box(x1, y1, x2, y2): + keys = "x1", "y1", "x2", "y2" + return dict(zip(keys, [x1, y1, x2, y2])) + + +def compute_intersection(a, b): + + dx = min(a.xmax, b.xmax) - max(a.xmin, b.xmin) + dy = min(a.ymax, b.ymax) - max(a.ymin, b.ymin) + + return dx * dy if (dx >= 0) and (dy >= 0) else 0 + + +def has_no_parent(hierarchy): + return hierarchy[-1] <= 0 + + +def xywh_to_vec_rect(rect): + x1, y1, w, h = 
rect + x2 = x1 + w + y2 = y1 + h + return Rectangle(x1, y1, x2, y2) + + +def vec_rect_to_xywh(rect): + x, y, x2, y2 = rect + w = x2 - x + h = y2 - y + return x, y, w, h diff --git a/vidocp/utils/text.py b/vidocp/utils/text.py new file mode 100644 index 0000000..4189005 --- /dev/null +++ b/vidocp/utils/text.py @@ -0,0 +1,57 @@ +import cv2 + + +def remove_primary_text_regions(image): + """Removes regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to remove primary text from. + + Returns: + Image with primary text removed. + """ + + image = image.copy() + + cnts = find_primary_text_regions(image) + + for cnt in cnts: + x, y, w, h = cv2.boundingRect(cnt) + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), -1) + + return image + + +def find_primary_text_regions(image): + """Finds regions of primary text, meaning no figure descriptions for example, but main text body paragraphs. + + Args: + image: Image to find primary text regions in. + + Returns: + Iterable of contours of the detected primary text regions. 
+ + References: + https://stackoverflow.com/questions/58349726/opencv-how-to-remove-text-from-background + """ + + def is_likely_primary_text_segments(cnt): + return 800 < cv2.contourArea(cnt) < 15000 + + image = image.copy() + + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + + thresh = cv2.threshold(gray, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] + + close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 3)) + close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, close_kernel, iterations=1) + + dilate_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 3)) + dilate = cv2.dilate(close, dilate_kernel, iterations=1) + + cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + cnts = filter(is_likely_primary_text_segments, cnts) + + return cnts diff --git a/vidocp/utils/utils.py b/vidocp/utils/utils.py new file mode 100644 index 0000000..18c8eb2 --- /dev/null +++ b/vidocp/utils/utils.py @@ -0,0 +1,12 @@ +import cv2 + + +def copy_and_normalize_channels(image): + + image = image.copy() + try: + image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + except cv2.error: + pass + + return image