From 6274c204a9c12ac38e39772e4e2a47681590d224 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Fri, 4 Feb 2022 10:11:14 +0100 Subject: [PATCH] added imutils to requirements few changes in table_parsig.py bc of a pull --- requirements.txt | 1 + table_parsing/table_parsig.py | 89 +++++++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index c607322..2be77e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ opencv-python~=4.5.5.62 numpy~=1.22.1 pdf2image~=1.16.0 matplotlib~=3.5.1 +imutils~=0.5.4 \ No newline at end of file diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index ebae9c8..16db709 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -1,14 +1,13 @@ from itertools import count import cv2 +import imutils import numpy as np import pdf2image from matplotlib import pyplot as plt -import imutils def parse(image: np.array): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1) thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] @@ -30,10 +29,93 @@ def parse(image: np.array): approx = cv2.approxPolyDP(c, 0.04 * peri, True) yield cv2.boundingRect(approx) +def parse_tables(image: np.array, rectangle): + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY) + img_bin = ~img_bin + + line_min_width = 5 + kernel_h = np.ones((1, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 1), np.uint8) + + img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) + img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) + # find_and_close_internal_gaps(img_bin_v) + img_bin_final = img_bin_h | img_bin_v + plt.imshow(img_bin_final) + # find_and_close_internal_gaps(img_bin_final) + # find_and_close_edges(img_bin_final) + + _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) + return labels, stats + + +def filter_unconnected_cells(stats): + filtered_cells = [] + # print(stats) + for left, middle, right in zip(stats[0:], stats[1:], + list(stats[2:]) + [np.array([None, None, None, None, None])]): + x, y, w, h, area = middle + if w > 35 and h > 13 and area > 500: + if right[1] is None: + if y == left[1] or x == left[0]: + filtered_cells.append(middle) + else: + if y == left[1] or y == right[1] or x == left[0] or x == right[0]: + filtered_cells.append(middle) + return filtered_cells + +def find_and_close_edges(img_bin_final): + contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + for cnt in contours: + missing_external_edges = True + left = tuple(cnt[cnt[:, :, 0].argmin()][0]) + right = tuple(cnt[cnt[:, :, 0].argmax()][0]) + top = tuple(cnt[cnt[:, :, 1].argmin()][0]) + bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) + topleft = [left[0], top[1]] + bottomright = [right[0], bottom[1]] + for arr in cnt: + if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): + missing_external_edges = False + break + + if missing_external_edges and (bottomright[0] - topleft[0]) * (bottomright[1] - topleft[1]) >= 50000: + cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255, 255, 255), 2) + # print("missing cell detectet rectangle drawn") + + return img_bin_final + +def annotate_image(image, stats): + stats = filter_unconnected_cells(stats) + for stat in stats: + x, y, w, h, area = stat + cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) + for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): + anno = f"{s} = {v}" + xann = int(x + 5) + yann = int(y + h - (20 * (i + 1))) + cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) + + return image def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) +# def annotate_tables_in_pdf(pdf_path, page_index=1): +# # timeit() +# page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] +# page = np.array(page) +# +# _, stats = parse(page) +# page = annotate_image(page, stats) +# # print(timeit()) +# fig, ax = plt.subplots(1, 1) +# fig.set_size_inches(20, 20) +# ax.imshow(page) +# plt.show() + def annotate_boxes(image, rects): for rect in rects: @@ -44,7 +126,6 @@ def annotate_boxes(image, rects): def annotate_tables_in_pdf(pdf_path, page_index=1): - page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) @@ -54,4 +135,4 @@ def annotate_tables_in_pdf(pdf_path, page_index=1): fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) - plt.show() \ No newline at end of file + plt.show()