diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index dce5f9f..ebae9c8 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -4,172 +4,54 @@ import cv2 import numpy as np import pdf2image from matplotlib import pyplot as plt -from timeit import timeit +import imutils def parse(image: np.array): - gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY) - img_bin = ~img_bin - line_min_width = 5 - kernel_h = np.ones((1, line_min_width), np.uint8) - kernel_v = np.ones((line_min_width, 1), np.uint8) + gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + blurred = cv2.GaussianBlur(gray_scale, (5, 5), 1) + thresh = cv2.threshold(blurred, 253, 255, cv2.THRESH_BINARY)[1] + img_bin = ~thresh + + line_min_width = 10 + kernel_h = np.ones((10, line_min_width), np.uint8) + kernel_v = np.ones((line_min_width, 10), np.uint8) img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - # find_and_close_internal_gaps(img_bin_v) + img_bin_final = img_bin_h | img_bin_v - plt.imshow(img_bin_final) - #find_and_close_internal_gaps(img_bin_final) - #find_and_close_edges(img_bin_final) - _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) - return labels, stats - -# def parse(image: np.array): -# gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) -# th1, img_bin = cv2.threshold(gray_scale, 250, 255, cv2.THRESH_BINARY) -# img_bin = ~img_bin -# -# line_min_width = 10 -# kernel_h = np.ones((20, line_min_width), np.uint8) -# #kernel_v = np.ones((line_min_width, 20), np.uint8) -# -# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) -# #img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) -# #img_bin_final = img_bin_h | img_bin_v -# contours, hierarchy = cv2.findContours(img_bin_h, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) -# cv2.drawContours(img_bin_h, contours, 1, (255,0,0) , 6) -# plt.imshow(img_bin_h) -# print([cnt for cnt in contours if len(cnt)==4]) -# #plt.imshow(img_bin_h) -# #find_and_close_internal_gaps(img_bin_final) -# #find_and_close_edges(img_bin_final) -# -# #_, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) -# #return labels, stats -# return contours,hierarchy - -# def filter_unconnected_cells(stats): -# filtered_cells = [] -# for left, middle, right in zip(stats[0:], stats[1:], list(stats[2:])+[None]): -# x, y, w, h, area = middle -# if w > 35 and h > 13 and area > 500: -# if y == left[1] or y == right[1]: -# filtered_cells.append(middle) -# return filtered_cells - -def filter_unconnected_cells(stats): - filtered_cells = [] - # print(stats) - for left, middle, right in zip(stats[0:], stats[1:], list(stats[2:]) + [np.array([None, None, None, None, None])]): - x, y, w, h, area = middle - if w > 35 and h > 13 and area > 500: - if right[1] is None: - if y == left[1] or x == left[0]: - filtered_cells.append(middle) - else: - if y == left[1] or y == right[1] or x == left[0] or x == right[0]: - filtered_cells.append(middle) - return filtered_cells - - - -def annotate_image(image, stats): - stats = filter_unconnected_cells(stats) - for stat in stats: - x, y, w, h, area = stat - cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) - for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): - anno = f"{s} = {v}" - xann = int(x + 5) - yann = int(y + h - (20 * (i + 1))) - cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) - - return image - - -# def find_and_close_edges(img_bin_final): -# contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) -# -# for cnt in contours: -# missing_external_edges = True -# left = tuple(cnt[cnt[:, :, 0].argmin()][0]) -# right = tuple(cnt[cnt[:, :, 0].argmax()][0]) -# top = tuple(cnt[cnt[:, :, 1].argmin()][0]) -# bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) -# topleft = [left[0] + 1, top[1]] -# # print(cnt, left, top, topleft) -# bottomright = [right[0] - 1, bottom[1]] -# for arr in cnt: -# if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): -# missing_external_edges = False -# break -# -# if missing_external_edges and (bottomright[0]-topleft[0])*(bottomright[1]-topleft[1]) >= 50000: -# topleft[0] -= 1 -# bottomright[0] += 1 -# cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255,255,255) , 2) -# #print("missing cell detectet rectangle drawn") -# -# return img_bin_final - -def find_and_close_edges(img_bin_final): - contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - - for cnt in contours: - missing_external_edges = True - left = tuple(cnt[cnt[:, :, 0].argmin()][0]) - right = tuple(cnt[cnt[:, :, 0].argmax()][0]) - top = tuple(cnt[cnt[:, :, 1].argmin()][0]) - bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) - topleft = [left[0], top[1]] - bottomright = [right[0], bottom[1]] - for arr in cnt: - if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): - missing_external_edges = False - break - - if missing_external_edges and (bottomright[0] - topleft[0]) * (bottomright[1] - topleft[1]) >= 50000: - cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255, 255, 255), 2) - # print("missing cell detectet rectangle drawn") - - return img_bin_final - - -def find_and_close_internal_gaps(img_bin): - contours, hierarchy = cv2.findContours(img_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) - cv2.drawContours(img_bin, contours, -1,(255,255,255),2) - plt.imshow(img_bin) - #print([cnt for cnt in contours if len(cnt) == 2]) - # - # print(contours) - # contours_list = sorted([cnt.tolist() for cnt in contours if len(cnt)>2]) - # lines_with_gaps = [] - # for left, right in zip(contours_list[0:], contours_list[1:] + [[[[None]]]]): - # print(left, left[0], left[0][0]) - # if left[1][0][1]-left[0][0][1] > 13: - # if left[0][0][0] == right[0][0][0]: - # lines_with_gaps.append(left + right) - # for lines in lines_with_gaps: - # cv2.line(img_bin, tuple(min(lines)[0]), tuple(max(lines)[0]), (255,255,255), 2) - # #plt.imshow(img_bin) + contours = cv2.findContours(img_bin_final, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours = imutils.grab_contours(contours) + for c in contours: + peri = cv2.arcLength(c, True) + approx = cv2.approxPolyDP(c, 0.04 * peri, True) + yield cv2.boundingRect(approx) def parse_tables_in_pdf(pages): return zip(map(parse, pages), count()) +def annotate_boxes(image, rects): + for rect in rects: + (x, y, w, h) = rect + cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2) + + return image + + def annotate_tables_in_pdf(pdf_path, page_index=1): - # timeit() + page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) - _, stats = parse(page) - page = annotate_image(page, stats) - # print(timeit()) + asd = parse(page) + page = annotate_boxes(page, asd) + fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) - plt.show() + plt.show() \ No newline at end of file