From aed7f27626d7ae0a42b0542f9dcfe25bf1f2b9f8 Mon Sep 17 00:00:00 2001 From: llocarnini Date: Thu, 3 Feb 2022 17:01:57 +0100 Subject: [PATCH] new changes for table completion --- table_parsing/table_parsig.py | 76 +++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index 1efe198..dce5f9f 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -18,15 +18,38 @@ def parse(image: np.array): img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) - #print(np.nonzero(img_bin_v)) - + # find_and_close_internal_gaps(img_bin_v) img_bin_final = img_bin_h | img_bin_v - - find_and_close_edges(img_bin_final) + plt.imshow(img_bin_final) + #find_and_close_internal_gaps(img_bin_final) + #find_and_close_edges(img_bin_final) _, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) return labels, stats +# def parse(image: np.array): +# gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) +# th1, img_bin = cv2.threshold(gray_scale, 250, 255, cv2.THRESH_BINARY) +# img_bin = ~img_bin +# +# line_min_width = 10 +# kernel_h = np.ones((20, line_min_width), np.uint8) +# #kernel_v = np.ones((line_min_width, 20), np.uint8) +# +# img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h) +# #img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v) +# #img_bin_final = img_bin_h | img_bin_v +# contours, hierarchy = cv2.findContours(img_bin_h, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) +# cv2.drawContours(img_bin_h, contours, 1, (255,0,0) , 6) +# plt.imshow(img_bin_h) +# print([cnt for cnt in contours if len(cnt)==4]) +# #plt.imshow(img_bin_h) +# #find_and_close_internal_gaps(img_bin_final) +# #find_and_close_edges(img_bin_final) +# +# #_, labels, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) +# #return labels, stats +# return contours,hierarchy # def filter_unconnected_cells(stats): # filtered_cells = [] @@ -52,19 +75,7 @@ def filter_unconnected_cells(stats): return filtered_cells -# def annotate_image(image, stats): -# stats = filter_unconnected_cells(stats) -# for i,val in enumerate(stats): -# x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] -# cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) -# -# for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): -# anno = f"{s} = {v}" -# xann = int(x + 5) -# yann = int(y + h - (20 * (i + 1))) -# cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) -# -# return image + def annotate_image(image, stats): stats = filter_unconnected_cells(stats) for stat in stats: @@ -114,21 +125,36 @@ def find_and_close_edges(img_bin_final): top = tuple(cnt[cnt[:, :, 1].argmin()][0]) bottom = tuple(cnt[cnt[:, :, 1].argmax()][0]) topleft = [left[0], top[1]] - # print(cnt, left, top, topleft) bottomright = [right[0], bottom[1]] for arr in cnt: if np.array_equal(arr, np.array([bottomright])) or np.array_equal(arr, np.array([topleft])): missing_external_edges = False break - if missing_external_edges and (bottomright[0]-topleft[0])*(bottomright[1]-topleft[1]) >= 50000: - cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255,255,255) , 2) - #print("missing cell detectet rectangle drawn") + if missing_external_edges and (bottomright[0] - topleft[0]) * (bottomright[1] - topleft[1]) >= 50000: + cv2.rectangle(img_bin_final, tuple(topleft), tuple(bottomright), (255, 255, 255), 2) + # print("missing cell detectet rectangle drawn") return img_bin_final -def find_and_close_internal_gaps(img_bin_final): - contours, hierarchy = cv2.findContours(img_bin_final, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + +def find_and_close_internal_gaps(img_bin): + contours, hierarchy = cv2.findContours(img_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + cv2.drawContours(img_bin, contours, -1,(255,255,255),2) + plt.imshow(img_bin) + #print([cnt for cnt in contours if len(cnt) == 2]) + # + # print(contours) + # contours_list = sorted([cnt.tolist() for cnt in contours if len(cnt)>2]) + # lines_with_gaps = [] + # for left, right in zip(contours_list[0:], contours_list[1:] + [[[[None]]]]): + # print(left, left[0], left[0][0]) + # if left[1][0][1]-left[0][0][1] > 13: + # if left[0][0][0] == right[0][0][0]: + # lines_with_gaps.append(left + right) + # for lines in lines_with_gaps: + # cv2.line(img_bin, tuple(min(lines)[0]), tuple(max(lines)[0]), (255,255,255), 2) + # #plt.imshow(img_bin) def parse_tables_in_pdf(pages): @@ -136,13 +162,13 @@ def parse_tables_in_pdf(pages): def annotate_tables_in_pdf(pdf_path, page_index=1): - #timeit() + # timeit() page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) _, stats = parse(page) page = annotate_image(page, stats) - #print(timeit()) + # print(timeit()) fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page)