diff --git a/table_parsing/table_parsig.py b/table_parsing/table_parsig.py index d00eafd..7b486cf 100644 --- a/table_parsing/table_parsig.py +++ b/table_parsing/table_parsig.py @@ -4,11 +4,11 @@ import cv2 import numpy as np import pdf2image from matplotlib import pyplot as plt - +from timeit import timeit def parse(image: np.array): gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY) + th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY) img_bin = ~img_bin line_min_width = 5 @@ -25,12 +25,39 @@ def parse(image: np.array): return labels, stats +# def filter_unconnected_cells(stats): +# filtered_cells = [] +# for i, val in enumerate(stats[2:]): +# x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] +# if w > 35 and h > 13 and area > 500: +# # print(stats[i]) +# if y == stats[i - 1][1] or y == stats[i + 1][1]: +# filtered_cells.append(stats[i]) +# return filtered_cells +# +# +# +# def annotate_image(image, stats): +# stats = filter_unconnected_cells(stats) +# for i,val in enumerate(stats): +# x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] +# cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) +# +# for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): +# anno = f"{s} = {v}" +# xann = int(x + 5) +# yann = int(y + h - (20 * (i + 1))) +# cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2) +# +# return image + def annotate_image(image, stats): + print(stats.shape) for i in range(2, len(stats)): - x,y,w,h,area = stats[i][0],stats[i][1],stats[i][2],stats[i][3],stats[i][4] + x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4] if w > 35 and h > 13 and area > 500: - #print(stats[i]) - if y == stats[i-1][1] or y == stats[i+1][1]: + # print(stats[i]) + if y == stats[i - 1][1] or y == stats[i + 1][1]: cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2) for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])): @@ -48,14 +75,17 @@ def parse_tables_in_pdf(pages): def annotate_tables_in_pdf(pdf_path, page_index=1): + timeit() page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0] page = np.array(page) _, stats = parse(page) page = annotate_image(page, stats) + print(timeit()) fig, ax = plt.subplots(1, 1) fig.set_size_inches(20, 20) ax.imshow(page) plt.show() +