separate function that filters for isolated boxes

2022-01-27 00:19:39 +01:00 · 2022-01-27 00:19:39 +01:00 · edf3bfe446
commit edf3bfe446
parent cf5851b652
1 changed files with 35 additions and 5 deletions
--- a/table_parsing/table_parsig.py
+++ b/table_parsing/table_parsig.py
@ -4,11 +4,11 @@ import cv2
 import numpy as np
 import pdf2image
 from matplotlib import pyplot as plt
-
+from timeit import timeit

 def parse(image: np.array):
    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    th1, img_bin = cv2.threshold(gray_scale, 150, 225, cv2.THRESH_BINARY)
+    th1, img_bin = cv2.threshold(gray_scale, 200, 255, cv2.THRESH_BINARY)
    img_bin = ~img_bin

    line_min_width = 5
@ -25,12 +25,39 @@ def parse(image: np.array):
    return labels, stats


+# def filter_unconnected_cells(stats):
+#     filtered_cells = []
+#     for i, val in enumerate(stats[2:]):
+#         x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4]
+#         if w > 35 and h > 13 and area > 500:
+#             # print(stats[i])
+#             if y == stats[i - 1][1] or y == stats[i + 1][1]:
+#                 filtered_cells.append(stats[i])
+#     return filtered_cells
+#
+#
+#
+# def annotate_image(image, stats):
+#     stats = filter_unconnected_cells(stats)
+#     for i,val in enumerate(stats):
+#         x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4]
+#         cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
+#
+#         for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
+#             anno = f"{s} = {v}"
+#             xann = int(x + 5)
+#             yann = int(y + h - (20 * (i + 1)))
+#             cv2.putText(image, anno, (xann, yann), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 255), 2)
+#
+#     return image
+
 def annotate_image(image, stats):
+    print(stats.shape)
    for i in range(2, len(stats)):
-        x,y,w,h,area = stats[i][0],stats[i][1],stats[i][2],stats[i][3],stats[i][4]
+        x, y, w, h, area = stats[i][0], stats[i][1], stats[i][2], stats[i][3], stats[i][4]
        if w > 35 and h > 13 and area > 500:
-            #print(stats[i])
-            if y == stats[i-1][1] or y == stats[i+1][1]:
+            # print(stats[i])
+            if y == stats[i - 1][1] or y == stats[i + 1][1]:
                cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)

                for i, (s, v) in enumerate(zip(["x", "y", "w", "h"], [x, y, w, h])):
@ -48,14 +75,17 @@ def parse_tables_in_pdf(pages):


 def annotate_tables_in_pdf(pdf_path, page_index=1):
+    timeit()
    page = pdf2image.convert_from_path(pdf_path, first_page=page_index + 1, last_page=page_index + 1)[0]
    page = np.array(page)

    _, stats = parse(page)
    page = annotate_image(page, stats)
+    print(timeit())
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    ax.imshow(page)
    plt.show()


+