added changes to parse scanned pdfs

2022-02-11 15:59:54 +01:00 · 2022-02-11 15:59:54 +01:00 · 885fc22f9d
commit 885fc22f9d
parent 07907d45dd
2 changed files with 34 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+/pdfs/
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@ -23,6 +23,19 @@ def add_external_contours(image, img):

    return image

+def process_lines(img_bin_h, img_bin_v):
+    """Redraw the line segments detected in the horizontal and vertical masks.
+
+    Runs cv2.HoughLinesP on each input image and draws every detected segment
+    back onto that same image in white with a thickness of 3 px. Both images
+    are modified in place and are also returned.
+
+    Args:
+        img_bin_h: image holding the horizontal components.
+        img_bin_v: image holding the vertical components.
+
+    Returns:
+        The tuple ``(img_bin_h, img_bin_v)`` after drawing.
+    """
+
+    def draw_lines(lines, img_bin):
+        # HoughLinesP yields None (not an empty array) when nothing is found.
+        if lines is None:
+            return img_bin
+        for line in lines:
+            for x1, y1, x2, y2 in line:
+                cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 3)
+        # Return only once every segment has been drawn.
+        return img_bin
+
+    # minLineLength / maxLineGap are passed by keyword on purpose: the fifth
+    # positional parameter of HoughLinesP is the optional `lines` output
+    # array, so positional values would not reach these two settings.
+    lines_h = cv2.HoughLinesP(
+        img_bin_h, 1, np.pi / 180, 500, minLineLength=700, maxLineGap=0
+    )
+    draw_lines(lines_h, img_bin_h)
+
+    lines_v = cv2.HoughLinesP(
+        img_bin_v, 0.7, np.pi / 180, 500, minLineLength=700, maxLineGap=0
+    )
+    draw_lines(lines_v, img_bin_v)
+
+    return img_bin_h, img_bin_v

 def isolate_vertical_and_horizontal_components(img_bin):

@ -33,6 +46,18 @@ def isolate_vertical_and_horizontal_components(img_bin):
    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)

+    #img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v)
+
+    # lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500)
+    # for line in lines_h:
+    #     for x1, y1, x2, y2 in line:
+    #         cv2.line(img_bin_h, (x1, y1), (x2, y2), (255, 255, 255), 3)
+    # lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 600, 0)
+    # for line in lines_v:
+    #     for x1, y1, x2, y2 in line:
+    #         cv2.line(img_bin_v, (x1, y1), (x2, y2), (255, 255, 255), 3)
+
+
    img_bin_final = img_bin_h | img_bin_v

    return img_bin_final
@ -73,15 +98,21 @@ def has_table_shape(rects):
    )


+
+
+
 def parse_table(image: np.array):
    def is_large_enough(stat):
        x1, y1, w, h, area = stat
-        #  was set too higg (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
+        #  was set too high (3000): boxes in a table can be smaller. Example: a column titled "No." This cell has an area of approximately 500 px based on 11pt letters
        #  with extra condition for the length of height and width, weirdly narrow rectangles can be filtered
        return area > 500 and w > 35 and h > 15

    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    #changed threshold value from 150 to 200 because of a shaded edgecase table
+    # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
+    # th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY)
+
+    # changed threshold value from 150 to 195 because of a shaded edgecase table
    th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
    img_bin = ~img_bin

@ -96,10 +127,6 @@ def parse_table(image: np.array):
    # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
    rects = list(remove_isolated(rects, input_sorted=True))

-    # print(f"{has_table_shape(rects) = }")
-    # if not has_table_shape(rects):
-    #     print(111111111111111111111)
-    #     return []

    return rects