From 885fc22f9de9b4e44d7657b117b260f0a774f091 Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Fri, 11 Feb 2022 15:59:54 +0100
Subject: [PATCH] added changes to parse scanned pdfs

---
 .gitignore              |  1 +
 vidocp/table_parsing.py | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bac3af5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/pdfs/
diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index adaa210..455e9f3 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -23,6 +23,19 @@ def add_external_contours(image, img):
 
     return image
 
+def process_lines(img_bin_h, img_bin_v):
+    """Redraw Hough-detected segments (thickness 3) onto both masks in place; return both."""
+    def draw_lines(lines, img_bin):
+        # HoughLinesP yields None when nothing is found; draw every segment otherwise.
+        for line in ([] if lines is None else lines):
+            for x1, y1, x2, y2 in line:
+                cv2.line(img_bin, (x1, y1), (x2, y2), (255, 255, 255), 3)
+
+    # Keywords are required: the 5th positional argument is the `lines` output array.
+    gaps = dict(minLineLength=700, maxLineGap=0)
+    draw_lines(cv2.HoughLinesP(img_bin_h, 1, np.pi / 180, 500, **gaps), img_bin_h)
+    draw_lines(cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, **gaps), img_bin_v)
+    return img_bin_h, img_bin_v
 
 def isolate_vertical_and_horizontal_components(img_bin):
 
@@ -33,6 +46,18 @@ def isolate_vertical_and_horizontal_components(img_bin):
     img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
     img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
 
+    #img_bin_h, img_bin_v = process_lines(img_bin_h,img_bin_v)
+
+    # lines_h = cv2.HoughLinesP(img_bin_h, 1, np.pi/180, 500)
+    # for line in lines_h:
+    #     for x1, y1, x2, y2 in line:
+    #         cv2.line(img_bin_h, (x1, y1), (x2, y2), (255, 255, 255), 3)
+    # lines_v = cv2.HoughLinesP(img_bin_v, 0.7, np.pi / 180, 500, 600, 0)
+    # for line in lines_v:
+    #     for x1, y1, x2, y2 in line:
+    #         cv2.line(img_bin_v, (x1, y1), (x2, y2), (255, 255, 255), 3)
+
+
     img_bin_final = img_bin_h | img_bin_v
 
     return img_bin_final
@@ -73,15 +98,21 @@ def has_table_shape(rects):
     )
 
 
+
+
+
 def parse_table(image: np.array):
     def is_large_enough(stat):
         x1, y1, w, h, area = stat
-        #  was set too higg (3000): Boxes in a Table can be smaller. example: a column titled "No." This cell has approximatly an area of 500 px based on 11pt letters
+        #  was set too high (3000): boxes in a table can be smaller, e.g. the cell of a column titled "No." has an area of approximately 500 px with 11pt letters
         #  with extra condition for the length of height and width, weirdly narrow rectangles can be filtered
         return area > 500 and w > 35 and h > 15
 
     gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-    #changed threshold value from 150 to 200 because of a shaded edgecase table
+    # blur_gray_scale = cv2.GaussianBlur(gray_scale, (5, 5), 1, borderType=cv2.BORDER_REPLICATE)
+    # th1, img_bin = cv2.threshold(blur_gray_scale, 195, 255, cv2.THRESH_BINARY)
+
+    # changed threshold value from 150 to 195 because of a shaded edge-case table
     th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
     img_bin = ~img_bin
 
@@ -96,10 +127,6 @@ def parse_table(image: np.array):
     # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
     rects = list(remove_isolated(rects, input_sorted=True))
 
-    # print(f"{has_table_shape(rects) = }")
-    # if not has_table_shape(rects):
-    #     print(111111111111111111111)
-    #     return []
 
     return rects