From 723c6606e1a3e2d192e9bfbb1ebc1e56c2cf8fe6 Mon Sep 17 00:00:00 2001
From: llocarnini <lillian.locarnini@iqser.com>
Date: Fri, 18 Feb 2022 16:35:50 +0100
Subject: [PATCH] Kernel size for morphology ex set a bit higher, so fewer
 non-table structures are detected. Reduced the kernel size of the directional
 motion blur and increased the threshold a little bit so narrow cells won't be
 split up.

Detected a problem with the cell filtering for certain scanned PDFs.
---
 vidocp/table_parsing.py         | 14 ++++++++------
 vidocp/utils/post_processing.py | 14 ++------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py
index dd65cd2..c43a457 100644
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@@ -25,7 +25,7 @@ def add_external_contours(image, img):
 
 
 def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
-    line_min_width = 47
+    line_min_width = 48
     kernel_h = np.ones((1, line_min_width), np.uint8)
     kernel_v = np.ones((line_min_width, 1), np.uint8)
 
@@ -39,15 +39,17 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
     img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
     show_mpl(img_bin_h | img_bin_v)
 
-    img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
-    img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
+    # reduced motion-blur filter size from 100 to 80 to minimize splitting narrow cells
+    img_bin_h = apply_motion_blur(img_bin_h, 80, 0)
+    img_bin_v = apply_motion_blur(img_bin_v, 80, 90)
 
     img_bin_final = img_bin_h | img_bin_v
     show_mpl(img_bin_final)
-
-    th1, img_bin_final = cv2.threshold(img_bin_final, 110, 255, cv2.THRESH_BINARY)
+    # raised threshold from 110 to 120 to minimize cell splitting
+    th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY)
     img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
     show_mpl(img_bin_final)
+    # NOTE: problematic if the layout parser detects too large a layout box, e.g. VV-748542.pdf p.22
     img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects)
     show_mpl(img_bin_final)
 
@@ -108,7 +110,7 @@ def find_table_layout_boxes(image: np.array):
     table_boxes = []
     for box in layout_boxes:
         (x, y, w, h) = box
-        if w * h >= 300000:
+        if w * h >= 100000:
             table_boxes.append(box)
     return table_boxes
 
diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py
index 02018c3..79626d2 100644
--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@@ -50,17 +50,10 @@ def adjacent(a, b):
 
     v1 = a.xmin, a.ymin
     v2 = a.xmax, a.ymax
-    #print("topleft and bottom right rec1", v1,v2)
+
     w1 = b.xmin, b.ymin
     w2 = b.xmax, b.ymax
-    #print("topleft and bottom right rec2", w1, w2)
-    # some rectangles are compared twice
-    # print((
-    #         adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
-    #         adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
-    #         adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
-    #         adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
-    #     ))
+
     return any(
         (
             adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
@@ -86,9 +79,7 @@ def __remove_isolated_unsorted(rectangles):
 def __remove_isolated_sorted(rectangles):
     def is_connected(left, center, right):
         return any(starmap(adjacent, [(left, center), (center, right)]))
-
     rectangles = list(map(xywh_to_vec_rect, rectangles))
-    # print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles)
 
     lefts = [None, *rectangles[:-1]]
     rights = [*rectangles[1:], None]
@@ -96,7 +87,6 @@ def __remove_isolated_sorted(rectangles):
     mask = starmap(is_connected, zip(lefts, rectangles, rights))
     rectangles = compress(rectangles, mask)
     rectangles = map(vec_rect_to_xywh, rectangles)
-
     return rectangles