Set the kernel size for the morphologyEx step a bit higher so that fewer non-table structures are detected. Reduced the kernel size of the directional motion blur and raised the threshold slightly so that narrow cells won't be split up.

Detected a problem with the cell filtering for certain scanned PDFs.
2022-02-18 16:35:50 +01:00 · 2022-02-18 16:35:50 +01:00 · 723c6606e1
commit 723c6606e1
parent d70781f4aa
2 changed files with 10 additions and 18 deletions
--- a/vidocp/table_parsing.py
+++ b/vidocp/table_parsing.py
@ -25,7 +25,7 @@ def add_external_contours(image, img):


 def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
-    line_min_width = 47
+    line_min_width = 48
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

@ -39,15 +39,17 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
    img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
    show_mpl(img_bin_h | img_bin_v)

-    img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
-    img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
+    #reduced filtersize from 100 to 80 to minimize splitting narrow cells
+    img_bin_h = apply_motion_blur(img_bin_h, 80, 0)
+    img_bin_v = apply_motion_blur(img_bin_v, 80, 90)

    img_bin_final = img_bin_h | img_bin_v
    show_mpl(img_bin_final)
-
-    th1, img_bin_final = cv2.threshold(img_bin_final, 110, 255, cv2.THRESH_BINARY)
+    #changed threshold from 110 to 120 to minimize cell splitting
+    th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY)
    img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
    show_mpl(img_bin_final)
+    # problem if layout parser detects too big of a layout box as in VV-748542.pdf p.22
    img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects)
    show_mpl(img_bin_final)

@ -108,7 +110,7 @@ def find_table_layout_boxes(image: np.array):
    table_boxes = []
    for box in layout_boxes:
        (x, y, w, h) = box
-        if w * h >= 300000:
+        if w * h >= 100000:
            table_boxes.append(box)
    return table_boxes

--- a/vidocp/utils/post_processing.py
+++ b/vidocp/utils/post_processing.py
@ -50,17 +50,10 @@ def adjacent(a, b):

    v1 = a.xmin, a.ymin
    v2 = a.xmax, a.ymax
-    #print("topleft and bottom right rec1", v1,v2)
+
    w1 = b.xmin, b.ymin
    w2 = b.xmax, b.ymax
-    #print("topleft and bottom right rec2", w1, w2)
-    # some rectangles are compared twice
-    # print((
-    #         adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
-    #         adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
-    #         adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
-    #         adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
-    #     ))
+
    return any(
        (
            adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
@ -86,9 +79,7 @@ def __remove_isolated_unsorted(rectangles):
 def __remove_isolated_sorted(rectangles):
    def is_connected(left, center, right):
        return any(starmap(adjacent, [(left, center), (center, right)]))
-
    rectangles = list(map(xywh_to_vec_rect, rectangles))
-    # print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles)

    lefts = [None, *rectangles[:-1]]
    rights = [*rectangles[1:], None]
@ -96,7 +87,6 @@ def __remove_isolated_sorted(rectangles):
    mask = starmap(is_connected, zip(lefts, rectangles, rights))
    rectangles = compress(rectangles, mask)
    rectangles = map(vec_rect_to_xywh, rectangles)
-
    return rectangles