Set the kernel size for morphologyEx a bit higher so that fewer non-table structures are detected. Reduced the kernel size of the directional motion blur and increased the threshold a little so that narrow cells won't be split up.

Detected a problem with the cell filtering for certain scanned PDFs.
This commit is contained in:
llocarnini 2022-02-18 16:35:50 +01:00
parent d70781f4aa
commit 723c6606e1
2 changed files with 10 additions and 18 deletions

View File

@ -25,7 +25,7 @@ def add_external_contours(image, img):
def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
line_min_width = 47
line_min_width = 48
kernel_h = np.ones((1, line_min_width), np.uint8)
kernel_v = np.ones((line_min_width, 1), np.uint8)
@ -39,15 +39,17 @@ def isolate_vertical_and_horizontal_components(img_bin, bounding_rects):
img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
show_mpl(img_bin_h | img_bin_v)
img_bin_h = apply_motion_blur(img_bin_h, 100, 0)
img_bin_v = apply_motion_blur(img_bin_v, 100, 90)
#reduced filtersize from 100 to 80 to minimize splitting narrow cells
img_bin_h = apply_motion_blur(img_bin_h, 80, 0)
img_bin_v = apply_motion_blur(img_bin_v, 80, 90)
img_bin_final = img_bin_h | img_bin_v
show_mpl(img_bin_final)
th1, img_bin_final = cv2.threshold(img_bin_final, 110, 255, cv2.THRESH_BINARY)
#changed threshold from 110 to 120 to minimize cell splitting
th1, img_bin_final = cv2.threshold(img_bin_final, 120, 255, cv2.THRESH_BINARY)
img_bin_final = cv2.dilate(img_bin_final, np.ones((1, 1), np.uint8), iterations=1)
show_mpl(img_bin_final)
# problem if layout parser detects too big of a layout box as in VV-748542.pdf p.22
img_bin_final = disconnect_non_existing_cells(img_bin_final, bounding_rects)
show_mpl(img_bin_final)
@ -108,7 +110,7 @@ def find_table_layout_boxes(image: np.array):
table_boxes = []
for box in layout_boxes:
(x, y, w, h) = box
if w * h >= 300000:
if w * h >= 100000:
table_boxes.append(box)
return table_boxes

View File

@ -50,17 +50,10 @@ def adjacent(a, b):
v1 = a.xmin, a.ymin
v2 = a.xmax, a.ymax
#print("topleft and bottom right rec1", v1,v2)
w1 = b.xmin, b.ymin
w2 = b.xmax, b.ymax
#print("topleft and bottom right rec2", w1, w2)
# some rectangles are compared twice
# print((
# adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
# adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]),
# adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]),
# adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]),
# ))
return any(
(
adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]),
@ -86,9 +79,7 @@ def __remove_isolated_unsorted(rectangles):
def __remove_isolated_sorted(rectangles):
def is_connected(left, center, right):
return any(starmap(adjacent, [(left, center), (center, right)]))
rectangles = list(map(xywh_to_vec_rect, rectangles))
# print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles)
lefts = [None, *rectangles[:-1]]
rights = [*rectangles[1:], None]
@ -96,7 +87,6 @@ def __remove_isolated_sorted(rectangles):
mask = starmap(is_connected, zip(lefts, rectangles, rights))
rectangles = compress(rectangles, mask)
rectangles = map(vec_rect_to_xywh, rectangles)
return rectangles