diff --git a/vidocp/table_parsing.py b/vidocp/table_parsing.py index 2ead96c..0131c3c 100644 --- a/vidocp/table_parsing.py +++ b/vidocp/table_parsing.py @@ -76,10 +76,13 @@ def has_table_shape(rects): def parse_table(image: np.array): def is_large_enough(stat): x1, y1, w, h, area = stat - return area > 3000 + # was set too high (3000): boxes in a table can definitely be smaller. Example: a column titled "No." Such a cell has an area of approximately 500 px, based on 11pt letters + # the extra conditions on width and height filter out weirdly narrow rectangles + return area > 500 and w > 35 and h > 15 gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY) + # changed threshold value from 150 to 195 + th1, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY) img_bin = ~img_bin img_bin = isolate_vertical_and_horizontal_components(img_bin) @@ -88,9 +91,12 @@ def parse_table(image: np.array): _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S) stats = np.vstack(list(filter(is_large_enough, stats))) + print(stats) rects = stats[:, :-1][2:] + # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table` rects = list(remove_isolated(rects, input_sorted=True)) + print(rects) # print(f"{has_table_shape(rects) = }") # if not has_table_shape(rects): diff --git a/vidocp/utils/post_processing.py b/vidocp/utils/post_processing.py index 06dc1d9..77f8cab 100644 --- a/vidocp/utils/post_processing.py +++ b/vidocp/utils/post_processing.py @@ -29,8 +29,8 @@ def remove_included(rectangles): rectangles = map(vec_rect_to_xywh, rectangles) return rectangles - -def adjacent1d(n, m, tolerance=1): +# tolerance was set too low (1); most lines are 2px wide +def adjacent1d(n, m, tolerance=2): return abs(n - m) <= tolerance @@ -43,6 +43,7 @@ def adjacent(a, b): """ def adjacent2d(g, h, i, j, k, l): + #print(abs(g-h), [k
<= p <= l for p in [i, j]]) return adjacent1d(g, h) and any(k <= p <= l for p in [i, j]) if any(x is None for x in (a, b)): @@ -50,10 +51,17 @@ def adjacent(a, b): v1 = a.xmin, a.ymin v2 = a.xmax, a.ymax - + print("topleft and bottom right rec1", v1,v2) w1 = b.xmin, b.ymin w2 = b.xmax, b.ymax - + print("topleft and bottom right rec2", w1, w2) + # some rectangles are compared twice + print(( + adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), + adjacent2d(v1[0], w2[0], w1[1], w2[1], v1[1], v2[1]), + adjacent2d(v2[1], w1[1], w1[0], w2[0], v1[0], v2[0]), + adjacent2d(v1[1], w2[1], w1[0], w2[0], v1[0], v2[0]), + )) return any( ( adjacent2d(v2[0], w1[0], w1[1], w2[1], v1[1], v2[1]), @@ -81,6 +89,7 @@ def __remove_isolated_sorted(rectangles): return any(starmap(adjacent, [(left, center), (center, right)])) rectangles = list(map(xywh_to_vec_rect, rectangles)) + # print("rectangles after coordinates to vetor rectangles", len(rectangles), "\n", rectangles) lefts = [None, *rectangles[:-1]] rights = [*rectangles[1:], None] @@ -117,7 +126,6 @@ def xywh_to_vec_rect(rect): def vecs_to_vec_rect(rect): - print(rect) v1, v2 = rect return Rectangle(*v1, *v2)