filtering non-tables by bounding rect check WIP

This commit is contained in:
Matthias Bisping 2022-02-06 21:03:40 +01:00
parent 36284f9a78
commit 90b8613bf8
3 changed files with 68 additions and 13 deletions

View File

@ -1,4 +1,6 @@
from functools import partial
from itertools import chain, starmap
from operator import attrgetter
import cv2
import numpy as np
@ -7,7 +9,7 @@ from pdf2image import pdf2image
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_stats, draw_rectangles
from vidocp.utils.filters import is_large_enough
from vidocp.utils.post_processing import remove_isolated
from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d
def add_external_contours(image, img):
@ -37,8 +39,46 @@ def isolate_vertical_and_horizontal_components(img_bin):
return img_bin_final
def has_table_shape(rects, *, tolerance=30):
    """Check that every corner of the rects' common bounding box is matched by a rect.

    The bounding rectangle enclosing all of `rects` is computed, and for each
    of its four corners at least one rect must have its corresponding corner
    adjacent to it, as judged per coordinate by `adjacent1d`.

    Args:
        rects: Iterable of rectangles in (x, y, w, h) form.
        tolerance: Forwarded to `adjacent1d` as its `tolerance` argument;
            presumably the maximum per-axis distance in pixels (confirm
            against `adjacent1d`).

    Returns:
        True if all four bounding-rect corners are matched by some rect,
        False otherwise. An empty `rects` yields False.
    """
    rects = list(rects)
    if not rects:
        # np.vstack raises on an empty sequence; no rects cannot form a table.
        return False

    corner_points = list(chain.from_iterable(map(xywh_to_vecs, rects)))
    brect = xywh_to_vec_rect(cv2.boundingRect(np.vstack(corner_points)))
    vec_rects = [xywh_to_vec_rect(r) for r in rects]

    def matches_bounding_rect_corner(rect, x, y):
        # x / y are attribute names (e.g. "xmin", "ymax") read from both the
        # bounding rect and the candidate rect and compared pairwise.
        return all(
            adjacent1d(getattr(brect, name), getattr(rect, name), tolerance=tolerance)
            for name in (x, y)
        )

    corners = (
        ("xmin", "ymin"),
        ("xmin", "ymax"),
        ("xmax", "ymax"),
        ("xmax", "ymin"),
    )
    return all(
        any(matches_bounding_rect_corner(r, x, y) for r in vec_rects)
        for x, y in corners
    )
def parse_table(image: np.array):
def is_large_enough(stat):
x1, y1, w, h, area = stat
return area > 3000
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
img_bin = ~img_bin
@ -48,16 +88,14 @@ def parse_table(image: np.array):
_, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
def is_large_enough(stat):
x1, y1, w, h, area = stat
return area > 3000
stats = np.vstack(list(filter(is_large_enough, stats)))
rects = stats[:, :-1][2:]
rects = list(remove_isolated(rects, input_sorted=True))
# FIXME: For some reason some isolated rects remain.
rects = remove_isolated(rects, input_sorted=True)
# print(f"{has_table_shape(rects) = }")
# if not has_table_shape(rects):
# print(111111111111111111111)
# return []
return rects
@ -68,6 +106,6 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
page = np.array(page)
stats = parse_table(page)
page = draw_rectangles(page, stats)
page = draw_rectangles(page, stats, annotate=True)
show_mpl(page)

View File

@ -13,7 +13,10 @@ def draw_contours(image, contours):
return image
def draw_rectangles(image, rectangles, color=None):
def draw_rectangles(image, rectangles, color=None, annotate=False):
def annotate_rect(x, y, w, h):
cv2.putText(image, "+", (x + (w // 2) - 12, y + (h // 2) + 9), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
image = copy_and_normalize_channels(image)
@ -24,13 +27,14 @@ def draw_rectangles(image, rectangles, color=None):
x, y, w, h = rect
cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
if annotate:
annotate_rect(x, y, w, h)
return image
def draw_stats(image, stats, annotate=False):
image = copy_and_normalize_channels(image)
keys = ["x", "y", "w", "h"]
def annotate_stat(x, y, w, h):
@ -50,6 +54,8 @@ def draw_stats(image, stats, annotate=False):
if annotate:
annotate_stat(x, y, w, h)
image = copy_and_normalize_channels(image)
for stat in stats[2:]:
draw_stat(stat)

View File

@ -111,10 +111,21 @@ def has_no_parent(hierarchy):
def xywh_to_vec_rect(rect):
    """Convert an (x, y, w, h) rectangle into a `Rectangle`.

    The rect is first split into its two corner vectors via `xywh_to_vecs`;
    the coordinates of both are then passed positionally to `Rectangle`,
    first corner first.
    """
    first_corner, second_corner = xywh_to_vecs(rect)
    return Rectangle(*first_corner, *second_corner)
def vecs_to_vec_rect(rect):
    """Build a `Rectangle` from a pair of corner vectors.

    Args:
        rect: Pair (v1, v2) of coordinate sequences.

    Returns:
        `Rectangle(*v1, *v2)`, i.e. the coordinates of v1 followed by those
        of v2, passed positionally.
    """
    v1, v2 = rect
    return Rectangle(*v1, *v2)
def xywh_to_vecs(rect):
    """Convert an (x, y, w, h) rectangle into its two opposite corner points.

    Args:
        rect: Sequence of exactly four values (x, y, w, h).

    Returns:
        A pair ((x1, y1), (x2, y2)) where (x1, y1) is the given origin and
        (x2, y2) is (x1 + w, y1 + h).
    """
    x1, y1, w, h = rect
    return (x1, y1), (x1 + w, y1 + h)
def vec_rect_to_xywh(rect):