filtering non-tables by bounding rect check WIP
This commit is contained in:
parent
36284f9a78
commit
90b8613bf8
@ -1,4 +1,6 @@
|
||||
from functools import partial
|
||||
from itertools import chain, starmap
|
||||
from operator import attrgetter
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
@ -7,7 +9,7 @@ from pdf2image import pdf2image
|
||||
from vidocp.utils.display import show_mpl
|
||||
from vidocp.utils.draw import draw_stats, draw_rectangles
|
||||
from vidocp.utils.filters import is_large_enough
|
||||
from vidocp.utils.post_processing import remove_isolated
|
||||
from vidocp.utils.post_processing import remove_isolated, xywh_to_vecs, xywh_to_vec_rect, vecs_to_vec_rect, adjacent1d
|
||||
|
||||
|
||||
def add_external_contours(image, img):
|
||||
@ -37,8 +39,46 @@ def isolate_vertical_and_horizontal_components(img_bin):
|
||||
return img_bin_final
|
||||
|
||||
|
||||
def has_table_shape(rects, *, tolerance=30):
    """Check whether a group of rectangles fills the corners of its bounding box.

    The joint bounding rectangle of all ``rects`` is computed, and for each of
    its four corners at least one rectangle must have the corresponding corner
    adjacent to it (per ``adjacent1d`` on both the x and the y coordinate).

    Args:
        rects: Iterable of ``(x, y, w, h)`` rectangles. It is materialized
            once, so generators are accepted as well as lists.
        tolerance: Value forwarded to ``adjacent1d`` as ``tolerance``;
            presumably the maximum allowed coordinate distance in pixels
            (confirm against ``adjacent1d``).

    Returns:
        True if every bounding-rectangle corner is matched by some rectangle,
        False otherwise. An empty input yields False.
    """
    # The rectangles are traversed several times below, so materialize them.
    rects = list(rects)
    if not rects:
        # np.vstack would raise on an empty sequence; nothing cannot be a table.
        return False

    corner_points = list(chain.from_iterable(map(xywh_to_vecs, rects)))
    brect = xywh_to_vec_rect(cv2.boundingRect(np.vstack(corner_points)))
    vec_rects = [xywh_to_vec_rect(rect) for rect in rects]

    is_adjacent = partial(adjacent1d, tolerance=tolerance)

    def matches_bounding_rect_corner(rect, x, y):
        # Pair up (brect.x, rect.x) and (brect.y, rect.y) for the chosen corner.
        corresp_coords = zip(*map(attrgetter(x, y), (brect, rect)))
        return all(starmap(is_adjacent, corresp_coords))

    corners = (
        ("xmin", "ymin"),
        ("xmin", "ymax"),
        ("xmax", "ymax"),
        ("xmax", "ymin"),
    )
    return all(
        any(matches_bounding_rect_corner(rect, x, y) for rect in vec_rects)
        for x, y in corners
    )
|
||||
|
||||
|
||||
def parse_table(image: np.array):
|
||||
|
||||
def is_large_enough(stat):
|
||||
x1, y1, w, h, area = stat
|
||||
return area > 3000
|
||||
|
||||
gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
|
||||
th1, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
|
||||
img_bin = ~img_bin
|
||||
@ -48,16 +88,14 @@ def parse_table(image: np.array):
|
||||
|
||||
_, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)
|
||||
|
||||
def is_large_enough(stat):
|
||||
x1, y1, w, h, area = stat
|
||||
return area > 3000
|
||||
|
||||
stats = np.vstack(list(filter(is_large_enough, stats)))
|
||||
|
||||
rects = stats[:, :-1][2:]
|
||||
rects = list(remove_isolated(rects, input_sorted=True))
|
||||
|
||||
# FIXME: For some reason some isolated rects remain.
|
||||
rects = remove_isolated(rects, input_sorted=True)
|
||||
# print(f"{has_table_shape(rects) = }")
|
||||
# if not has_table_shape(rects):
|
||||
# print(111111111111111111111)
|
||||
# return []
|
||||
|
||||
return rects
|
||||
|
||||
@ -68,6 +106,6 @@ def annotate_tables_in_pdf(pdf_path, page_index=1):
|
||||
page = np.array(page)
|
||||
|
||||
stats = parse_table(page)
|
||||
page = draw_rectangles(page, stats)
|
||||
page = draw_rectangles(page, stats, annotate=True)
|
||||
|
||||
show_mpl(page)
|
||||
|
||||
@ -13,7 +13,10 @@ def draw_contours(image, contours):
|
||||
return image
|
||||
|
||||
|
||||
def draw_rectangles(image, rectangles, color=None):
|
||||
def draw_rectangles(image, rectangles, color=None, annotate=False):
|
||||
|
||||
def annotate_rect(x, y, w, h):
|
||||
cv2.putText(image, "+", (x + (w // 2) - 12, y + (h // 2) + 9), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
|
||||
|
||||
image = copy_and_normalize_channels(image)
|
||||
|
||||
@ -24,13 +27,14 @@ def draw_rectangles(image, rectangles, color=None):
|
||||
x, y, w, h = rect
|
||||
cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
|
||||
|
||||
if annotate:
|
||||
annotate_rect(x, y, w, h)
|
||||
|
||||
return image
|
||||
|
||||
|
||||
def draw_stats(image, stats, annotate=False):
|
||||
|
||||
image = copy_and_normalize_channels(image)
|
||||
|
||||
keys = ["x", "y", "w", "h"]
|
||||
|
||||
def annotate_stat(x, y, w, h):
|
||||
@ -50,6 +54,8 @@ def draw_stats(image, stats, annotate=False):
|
||||
if annotate:
|
||||
annotate_stat(x, y, w, h)
|
||||
|
||||
image = copy_and_normalize_channels(image)
|
||||
|
||||
for stat in stats[2:]:
|
||||
draw_stat(stat)
|
||||
|
||||
|
||||
@ -111,10 +111,21 @@ def has_no_parent(hierarchy):
|
||||
|
||||
|
||||
def xywh_to_vec_rect(rect):
    """Convert an ``(x, y, w, h)`` rectangle into a ``Rectangle``.

    The rectangle is first turned into its two corner vectors via
    ``xywh_to_vecs``; their components are then passed, in order, to the
    ``Rectangle`` constructor.
    """
    first_corner, second_corner = xywh_to_vecs(rect)
    return Rectangle(*first_corner, *second_corner)
|
||||
|
||||
|
||||
def vecs_to_vec_rect(rect):
    """Build a ``Rectangle`` from a pair of corner vectors.

    Args:
        rect: A two-element sequence ``(v1, v2)``; the components of ``v1``
            followed by those of ``v2`` are passed to ``Rectangle``.
            Presumably ``v1`` is the (xmin, ymin) corner and ``v2`` the
            (xmax, ymax) corner -- confirm against ``Rectangle``.

    Returns:
        The ``Rectangle`` constructed from the unpacked vectors.
    """
    v1, v2 = rect
    return Rectangle(*v1, *v2)
|
||||
|
||||
|
||||
def xywh_to_vecs(rect):
    """Convert an ``(x, y, w, h)`` rectangle into its two corner points.

    Args:
        rect: A four-element sequence ``(x, y, w, h)``.

    Returns:
        A pair ``((x1, y1), (x2, y2))`` where ``x2 = x1 + w`` and
        ``y2 = y1 + h``.
    """
    x1, y1, w, h = rect
    x2 = x1 + w
    y2 = y1 + h
    # Return plain corner tuples; callers unpack them as ``v1, v2``.
    return (x1, y1), (x2, y2)
|
||||
|
||||
|
||||
def vec_rect_to_xywh(rect):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user