cv-analysis-service/vidocp/table_parsing.py

116 lines
3.6 KiB
Python

from functools import partial
from itertools import chain, starmap
from operator import attrgetter
import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_rectangles
from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
def add_external_contours(image, img):
    """Draw the bounding box of each external contour of ``img`` onto ``image``.

    Contours are retrieved with ``cv2.RETR_EXTERNAL``, so only outermost
    contours contribute a rectangle. Each rectangle is drawn with value 255
    and a thickness of 1 pixel.

    Args:
        image: Image the rectangles are drawn on. It is drawn on directly and
            the same object is returned.
        img: Image handed to ``cv2.findContours``.

    Returns:
        ``image`` after all rectangles have been drawn.
    """
    outer_contours, _hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for left, top, width, height in map(cv2.boundingRect, outer_contours):
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(image, top_left, bottom_right, 255, 1)
    return image
def isolate_vertical_and_horizontal_components(img_bin, line_min_width=30):
    """Keep only the horizontal and vertical strokes of a binary image.

    The image is morphologically opened twice: once with a
    ``1 x line_min_width`` kernel (keeps horizontal runs) and once with a
    ``line_min_width x 1`` kernel (keeps vertical runs). The two results are
    combined with a bitwise OR.

    Args:
        img_bin: Binary image to filter.
            NOTE(review): presumably uint8 with line pixels non-zero, since the
            caller in this file inverts the thresholded page first - confirm.
        line_min_width: Length in pixels of both structuring elements.
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        The bitwise OR of the horizontally and the vertically opened image.
    """
    horizontal_kernel = np.ones((1, line_min_width), np.uint8)
    vertical_kernel = np.ones((line_min_width, 1), np.uint8)
    horizontal_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    vertical_lines = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    return horizontal_lines | vertical_lines
# FIXME: does not work yet
def has_table_shape(rects):
    """Check whether every corner of the overall bounding box is matched by a rectangle.

    The bounding rectangle of all points produced by ``xywh_to_vecs`` for
    ``rects`` is computed. For each of its four corners (named by the attribute
    pairs ``xmin``/``xmax`` and ``ymin``/``ymax``), at least one rectangle must
    have the same-named corner for which ``adjacent1d`` (tolerance 30) holds on
    both coordinates.

    Args:
        rects: List of rectangles accepted by ``xywh_to_vecs`` and
            ``xywh_to_vec_rect`` (presumably ``(x, y, w, h)`` tuples - confirm
            against those helpers).

    Returns:
        ``True`` if all four bounding-box corners are matched, else ``False``.
    """
    assert isinstance(rects, list)

    corner_points = list(chain.from_iterable(map(xywh_to_vecs, rects)))
    bounding = xywh_to_vec_rect(cv2.boundingRect(np.vstack(corner_points)))
    vec_rects = [xywh_to_vec_rect(rect) for rect in rects]
    is_adjacent = partial(adjacent1d, tolerance=30)

    def matches_bounding_rect_corner(rect, x, y):
        # Compare the named corner of `rect` with the same corner of the
        # bounding rectangle, coordinate by coordinate.
        pick_corner = attrgetter(x, y)
        coordinate_pairs = zip(pick_corner(bounding), pick_corner(rect))
        return all(is_adjacent(expected, actual) for expected, actual in coordinate_pairs)

    corners = (
        ("xmin", "ymin"),
        ("xmin", "ymax"),
        ("xmax", "ymax"),
        ("xmax", "ymin"),
    )
    # A list (not a generator) so that all four corner checks are evaluated.
    corner_matched = [
        any(matches_bounding_rect_corner(rect, x, y) for rect in vec_rects)
        for x, y in corners
    ]
    return all(corner_matched)
def parse_table(image: np.ndarray):
    """Detect table cells in a page image and return their rectangles.

    Processing steps:
      1. Convert to grayscale, threshold at 195 and invert the result.
      2. Keep only horizontal and vertical strokes.
      3. Draw the bounding boxes of the external contours onto the stroke image.
      4. Run connected-component analysis on the inverted stroke image.
      5. Drop components that are too small, skip the first two remaining
         components, and pass the rest through ``remove_isolated``.

    Args:
        image: Page image; it is converted with ``cv2.COLOR_BGR2GRAY``, so a
            3-channel image is expected.

    Returns:
        List of the rectangles yielded by ``remove_isolated``. Before that step
        each candidate is a ``(left, top, width, height)`` row of the
        connected-component statistics.
    """
    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # The threshold was raised from 150 because of a shaded edge-case table.
    # (The original note says "to 200"; the value actually used is 195.)
    _, img_bin = cv2.threshold(gray_scale, 195, 255, cv2.THRESH_BINARY)
    img_bin = ~img_bin

    img_bin = isolate_vertical_and_horizontal_components(img_bin)
    img_bin_final = add_external_contours(img_bin, img_bin)

    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)

    # Size filter. The minimum area was previously set too high (3000): cells
    # in a table can be smaller, e.g. a column titled "No." has approximately
    # an area of 500 px with 11pt letters. The extra width/height conditions
    # filter out weirdly narrow rectangles.
    # A boolean mask is used instead of np.vstack(list(filter(...))) because
    # np.vstack raises ValueError when no component passes the filter.
    widths = stats[:, cv2.CC_STAT_WIDTH]
    heights = stats[:, cv2.CC_STAT_HEIGHT]
    areas = stats[:, cv2.CC_STAT_AREA]
    is_large_enough = (areas > 500) & (widths > 35) & (heights > 15)
    stats = stats[is_large_enough]

    # Drop the area column, then skip the first two remaining components.
    # NOTE(review): presumably these are the background label and the region
    # surrounding the table - confirm.
    rects = stats[:, :-1][2:]

    # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
    rects = list(remove_isolated(rects, input_sorted=True))

    return rects
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one page of a PDF, detect its table cells and display them.

    Args:
        pdf_path: Path of the PDF file passed to ``pdf2image.convert_from_path``.
        page_index: Index of the page to process. One is added before it is
            used as both ``first_page`` and ``last_page``. Defaults to 1.

    The annotated page is shown via ``show_mpl``; nothing is returned.
    """
    page_number = page_index + 1
    rendered_pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    # NOTE(review): pdf2image presumably yields RGB images while parse_table
    # converts with COLOR_BGR2GRAY - confirm the channel order is intended.
    page_pixels = np.array(rendered_pages[0])
    cell_rects = parse_table(page_pixels)
    annotated_page = draw_rectangles(page_pixels, cell_rects, annotate=True)
    show_mpl(annotated_page)