cv-analysis-service/vidocp/table_parsing.py
Matthias Bisping f7d3e39692 nix dolles
2022-02-08 15:05:12 +01:00

112 lines
3.3 KiB
Python

from functools import partial
from itertools import chain, starmap
from operator import attrgetter
import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_rectangles
from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
def add_external_contours(image, img):
    """Draw the bounding box of every external contour of ``img`` onto ``image``.

    Contours are extracted from ``img`` with ``cv2.RETR_EXTERNAL`` and the
    axis-aligned bounding rectangle of each one is outlined on ``image`` with
    value 255 and a thickness of 1 pixel.

    Args:
        image: Array that is drawn on; it is modified in place.
        img: Array the contours are searched in (may be the same object as
            ``image``).

    Returns:
        ``image``, i.e. the very object that was passed in.
    """
    outlines, _hierarchy = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    # NOTE: a minimum-area filter on the contours was tried here and is
    # currently disabled.
    for left, top, width, height in map(cv2.boundingRect, outlines):
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(image, top_left, bottom_right, 255, 1)
    return image
def isolate_vertical_and_horizontal_components(img_bin, line_min_width=30):
    """Keep only the long horizontal and vertical strokes of a binary image.

    The image is opened morphologically twice: once with a
    ``1 x line_min_width`` kernel and once with a ``line_min_width x 1``
    kernel. The two results are combined with a bitwise OR.

    Args:
        img_bin: Binary image; presumably ``uint8`` with the strokes as
            non-zero foreground (verify against the caller).
        line_min_width: Length in pixels of the line-shaped structuring
            elements. Defaults to 30, the value that used to be hard-coded.

    Returns:
        The bitwise OR of the horizontally and the vertically opened image.

    Raises:
        ValueError: If ``line_min_width`` is smaller than 1, which would
            produce an empty structuring element.
    """
    if line_min_width < 1:
        raise ValueError(f"line_min_width must be >= 1, got {line_min_width}")

    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    horizontal = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    vertical = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)

    return horizontal | vertical
# FIXME: does not work yet
def has_table_shape(rects):
    """Check that each corner of the joint bounding box is matched by a rectangle.

    The bounding rectangle of all points produced by ``xywh_to_vecs`` is
    computed, and for each of its four corners (``xmin``/``xmax`` combined
    with ``ymin``/``ymax``) at least one input rectangle must have the
    corresponding corner "adjacent" to it, as decided by ``adjacent1d`` with
    ``tolerance=30`` on both coordinates.

    Args:
        rects: A ``list`` of rectangles in the form accepted by
            ``xywh_to_vecs`` / ``xywh_to_vec_rect`` (presumably
            ``(x, y, w, h)`` -- confirm against those helpers).

    Returns:
        ``True`` if all four corners of the joint bounding box are matched,
        ``False`` otherwise.
    """
    assert isinstance(rects, list)

    corner_points = list(chain.from_iterable(map(xywh_to_vecs, rects)))
    brect = xywh_to_vec_rect(cv2.boundingRect(np.vstack(corner_points)))
    vec_rects = [xywh_to_vec_rect(r) for r in rects]

    def matches_bounding_rect_corner(rect, x_attr, y_attr):
        # Compare the named corner of `rect` with the same corner of the
        # joint bounding box, coordinate by coordinate.
        pick_corner = attrgetter(x_attr, y_attr)
        return all(
            adjacent1d(outer, inner, tolerance=30)
            for outer, inner in zip(pick_corner(brect), pick_corner(rect))
        )

    corners = (
        ("xmin", "ymin"),
        ("xmin", "ymax"),
        ("xmax", "ymax"),
        ("xmax", "ymin"),
    )
    # Evaluate every corner up front (no short-circuit across corners).
    corner_hits = [
        any(matches_bounding_rect_corner(r, x_attr, y_attr) for r in vec_rects)
        for x_attr, y_attr in corners
    ]
    return all(corner_hits)
def parse_table(image: np.ndarray):
    """Detect candidate table-cell rectangles in a page image.

    Steps:
      1. Convert to grayscale, threshold at 150 and invert.
      2. Keep only long horizontal/vertical strokes and outline their
         external contours.
      3. Label the connected components of the *inverted* line image, i.e.
         the regions between the lines, and keep those whose area exceeds
         3000 pixels.
      4. Drop the first two remaining components and pass the rest through
         ``remove_isolated``.

    Args:
        image: Three-channel image; it is converted with
            ``cv2.COLOR_BGR2GRAY``.

    Returns:
        A list with whatever ``remove_isolated`` yields for the remaining
        ``(x, y, w, h)`` rows. An empty list is returned when no component
        is left after filtering (previously this raised ``ValueError`` from
        ``np.vstack`` on an empty sequence).
    """
    min_area = 3000  # components must be strictly larger than this

    gray_scale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, img_bin = cv2.threshold(gray_scale, 150, 255, cv2.THRESH_BINARY)
    # Invert so that pixels at or below the threshold become non-zero.
    img_bin = ~img_bin
    img_bin = isolate_vertical_and_horizontal_components(img_bin)
    # Source and drawing target are deliberately the same array here.
    img_bin_final = add_external_contours(img_bin, img_bin)

    _, _, stats, _ = cv2.connectedComponentsWithStats(~img_bin_final, connectivity=8, ltype=cv2.CV_32S)

    # Each stats row is (x, y, w, h, area). A boolean mask keeps the result
    # two-dimensional even when nothing passes the filter.
    stats = stats[stats[:, cv2.CC_STAT_AREA] > min_area]

    # Drop the area column and skip the first two components -- presumably
    # the background label and the page-sized outer region (TODO confirm).
    rects = stats[:, :-1][2:]
    if len(rects) == 0:
        return []

    # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
    rects = list(remove_isolated(rects, input_sorted=True))

    # NOTE: an additional `has_table_shape(rects)` check is intentionally not
    # applied here; that function is marked as not working yet.
    return rects
def annotate_tables_in_pdf(pdf_path, page_index=1):
    """Render one PDF page, mark the detected table rectangles and display it.

    Args:
        pdf_path: Path of the PDF file, handed to
            ``pdf2image.convert_from_path``.
        page_index: Index of the page to process; ``page_index + 1`` is
            passed as both ``first_page`` and ``last_page``. Defaults to 1.

    Returns:
        None. The annotated page is shown through ``show_mpl``.
    """
    page_number = page_index + 1
    rendered_pages = pdf2image.convert_from_path(
        pdf_path,
        first_page=page_number,
        last_page=page_number,
    )
    page_pixels = np.array(rendered_pages[0])

    table_rects = parse_table(page_pixels)
    annotated_page = draw_rectangles(page_pixels, table_rects, annotate=True)
    show_mpl(annotated_page)