cv-analysis-service/vidocp/table_parsing.py
2022-03-14 21:26:49 +01:00

157 lines
5.2 KiB
Python

from functools import partial
from itertools import chain, starmap
from operator import attrgetter
import cv2
import numpy as np
from pdf2image import pdf2image
from vidocp.utils.display import show_mpl
from vidocp.utils.draw import draw_rectangles
from vidocp.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d, remove_isolated
from vidocp.utils.deskew import deskew_histbased
from vidocp.layout_parsing import parse_layout
def add_external_contours(image, img):
    """Outline every external contour of ``img`` with its bounding box on ``image``.

    Args:
        image: Array the rectangles are drawn onto. It is modified in place.
        img: Image handed to ``cv2.findContours``; only the outermost
            contours (``cv2.RETR_EXTERNAL``) are considered.

    Returns:
        ``image`` with one rectangle (value 255, thickness 1) per contour.
    """
    outer_contours, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    # An area-based filter on the contours (min_area=5000000) was tried here
    # and is currently disabled.
    for left, top, width, height in map(cv2.boundingRect, outer_contours):
        cv2.rectangle(image, (left, top), (left + width, top + height), 255, 1)
    return image
def isolate_vertical_and_horizontal_components(img_bin, bounding_rects, show=False):
    """Extract the horizontal and vertical line structure of a binarised page.

    The image is opened with a 1 x 48 and a 48 x 1 kernel, each result is
    dilated and motion-blurred along its own direction, both are OR-ed
    together, re-thresholded and finally cut along ``bounding_rects``.

    Args:
        img_bin: Binarised image (``parse_table`` passes the output of
            ``preprocess``).
        bounding_rects: Iterable of ``(x, y, w, h)`` rectangles forwarded to
            ``disconnect_non_existing_cells``.
        show: When true, display the intermediate images via ``show_mpl``.

    Returns:
        The processed line image.
    """
    line_min_width = 48

    # Morphological opening with a long, one pixel thin kernel per direction.
    horizontal = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, np.ones((1, line_min_width), np.uint8))
    vertical = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, np.ones((line_min_width, 1), np.uint8))
    if show:
        show_mpl(horizontal | vertical)

    # Stretch the detected lines along their own direction.
    horizontal = cv2.dilate(horizontal, np.ones((1, 30), np.uint8), iterations=2)
    vertical = cv2.dilate(vertical, np.ones((30, 1), np.uint8), iterations=2)

    # Filter size was reduced from 100 to 80 to minimize splitting narrow cells.
    horizontal = apply_motion_blur(horizontal, 80, 0)
    vertical = apply_motion_blur(vertical, 80, 90)

    combined = horizontal | vertical
    if show:
        show_mpl(combined)

    # Threshold was changed from 110 to 120 to minimize cell splitting.
    _, combined = cv2.threshold(combined, 120, 255, cv2.THRESH_BINARY)
    # Dilation with a 1 x 1 kernel; kept exactly as in the original pipeline.
    combined = cv2.dilate(combined, np.ones((1, 1), np.uint8), iterations=1)

    # Known problem: the layout parser may detect too big of a layout box,
    # as in VV-748542.pdf p.22.
    return disconnect_non_existing_cells(combined, bounding_rects)
def disconnect_non_existing_cells(img_bin, bounding_rects):
    """Draw a black outline of thickness 5 along each bounding rectangle.

    Args:
        img_bin: Image to draw on.
        bounding_rects: Iterable of ``(x, y, w, h)`` rectangles.

    Returns:
        The image after all outlines have been drawn.
    """
    black = (0, 0, 0)
    outline_thickness = 5
    for x, y, w, h in bounding_rects:
        img_bin = cv2.rectangle(img_bin, (x, y), (x + w, y + h), black, outline_thickness)
    return img_bin
# FIXME: does not work yet
def has_table_shape(rects):
    """Check that each corner of the overall bounding box is matched by a rect.

    For each of the four corners of the rectangle enclosing all ``rects``,
    at least one rect must have the corresponding corner coordinates
    adjacent (``adjacent1d`` with a tolerance of 30) to the enclosing one.

    Args:
        rects: List of ``(x, y, w, h)`` rectangles.

    Returns:
        True when all four corners are matched, otherwise False.
    """
    assert isinstance(rects, list)

    corner_points = list(chain.from_iterable(map(xywh_to_vecs, rects)))
    brect = xywh_to_vec_rect(cv2.boundingRect(np.vstack(corner_points)))
    vec_rects = [xywh_to_vec_rect(rect) for rect in rects]

    is_close = partial(adjacent1d, tolerance=30)

    def shares_corner_with_brect(rect, x_name, y_name):
        # Compare the x coordinates and the y coordinates pairwise.
        pick = attrgetter(x_name, y_name)
        return all(starmap(is_close, zip(pick(brect), pick(rect))))

    corners = (("xmin", "ymin"), ("xmin", "ymax"), ("xmax", "ymax"), ("xmax", "ymin"))
    # The list is built eagerly so every corner is evaluated.
    corner_matched = [
        any(shares_corner_with_brect(rect, x_name, y_name) for rect in vec_rects)
        for x_name, y_name in corners
    ]
    return all(corner_matched)
def apply_motion_blur(image, size, angle):
    """Filter ``image`` with a normalised, rotated line kernel.

    Args:
        image: Image passed to ``cv2.filter2D``.
        size: Side length of the square kernel.
        angle: Rotation handed to ``cv2.getRotationMatrix2D``; this module
            calls it with 0 and 90.

    Returns:
        The filtered image, with the same depth as the input (``ddepth=-1``).
    """
    kernel = np.zeros((size, size), dtype=np.float32)
    middle_row = (size - 1) // 2
    kernel[middle_row, :] = np.ones(size, dtype=np.float32)

    # Rotate the line about the kernel centre.
    centre = (size / 2 - 0.5, size / 2 - 0.5)
    rotation = cv2.getRotationMatrix2D(centre, angle, 1.0)
    kernel = cv2.warpAffine(kernel, rotation, (size, size))

    # Scale the kernel so that its weights sum to one.
    kernel = kernel * (1.0 / np.sum(kernel))
    return cv2.filter2D(image, -1, kernel)
def find_table_layout_boxes(image: np.ndarray, min_area: int = 100000) -> list:
    """Return the layout boxes that are large enough to be table candidates.

    Args:
        image: Image handed to ``parse_layout``.
        min_area: Minimum ``w * h`` a box must have to be kept. The default
            is the threshold that used to be hard-coded.

    Returns:
        The ``(x, y, w, h)`` boxes from ``parse_layout`` whose area is at
        least ``min_area``, in their original order.
    """

    def area(box):
        _, _, w, h = box
        return w * h

    return [box for box in parse_layout(image) if area(box) >= min_area]
def preprocess(image: np.ndarray) -> np.ndarray:
    """Convert ``image`` to an inverted binary image.

    Images with more than two dimensions are converted to grayscale first.
    Pixels brighter than 195 become 255 and all others 0; the result is then
    bitwise inverted.

    Args:
        image: Grayscale (2-D) or colour image.

    Returns:
        The inverted, thresholded single-channel image.
    """
    if image.ndim > 2:
        # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order, while
        # annotate_tables_in_pdf passes pages rendered by pdf2image -
        # confirm which channel order callers actually provide.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, image = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
    return ~image
def parse_table(image: np.ndarray, show=False) -> list:
    """Detect table cell rectangles in a page image.

    The page is binarised, reduced to its horizontal/vertical line structure
    and outlined with its external contours. The connected components of the
    inverted result that are large enough are returned as rectangles.

    Args:
        image: Page image, grayscale or colour.
        show: When true, display the preprocessed image via ``show_mpl``.

    Returns:
        List of ``(x, y, w, h)`` rectangles. Empty when no connected
        component passes the size filter.
    """

    def is_large_enough(stat):
        x1, y1, w, h, area = stat
        return area > 2000 and w > 35 and h > 25

    image = preprocess(image)
    if show:
        show_mpl(image)

    table_layout_boxes = find_table_layout_boxes(image)
    image = isolate_vertical_and_horizontal_components(image, table_layout_boxes)
    image = add_external_contours(image, image)

    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)

    large_stats = [stat for stat in stats if is_large_enough(stat)]
    if not large_stats:
        # np.vstack raises ValueError on an empty sequence; a page without
        # any sufficiently large component simply has no cells.
        return []

    # Drop the area column, then skip the first two remaining components
    # (presumably the background and the outer frame - TODO confirm).
    rects = np.vstack(large_stats)[:, :-1][2:]

    # FIXME: produces false negatives for `data0/043d551b4c4c768b899eaece4466c836.pdf 1 --type table`
    rects = remove_isolated(rects, input_sorted=True)
    return list(rects)
def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=True):
    """Render one PDF page and draw the detected table cells onto it.

    Args:
        pdf_path: Path of the PDF file.
        page_index: Zero-based index of the page to process.
        deskew: When true, run ``deskew_histbased`` on the page first.
        show: When true, display the annotated page via ``show_mpl`` and
            return nothing; otherwise return the annotated page.

    Returns:
        The annotated page when ``show`` is false, otherwise ``None``.
    """
    # convert_from_path is given page_index + 1 as both first and last page.
    page_number = page_index + 1
    rendered_pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    # NOTE(review): the rendered page is passed on as-is while preprocess
    # converts with COLOR_BGR2GRAY - confirm the channel order matches.
    page = np.array(rendered_pages[0])

    if deskew:
        page, _ = deskew_histbased(page)

    cell_rects = parse_table(page)
    page = draw_rectangles(page, cell_rects, annotate=True)

    if not show:
        return page
    show_mpl(page)