cv-analysis-service/cv_analysis/table_parsing.py
2022-04-27 11:27:38 +02:00

203 lines
6.7 KiB
Python

from functools import partial
from itertools import chain, starmap
from operator import attrgetter
from os.path import join
import cv2
import numpy as np
from pdf2image import pdf2image
from cv_analysis.utils.display import show_mpl
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d
from cv_analysis.utils.deskew import deskew_histbased
from cv_analysis.utils.filters import is_large_enough
from cv_analysis.utils.visual_logging import vizlogger
from cv_analysis.layout_parsing import parse_layout
def add_external_contours(image, contour_source_image):
    """Draw the bounding boxes of large outer contours onto ``image``.

    Outer contours are searched in ``contour_source_image``. For every contour
    accepted by ``is_large_enough`` (called with ``min_area=5000``) the bounding
    rectangle is drawn onto ``image`` with value 255 and a line thickness of 1.

    Args:
        image: array the rectangles are drawn on
        contour_source_image: array in which the contours are searched
    Returns:
        the ``image`` object that was passed in, with the rectangles drawn in
    """
    found_contours, _ = cv2.findContours(contour_source_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    large_contours = (contour for contour in found_contours if is_large_enough(contour, min_area=5000))
    for contour in large_contours:
        left, top, width, height = cv2.boundingRect(contour)
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(image, top_left, bottom_right, 255, 1)
    vizlogger.debug(image, "external_contours.png")
    return image
def extend_lines():
    """Placeholder that is not implemented yet; does nothing and returns None."""
    # TODO: implement
    return None
def make_table_block_mask():
    """Placeholder that is not implemented yet; does nothing and returns None."""
    # TODO: implement
    return None
def apply_motion_blur(image: np.array, angle, size=80):
    """Blur an image along one direction with a normalized line kernel.

    Used to solidify and slightly extend detected lines.

    Args:
        image (np.array): page image as array
        angle: direction in which to apply blur, 0 or 90
        size (int): kernel size; 80 found empirically to work well
    Returns:
        np.array: the filtered image
    """
    kernel = np.zeros((size, size), dtype=np.float32)
    vizlogger.debug(kernel, "tables08_blur_kernel1.png")

    # Start from a single row of ones through the middle of the kernel ...
    middle_row = (size - 1) // 2
    kernel[middle_row, :] = np.ones(size, dtype=np.float32)
    vizlogger.debug(kernel, "tables09_blur_kernel2.png")

    # ... rotate it around the kernel centre by the requested angle ...
    centre = (size / 2 - 0.5, size / 2 - 0.5)
    rotation = cv2.getRotationMatrix2D(centre, angle, 1.0)
    kernel = cv2.warpAffine(kernel, rotation, (size, size))
    vizlogger.debug(kernel, "tables10_blur_kernel3.png")

    # ... and scale it so that its entries sum to one.
    kernel = kernel * (1.0 / np.sum(kernel))
    vizlogger.debug(kernel, "tables11_blur_kernel4.png")

    return cv2.filter2D(image, -1, kernel)
def isolate_vertical_and_horizontal_components(img_bin):
    """Extract and reinforce the horizontal and vertical lines of a binary image.

    Args:
        img_bin (np.array): array corresponding to single binarized page image
    Returns:
        np.array: the reinforced line image, with the bounding boxes of large
            outer contours of the raw lines drawn in
    """
    min_line_length = 48
    opening_kernel_h = np.ones((1, min_line_length), np.uint8)
    opening_kernel_v = np.ones((min_line_length, 1), np.uint8)

    # Opening with a long, one pixel thick kernel isolates the structures running in that direction.
    horizontal = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, opening_kernel_h)
    vizlogger.debug(horizontal, "tables01_isolate01_img_bin_h.png")
    vertical = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, opening_kernel_v)
    raw_lines = vertical | horizontal
    vizlogger.debug(raw_lines, "tables02_isolate02_img_bin_v.png")

    # Stretch every line along its own direction.
    dilation_kernel_h = np.ones((1, 30), np.uint8)
    dilation_kernel_v = np.ones((30, 1), np.uint8)
    horizontal = cv2.dilate(horizontal, dilation_kernel_h, iterations=2)
    vizlogger.debug(horizontal, "tables03_isolate03_dilate_h.png")
    vertical = cv2.dilate(vertical, dilation_kernel_v, iterations=2)
    vizlogger.debug(vertical | horizontal, "tables04_isolate04_dilate_v.png")

    horizontal = apply_motion_blur(horizontal, 0)
    vizlogger.debug(horizontal, "tables09_isolate05_blur_h.png")
    vertical = apply_motion_blur(vertical, 90)
    vizlogger.debug(vertical | horizontal, "tables10_isolate06_blur_v.png")

    combined = horizontal | vertical
    vizlogger.debug(combined, "tables11_isolate07_final.png")

    # Binarize again, since blurring introduced intermediate gray values.
    _, combined = cv2.threshold(combined, 120, 255, cv2.THRESH_BINARY)
    vizlogger.debug(combined, "tables10_isolate12_threshold.png")
    combined = cv2.dilate(combined, np.ones((1, 1), np.uint8), iterations=1)
    vizlogger.debug(combined, "tables11_isolate13_dilate.png")

    # The contours are taken from the raw lines, i.e. from before dilation and blurring extended them.
    combined = add_external_contours(combined, raw_lines)
    vizlogger.debug(combined, "tables11_isolate14_contours_added.png")
    return combined
def has_table_shape(rects):
    """Check whether all four corners of the joint bounding box are matched by a rectangle corner.

    The bounding rectangle of all points returned by ``xywh_to_vecs`` for the
    given rectangles is computed. The result is True if, for each of its four
    corners, at least one rectangle has the same corner within a tolerance of
    30 per coordinate, as judged by ``adjacent1d``.

    Args:
        rects (list): rectangles of the form (x, y, w, h)
    Returns:
        bool: True if every corner of the bounding rectangle is matched
    """
    assert isinstance(rects, list)

    all_points = list(chain.from_iterable(map(xywh_to_vecs, rects)))
    bounding = xywh_to_vec_rect(cv2.boundingRect(np.vstack(all_points)))
    vec_rects = [xywh_to_vec_rect(rect) for rect in rects]

    def matches_bounding_rect_corner(rect, x, y):
        # x and y name the attributes holding the corner coordinates, e.g. "xmin" and "ymax".
        pick_corner = attrgetter(x, y)
        bounding_x, bounding_y = pick_corner(bounding)
        rect_x, rect_y = pick_corner(rect)
        if not adjacent1d(bounding_x, rect_x, tolerance=30):
            return False
        return bool(adjacent1d(bounding_y, rect_y, tolerance=30))

    corners = (("xmin", "ymin"), ("xmin", "ymax"), ("xmax", "ymax"), ("xmax", "ymin"))
    # All four corners are evaluated before the results are combined.
    corner_is_matched = [
        any(matches_bounding_rect_corner(rect, x_attr, y_attr) for rect in vec_rects)
        for x_attr, y_attr in corners
    ]
    return all(corner_is_matched)
def find_table_layout_boxes(image: np.array):
    """Return the layout boxes of ``image`` whose area is at least 100000.

    Args:
        image (np.array): image that is handed to ``parse_layout``
    Returns:
        list: the boxes returned by ``parse_layout``, each unpackable as
            (x, y, w, h), for which ``w * h >= 100000``
    """

    def covers_enough_area(layout_box):
        _, _, width, height = layout_box
        return width * height >= 100000

    return list(filter(covers_enough_area, parse_layout(image)))
def preprocess(image: np.array):
    """Turn a page image into an inverted binary image.

    Images with more than two dimensions are converted to grayscale first.
    The image is then thresholded at 195 and inverted, so that pixels with a
    value of at most 195 end up as 255 and all brighter pixels as 0.

    Args:
        image (np.array): page image as array
    Returns:
        np.array: the inverted binary image
    """
    if len(image.shape) > 2:
        # NOTE(review): the conversion code assumes BGR channel order - confirm for arrays created from PIL images.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
    return np.invert(binary)
def parse_table(image: np.array, show=False):
    """Runs the full table parsing process.

    Args:
        image (np.array): single PDF page, opened as PIL.Image object and converted to a numpy array
        show (bool): currently unused; kept so that existing callers keep working
    Returns:
        list: list of rectangles of the form [x, y, w, h] corresponding to table cells;
            empty if no connected component is large enough to be a cell
    """

    def is_cell_sized(stat):
        # Each stats row holds left, top, width, height and area of one connected component.
        _, _, w, h, area = stat
        return area > 2000 and w > 35 and h > 25

    image = preprocess(image)
    # NOTE(review): the layout boxes are not used below; the call is only kept because
    # parse_layout may have effects (e.g. visual logging) that are not visible here - confirm and remove.
    find_table_layout_boxes(image)
    image = isolate_vertical_and_horizontal_components(image)

    # The components of the inverted line image are the regions enclosed by the lines.
    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)
    cell_stats = [stat for stat in stats if is_cell_sized(stat)]
    if not cell_stats:
        # np.vstack raises a ValueError for an empty sequence, e.g. for small cropped images.
        return []

    # Drop the area column and skip the first two remaining components
    # (presumably the background and the page-sized component - TODO confirm).
    rects = np.vstack(cell_stats)[2:, :-1]
    return [list(rect) for rect in rects]
def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False):
    """Parse the tables on one page of a PDF and draw the found cells onto the page image.

    The annotated page is handed to the visual logger and, if requested, displayed.

    Args:
        pdf_path: path of the PDF file
        page_index (int): zero-based index of the page to process
        deskew (bool): if True, the page is passed through ``deskew_histbased`` before parsing
        show (bool): if True, the page is displayed with ``show_mpl`` before and after annotation
    """
    page_number = page_index + 1  # pdf2image counts pages starting from 1
    pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(pages[0])
    if show:
        show_mpl(page)
    if deskew:
        page, _ = deskew_histbased(page)

    cell_rects = parse_table(page)
    page = draw_rectangles(page, cell_rects, annotate=True)
    vizlogger.debug(page, "tables15_final_output.png")
    if show:
        show_mpl(page)
def tables_in_image(cropped_image):
    """Report whether ``parse_table`` finds table cells in an image.

    Args:
        cropped_image: image that is handed to ``parse_table``
    Returns:
        tuple: ``(True, rects)`` with the cell rectangles if any were found,
            ``(False, None)`` otherwise
    """
    table_rects = parse_table(cropped_image)
    if not table_rects:
        return False, None
    return True, table_rects