cv-analysis-service/cv_analysis/table_parsing.py
llocarnini 19fe6965fb added line in display so the visual logger doesn't open too many plots
changes to fig_detection_with_layout.py so tables are getting parsed as well

reusage of adding external contour in table_parsing.py
2022-04-26 11:19:27 +02:00

195 lines
6.6 KiB
Python

from functools import partial
from itertools import chain, starmap
from operator import attrgetter
from os.path import join
import cv2
import numpy as np
from pdf2image import pdf2image
from cv_analysis.utils.display import show_mpl
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d
from cv_analysis.utils.deskew import deskew_histbased
from cv_analysis.utils.filters import is_large_enough
from cv_analysis.utils.visual_logging import vizlogger
from cv_analysis.layout_parsing import parse_layout
def add_external_contours(image, img):
    """Outlines the large external contours of ``img`` on ``image``.

    Args:
        image (np.ndarray): image the rectangles are drawn onto (drawn in place)
        img (np.ndarray): image in which the external contours are searched
    Returns:
        np.ndarray: ``image`` with the bounding boxes of the retained contours drawn in
    """
    outlines, _ = cv2.findContours(img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    # Only contours accepted by the shared size filter (min_area=5000) are drawn.
    keep = partial(is_large_enough, min_area=5000)

    for outline in filter(keep, outlines):
        left, top, width, height = cv2.boundingRect(outline)
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(image, top_left, bottom_right, 255, 1)

    vizlogger.debug(image, "external_contours.png")
    return image
def apply_motion_blur(image: np.ndarray, angle, size=80):
    """Solidifies and slightly extends detected lines.

    A ``size`` x ``size`` kernel holding a single line of ones is rotated by
    ``angle``, normalised to sum to one and convolved with the image.

    Args:
        image (np.ndarray): page image as array
        angle: direction in which to apply blur, 0 or 90
        size (int): kernel size; 80 found empirically to work well
    Returns:
        np.ndarray: the filtered image
    """
    kernel = np.zeros((size, size), dtype=np.float32)
    vizlogger.debug(kernel, "tables08_blur_kernel1.png")

    # Fill the middle row so the un-rotated kernel is one horizontal line.
    middle_row = (size - 1) // 2
    kernel[middle_row, :] = 1.0
    vizlogger.debug(kernel, "tables09_blur_kernel2.png")

    # Rotate the line about the kernel centre to the requested direction.
    centre = size / 2 - 0.5
    rotation = cv2.getRotationMatrix2D((centre, centre), angle, 1.0)
    kernel = cv2.warpAffine(kernel, rotation, (size, size))
    vizlogger.debug(kernel, "tables10_blur_kernel3.png")

    # Normalise so the kernel weights sum to one.
    scale = 1.0 / np.sum(kernel)
    kernel = kernel * scale
    vizlogger.debug(kernel, "tables11_blur_kernel4.png")

    return cv2.filter2D(image, -1, kernel)
def isolate_vertical_and_horizontal_components(img_bin):
    """Identifies and reinforces horizontal and vertical lines in a binary image.

    The image is opened with long 1-pixel-wide kernels to keep only horizontal
    and vertical strokes, each direction is dilated and motion-blurred along its
    own axis, and the combined result is thresholded back to a binary image.

    Args:
        img_bin (np.ndarray): array corresponding to single binarized page image
    Returns:
        np.ndarray: binary image containing the reinforced line components
    """
    line_min_width = 48
    open_kernel_h = np.ones((1, line_min_width), np.uint8)
    open_kernel_v = np.ones((line_min_width, 1), np.uint8)

    # Morphological opening keeps only strokes at least line_min_width long.
    horizontal = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, open_kernel_h)
    vizlogger.debug(horizontal, "tables01_isolate01_img_bin_h.png")
    vertical = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, open_kernel_v)
    vizlogger.debug(vertical | horizontal, "tables02_isolate02_img_bin_v.png")

    # Stretch the surviving lines along their own direction.
    dilate_kernel_h = np.ones((1, 30), np.uint8)
    dilate_kernel_v = np.ones((30, 1), np.uint8)
    horizontal = cv2.dilate(horizontal, dilate_kernel_h, iterations=2)
    vizlogger.debug(horizontal, "tables03_isolate03_dilate_h.png")
    vertical = cv2.dilate(vertical, dilate_kernel_v, iterations=2)
    vizlogger.debug(vertical | horizontal, "tables04_isolate04_dilate_v.png")

    horizontal = apply_motion_blur(horizontal, 0)
    vizlogger.debug(horizontal, "tables09_isolate05_blur_h.png")
    vertical = apply_motion_blur(vertical, 90)
    vizlogger.debug(vertical | horizontal, "tables10_isolate06_blur_v.png")

    combined = horizontal | vertical
    vizlogger.debug(combined, "tables11_isolate07_final.png")

    # Blurring produces intermediate values; cut back to a 0/255 image.
    _, combined = cv2.threshold(combined, 120, 255, cv2.THRESH_BINARY)
    vizlogger.debug(combined, "tables10_isolate12_threshold.png")

    combined = cv2.dilate(combined, np.ones((1, 1), np.uint8), iterations=1)
    vizlogger.debug(combined, "tables11_isolate13_dilate.png")

    return combined
def disconnect_non_existing_cells(img_bin, bounding_rects):
    """Draws a black, 5 pixel thick outline around every given box.

    Args:
        img_bin (np.ndarray): image the outlines are drawn onto
        bounding_rects: iterable of boxes of the form (x, y, w, h)
    Returns:
        np.ndarray: the image with all outlines drawn
    """
    black = (0, 0, 0)
    for left, top, width, height in bounding_rects:
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        img_bin = cv2.rectangle(img_bin, top_left, bottom_right, black, 5)
    return img_bin
def has_table_shape(rects):
    """Checks whether the rectangles cover all four corners of their joint bounding box.

    For each corner of the rectangle enclosing all ``rects``, at least one
    rectangle must have the same corner within the tolerance used by
    ``adjacent1d`` (30).

    Args:
        rects (list): rectangles of the form (x, y, w, h)
    Returns:
        bool: True if every corner of the enclosing rectangle is matched
    """
    assert isinstance(rects, list)

    corner_points = list(chain.from_iterable(map(xywh_to_vecs, rects)))
    enclosing = xywh_to_vec_rect(cv2.boundingRect(np.vstack(corner_points)))
    vec_rects = [xywh_to_vec_rect(rect) for rect in rects]

    def shares_corner(rect, x_attr, y_attr):
        # Compare the x and the y coordinate of the chosen corner separately.
        pick = attrgetter(x_attr, y_attr)
        pairs = zip(pick(enclosing), pick(rect))
        return all(adjacent1d(outer, inner, tolerance=30) for outer, inner in pairs)

    corners = (
        ("xmin", "ymin"),
        ("xmin", "ymax"),
        ("xmax", "ymax"),
        ("xmax", "ymin"),
    )
    # The list is built in full so every corner is evaluated before combining.
    corner_hits = [any(shares_corner(rect, x_attr, y_attr) for rect in vec_rects) for x_attr, y_attr in corners]
    return all(corner_hits)
def find_table_layout_boxes(image: np.ndarray):
    """Returns the layout boxes of the image whose area is at least 100000.

    Args:
        image (np.ndarray): image handed to ``parse_layout``
    Returns:
        list: the boxes (x, y, w, h) from ``parse_layout`` that are large enough
    """

    def covers_enough_area(box):
        _, _, width, height = box
        return width * height >= 100000

    return list(filter(covers_enough_area, parse_layout(image)))
def preprocess(image: np.ndarray):
    """Binarizes and inverts a page image.

    Images with more than two dimensions are first converted from BGR to
    grayscale. The result is thresholded at 195 (binary, max value 255) and
    then bitwise inverted.

    Args:
        image (np.ndarray): page image
    Returns:
        np.ndarray: inverted binary image
    """
    if len(image.shape) > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
    return ~binary
def parse_table(image: np.ndarray, show=False):
    """Runs the full table parsing process.

    Args:
        image (np.ndarray): single PDF page, opened as PIL.Image object and converted to a numpy array
        show (bool): currently unused; kept so existing callers keep working
    Returns:
        list: list of rectangles (x, y, w, h) corresponding to table cells;
            empty if no connected component is large enough to be a cell
    """

    # Named differently from the imported `is_large_enough` helper to avoid shadowing it.
    def is_cell_sized(stat):
        # stat is one row of the connected-component stats; the last entry is the area.
        _, _, w, h, area = stat
        return area > 2000 and w > 35 and h > 25

    image = preprocess(image)

    # Layout boxes are detected on the binarized page, before the lines are isolated.
    table_layout_boxes = find_table_layout_boxes(image)

    image = isolate_vertical_and_horizontal_components(image)
    image = disconnect_non_existing_cells(image, table_layout_boxes)
    vizlogger.debug(image, "tables12_isolate14_disconnect.png")
    image = add_external_contours(image, image)
    vizlogger.debug(image, "external_contours_added.png")

    # Cells are the connected regions of the inverted line image.
    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)

    cell_stats = [stat for stat in stats if is_cell_sized(stat)]
    if not cell_stats:
        # np.vstack raises ValueError on an empty sequence; report "no cells" instead.
        return []

    # Drop the area column, then skip the first two remaining components
    # (presumably background and page-spanning regions — TODO confirm).
    rects = np.vstack(cell_stats)[:, :-1][2:]
    return list(rects)
def annotate_tables_in_pdf(pdf_path, page_index=0, deskew=False, show=False):
    """Parses the tables of one PDF page and logs the annotated page image.

    Args:
        pdf_path: path of the PDF file
        page_index (int): index of the page; ``page_index + 1`` is passed to pdf2image
        deskew (bool): if True, the page is run through ``deskew_histbased`` before parsing
        show (bool): if True, the rendered page is displayed via ``show_mpl``
    """
    page_number = page_index + 1
    rendered_pages = pdf2image.convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    page = np.array(rendered_pages[0])

    if show:
        show_mpl(page)
    if deskew:
        page, _ = deskew_histbased(page)

    cell_rects = parse_table(page)
    annotated = draw_rectangles(page, cell_rects, annotate=True)
    vizlogger.debug(annotated, "tables15_final_output.png")
def tables_in_image(cropped_image):
    """Reports whether ``parse_table`` finds any table cells in the image.

    Args:
        cropped_image (np.ndarray): image passed on to ``parse_table``
    Returns:
        tuple: ``(True, rects)`` if at least one cell rectangle was found,
            otherwise ``(False, None)``
    """
    cell_rects = parse_table(cropped_image)
    if not cell_rects:
        return False, None
    return True, cell_rects