Merge in RR/cv-analysis from add_table_parsing_fixtures to master
Squashed commit of the following:
commit cfc89b421b61082c8e92e1971c9d0bf4490fa07e
Merge: a7ecb05 73c66a8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jul 11 12:19:01 2022 +0200
Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into add_table_parsing_fixtures
commit a7ecb05b7d8327f0c7429180f63a380b61b06bc3
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jul 11 12:02:07 2022 +0200
refactor
commit 466f217e5a9ee5c54fd38c6acd28d54fc38ff9bb
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Mon Jul 11 10:24:14 2022 +0200
deleted unused imports and unused lines of code
commit c58955c8658d0631cdd1c24c8556d399e3fd9990
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Mon Jul 11 10:16:01 2022 +0200
black reformatted files
commit f8bcb10a00ff7f0da49b80c1609b17997411985a
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Tue Jul 5 15:15:00 2022 +0200
reformat files
commit 432e8a569fd70bd0745ce0549c2bfd2f2e907763
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Tue Jul 5 15:08:22 2022 +0200
added better test for generic pages with table WIP as thicker lines create inconsistent results.
added test for patchy tables which does not work yet
commit 2aac9ebf5c76bd963f8c136fe5dd4c2d7681b469
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Mon Jul 4 16:56:29 2022 +0200
added new fixtures for table_parsing_test.py
commit 37606cac0301b13e99be2c16d95867477f29e7c4
Author: llocarnini <lillian.locarnini@iqser.com>
Date: Fri Jul 1 16:02:44 2022 +0200
added separate file for table parsing fixtures, where fixtures for generic tables were added. WIP tests for generic table fixtures
165 lines
5.1 KiB
Python
165 lines
5.1 KiB
Python
from functools import partial
|
|
from itertools import chain, starmap
|
|
from operator import attrgetter
|
|
import cv2
|
|
import numpy as np
|
|
|
|
from funcy import lmap
|
|
|
|
from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d
|
|
|
|
from cv_analysis.utils.structures import Rectangle
|
|
from cv_analysis.utils.visual_logging import vizlogger
|
|
from cv_analysis.layout_parsing import parse_layout
|
|
|
|
|
|
def add_external_contours(image, image_h_w_lines_only):
    """Draw the bounding box of every external contour onto ``image``.

    Args:
        image: array the rectangles are drawn onto; ``cv2.rectangle`` draws
            directly into it, and the same object is returned
        image_h_w_lines_only: image in which the external contours are
            searched (only the outermost contours are retrieved)

    Returns:
        ``image`` with one rectangle (value 255, thickness 1) per contour.
    """
    outer_contours, _hierarchy = cv2.findContours(
        image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
    )

    for left, top, width, height in map(cv2.boundingRect, outer_contours):
        top_left = (left, top)
        bottom_right = (left + width, top + height)
        cv2.rectangle(image, top_left, bottom_right, 255, 1)

    return image
|
|
|
|
|
|
def apply_motion_blur(image: np.array, angle, size=80):
    """Blur ``image`` along a single direction to solidify and extend lines.

    A ``size`` x ``size`` kernel holding one row of ones is rotated by
    ``angle`` degrees around its centre, scaled so that its entries sum to
    one, and then used to filter the image. Each intermediate kernel is
    handed to ``vizlogger.debug``.

    Args:
        image (np.array): page image as array
        angle: direction in which to apply the blur, 0 or 90
        size (int): kernel size; 80 found empirically to work well

    Returns:
        np.array: the filtered image (same depth as the input, because the
        filter is called with ddepth ``-1``)
    """
    kernel = np.zeros((size, size), dtype=np.float32)
    vizlogger.debug(kernel, "tables08_blur_kernel1.png")

    # A single line of ones: the row index (size - 1) // 2 is the middle row.
    middle_row = (size - 1) // 2
    kernel[middle_row, :] = np.ones(size, dtype=np.float32)
    vizlogger.debug(kernel, "tables09_blur_kernel2.png")

    # Rotate the line around the kernel centre to obtain the blur direction.
    centre = (size / 2 - 0.5, size / 2 - 0.5)
    rotation = cv2.getRotationMatrix2D(centre, angle, 1.0)
    kernel = cv2.warpAffine(kernel, rotation, (size, size))
    vizlogger.debug(kernel, "tables10_blur_kernel3.png")

    # Scale the kernel so that its entries sum to one.
    kernel = kernel * (1.0 / np.sum(kernel))
    vizlogger.debug(kernel, "tables11_blur_kernel4.png")

    return cv2.filter2D(image, -1, kernel)
|
|
|
|
|
|
def isolate_vertical_and_horizontal_components(img_bin):
    """Identifies and reinforces horizontal and vertical lines in a binary image.

    Steps: morphological opening with 48 px wide / tall line kernels, two
    dilation passes with 30 px line kernels, a directional blur per
    orientation, a threshold at 120, and finally the bounding boxes of the
    raw line components are drawn on top.

    Args:
        img_bin (np.array): array corresponding to single binarized page image

    Returns:
        np.array: image containing the reinforced lines plus the bounding
        boxes of the raw line components
    """
    opening_length = 48
    open_horizontal = np.ones((1, opening_length), np.uint8)
    open_vertical = np.ones((opening_length, 1), np.uint8)

    horizontal = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, open_horizontal)
    vertical = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, open_vertical)

    # Keep the un-extended lines; their contours are added at the very end.
    raw_lines = vertical | horizontal

    grow_horizontal = np.ones((1, 30), np.uint8)
    grow_vertical = np.ones((30, 1), np.uint8)
    horizontal = cv2.dilate(horizontal, grow_horizontal, iterations=2)
    vertical = cv2.dilate(vertical, grow_vertical, iterations=2)

    horizontal = apply_motion_blur(horizontal, 0)
    vertical = apply_motion_blur(vertical, 90)

    _, extended = cv2.threshold(
        horizontal | vertical, 120, 255, cv2.THRESH_BINARY
    )

    # NOTE(review): the kernel is 1x1, so this dilation presumably leaves the
    # pixel values untouched and only yields a fresh array - confirm.
    final = cv2.dilate(extended, np.ones((1, 1), np.uint8), iterations=1)

    # Contours come from the raw lines, i.e. from before dilation and blur.
    return add_external_contours(final, raw_lines)
|
|
|
|
|
|
def has_table_shape(rects):
    """Check that each corner of the common bounding box is matched by a rect.

    Args:
        rects (list): boxes of the form (x, y, w, h)

    Returns:
        bool: True if, for every corner of the bounding rectangle around all
        boxes, at least one box has the corresponding corner within a
        tolerance of 30 (as decided by ``adjacent1d``) on both coordinates.
    """
    assert isinstance(rects, list)

    corner_points = [point for box in rects for point in xywh_to_vecs(box)]
    bounding = xywh_to_vec_rect(cv2.boundingRect(np.vstack(corner_points)))

    vec_rects = [xywh_to_vec_rect(box) for box in rects]

    def matches_bounding_rect_corner(rect, x, y):
        # ``x`` / ``y`` are attribute names such as "xmin" / "ymax".
        pick_corner = attrgetter(x, y)
        bound_x, bound_y = pick_corner(bounding)
        rect_x, rect_y = pick_corner(rect)
        coordinate_pairs = ((bound_x, rect_x), (bound_y, rect_y))
        return all(
            adjacent1d(expected, actual, tolerance=30)
            for expected, actual in coordinate_pairs
        )

    corners = (
        ("xmin", "ymin"),
        ("xmin", "ymax"),
        ("xmax", "ymax"),
        ("xmax", "ymin"),
    )
    # Built as a list so that every corner is evaluated, as in a tuple literal.
    corner_is_matched = [
        any(matches_bounding_rect_corner(r, x_name, y_name) for r in vec_rects)
        for x_name, y_name in corners
    ]
    return all(corner_is_matched)
|
|
|
|
|
|
def find_table_layout_boxes(image: np.ndarray):
    """Return the layout boxes of a page that are large enough to hold a table.

    The page layout is parsed once with ``parse_layout``; every box whose
    area (``w * h``) is at least 100000 is converted to a ``Rectangle``.
    Smaller boxes are dropped, so the result never contains ``None``.

    Args:
        image (np.ndarray): page image as array

    Returns:
        list: ``Rectangle`` objects for the sufficiently large layout boxes
        (empty if there are none)
    """
    min_area = 100000

    def is_large_enough(box):
        _, _, w, h = box
        return w * h >= min_area

    return [
        Rectangle.from_xywh(box)
        for box in parse_layout(image)
        if is_large_enough(box)
    ]
|
|
|
|
|
|
def preprocess(image: np.array):
    """Binarize a page image and invert it.

    Images with more than two dimensions are first converted to grayscale
    (interpreting the channels as BGR). The result is thresholded at 195
    with ``cv2.THRESH_BINARY`` and then bitwise inverted.

    Args:
        image (np.array): page image as array

    Returns:
        np.array: bitwise inverse of the thresholded image
    """
    if len(image.shape) > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    _, binarized = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
    return np.invert(binarized)
|
|
|
|
|
|
def turn_connected_components_into_rects(image):
    """Convert connected components of the inverted image into (x, y, w, h) rows.

    Components are computed on ``~image`` with 8-connectivity. Only
    components with area > 2000, width > 35 and height > 25 are kept; of
    those, the first two are discarded and the area column is removed.

    Args:
        image (np.array): image whose inverse is analysed

    Returns:
        np.array: array of shape (n, 4) with one (x, y, w, h) row per kept
        component; shape (0, 4) when fewer than three components pass the
        size filter (including when none pass at all)
    """
    _, _, stats, _ = cv2.connectedComponentsWithStats(
        ~image, connectivity=8, ltype=cv2.CV_32S
    )

    areas = stats[:, cv2.CC_STAT_AREA]
    widths = stats[:, cv2.CC_STAT_WIDTH]
    heights = stats[:, cv2.CC_STAT_HEIGHT]
    large_enough = (areas > 2000) & (widths > 35) & (heights > 25)

    # Boolean indexing yields an empty (0, 5) array when nothing passes the
    # filter, instead of failing the way np.vstack does on an empty list.
    kept = stats[large_enough]

    # Skip the first two surviving rows and drop the trailing area column.
    # NOTE(review): the two skipped rows are presumably the background label
    # and the page-spanning component - confirm against real pages.
    return kept[2:, :-1]
|
|
|
|
|
|
def parse_tables(image: np.array, show=False):
    """Runs the full table parsing process.

    The page is binarized, its horizontal and vertical lines are isolated
    and reinforced, and the connected components of the result are turned
    into rectangles.

    Args:
        image (np.array): single PDF page, opened as PIL.Image object and converted to a numpy array
        show: not used by this function

    Returns:
        list: list of rectangles corresponding to table cells
    """
    binarized = preprocess(image)
    line_image = isolate_vertical_and_horizontal_components(binarized)
    cell_boxes = turn_connected_components_into_rects(line_image)

    return [Rectangle.from_xywh(box) for box in cell_boxes]
|