cv-analysis-service/test/fixtures/table_parsing.py
Matthias Bisping 619f67f1fd Refactoring
Various
2023-01-09 16:51:58 +01:00

261 lines
7.7 KiB
Python

import json
from os.path import join
import cv2
import pytest
from dvc.repo import Repo
from funcy import first
from loguru import logger
from cv_analysis.config import get_config
from cv_analysis.locations import REPO_ROOT_PATH, TEST_DATA_DVC
from cv_analysis.utils.drawing import draw_rectangles
from cv_analysis.utils.input import open_analysis_input_file
from test.fixtures.figure_detection import paste_text
# Configuration object obtained once at import time via get_config(); the
# fixtures in this module read its `test_data_dir` attribute to locate files.
CV_CONFIG = get_config()
@pytest.fixture
def client_page_with_table(test_file_index, dvc_test_data):
    """Return the first item yielded for the PNG test file with the given index.

    The file ``test<test_file_index>.png`` is looked up in
    ``CV_CONFIG.test_data_dir`` and opened with ``open_analysis_input_file``.

    ``dvc_test_data`` is requested but never read here; presumably it is listed
    only so the data pull runs before the file is opened.
    """
    file_name = f"test{test_file_index}.png"
    pages = open_analysis_input_file(join(CV_CONFIG.test_data_dir, file_name))
    return first(pages)
@pytest.fixture(scope="session")
def dvc_test_data():
    """Pull the DVC target ``TEST_DATA_DVC`` from the repository at ``REPO_ROOT_PATH``.

    Session-scoped, so the pull happens at most once per test session.
    The fixture yields no value; it is used for its side effect only.
    """
    # noinspection PyCallingNonCallable
    logger.info("Pulling data with DVC...")
    repo = Repo(REPO_ROOT_PATH)
    repo.pull(targets=[str(TEST_DATA_DVC)])
    logger.info("Finished pulling data.")
@pytest.fixture
def expected_table_annotation(test_file_index):
    """Load the JSON annotation stored next to the test image of the given index.

    Reads ``test<test_file_index>.json`` from ``CV_CONFIG.test_data_dir`` and
    returns the decoded JSON document.
    """
    json_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.json")
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which differs between operating systems.
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def page_with_table(background, table_shape, table_style, n_tables, line_thickness, line_type):
    """Draw one table, or two when ``n_tables == 2``, onto ``background``.

    The first table is anchored at (100, 100); the optional second one at
    (200, 2000). Every table is drawn with ``draw_table`` using the same
    shape, style, line thickness and line type.
    """
    anchors = [(100, 100)]
    if n_tables == 2:
        anchors.append((200, 2000))

    canvas = background
    for anchor in anchors:
        canvas = draw_table(canvas, anchor, table_shape, table_style, line_thickness, line_type)
    return canvas
@pytest.fixture
def page_with_patchy_table(page_with_table, background_color):
    """Overdraw the table page with a sparse grid of background-coloured lines.

    Vertical lines (thickness 2) are drawn every 325 px and horizontal lines
    (thickness 1) every 515 px across a fixed 2480 x 3508 area, all
    anti-aliased. Presumably this punches gaps into the table's own lines,
    which is what makes the table "patchy".
    """
    canvas = page_with_table
    width, height = 2480, 3508
    # The same scalar is used for all three colour channels.
    patch_color = (background_color,) * 3

    for column in range(0, width, 325):
        canvas = cv2.line(canvas, (column, 0), (column, height), patch_color, 2, cv2.LINE_AA)

    for row in range(0, height, 515):
        canvas = cv2.line(canvas, (0, row), (width, row), patch_color, 1, cv2.LINE_AA)

    return canvas
@pytest.fixture
def page_with_table_and_text(page_with_table):
    """Return the table page after passing it through ``paste_text``.

    ``paste_text`` is imported from the figure-detection fixtures and is
    called with fixed positional arguments; their meaning is defined there.
    """
    text_args = ((50, 1500), 1, cv2.FONT_HERSHEY_COMPLEX, 1700)
    return paste_text(page_with_table, *text_args)
@pytest.fixture
def expected_gold_page_with_table(page_with_table, n_tables):
    """Return the expected cell rectangles for the synthetic table page.

    Each entry is a 4-tuple of ints. The list holds 40 entries (5 rows of 8
    cells) for one table, plus a further 40 for the second table when
    ``n_tables == 2``. Entries are ordered row by row, left to right within
    a row.

    ``page_with_table`` is requested but not read in the body.
    """

    def cells(column_xs, column_widths, row_ys, row_heights):
        # Row-major cross product of the column and row geometry.
        return [
            (x, y, width, height)
            for y, height in zip(row_ys, row_heights)
            for x, width in zip(column_xs, column_widths)
        ]

    result = cells(
        column_xs=(103, 291, 479, 667, 855, 1043, 1231, 1419),
        column_widths=(185, 185, 185, 185, 185, 185, 185, 181),
        row_ys=(103, 304, 505, 706, 907),
        row_heights=(198, 198, 198, 198, 193),
    )

    if n_tables == 2:
        result = result + cells(
            column_xs=(203, 390, 578, 766, 954, 1142, 1330, 1518),
            column_widths=(186, 187, 187, 187, 187, 187, 187, 182),
            row_ys=(2003, 2203, 2404, 2605, 2806),
            row_heights=(199, 200, 200, 200, 194),
        )

    return result
def draw_table(page, table_position, table_shape, table_style, line_thickness, line_type):
    """Draw a table with a fixed 1500 x 1000 bounding box at ``table_position``.

    Grid lines are drawn by ``draw_grid_lines``, which receives ``table_style``
    as its set of visible lines. If ``table_style`` contains ``"closed"``, the
    bounding box is additionally drawn with ``draw_rectangles`` using
    colour (0, 0, 0).

    Returns the page produced by the last drawing call.
    """
    table_bounds = tuple(table_position) + (1500, 1000)

    with_grid = draw_grid_lines(
        page,
        table_shape,
        table_bounds,
        table_style,
        thickness=line_thickness,
        line_type=line_type,
    )

    if "closed" not in table_style:
        return with_grid
    return draw_rectangles(with_grid, [table_bounds], (0, 0, 0))
def draw_grid_lines(image, table_shape, bbox, visible_lines, thickness, line_type):
    """Draw the interior grid lines of a table onto ``image``.

    Args:
        image: Image handed to ``cv2.line``.
        table_shape: ``(n_rows, n_columns)``.
        bbox: ``(x, y, w, h)`` of the table.
        visible_lines: Container or string; horizontal lines are drawn when it
            contains ``"horizontal"``, vertical lines when it contains
            ``"vertical"``.
        thickness: Line thickness forwarded to ``cv2.line``.
        line_type: Line type forwarded to ``cv2.line``.

    Returns:
        The image returned by the last ``cv2.line`` call, or ``image`` itself
        when nothing was drawn.

    Lines are spaced ``w // n_columns + 1`` apart horizontally and
    ``h // n_rows + 1`` apart vertically. They start one cell in from the
    top/left edge and stop before the bottom/right edge, so the outer border
    is never drawn here.
    """
    left, top, width, height = bbox
    row_count, column_count = table_shape
    column_step = width // column_count + 1
    row_step = height // row_count + 1
    right, bottom = left + width, top + height

    def stroke(canvas, start, end):
        # All grid lines are drawn in colour (0, 0, 0).
        return cv2.line(
            canvas,
            start,
            end,
            color=(0, 0, 0),
            thickness=thickness,
            lineType=line_type,
        )

    if "horizontal" in visible_lines:
        for row_y in range(top + row_step, bottom, row_step):
            image = stroke(image, (left, row_y), (right, row_y))

    if "vertical" in visible_lines:
        for column_x in range(left + column_step, right, column_step):
            image = stroke(image, (column_x, top), (column_x, bottom))

    return image