cv-analysis-service/test/fixtures/table_parsing.py
Julius Unverfehrt 0a11471191 feat(opentel,dynaconf): adapt new pyinfra
This commit also disables a broken test that cannot be fixed. There are
also many scripts that did not work anyway (and are not needed in my
eyes) that were not updated. The scripts that are needed to run the
service processing locally still work.
2024-02-08 11:19:33 +01:00

261 lines
7.7 KiB
Python

import json
from os.path import join
import cv2
import pytest
from dvc.repo import Repo
from funcy import first
from kn_utils.logging import logger
from cv_analysis.config import get_config
from cv_analysis.locations import REPO_ROOT_PATH, TEST_DATA_DVC
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.open_pdf import open_pdf
from test.fixtures.figure_detection import paste_text
# Project configuration, loaded once when this module is imported; the
# fixtures below read ``settings.paths.test_data_dir`` from it.
settings = get_config()
@pytest.fixture
def client_page_with_table(test_file_index, dvc_test_data):
    """Return the first item ``open_pdf`` yields for ``test{test_file_index}.png``.

    The file is looked up in the configured test data directory.
    ``dvc_test_data`` is requested only so that fixture runs before the file
    is read; its value is not used here.
    """
    file_name = f"test{test_file_index}.png"
    pages = open_pdf(join(settings.paths.test_data_dir, file_name))
    return first(pages)
@pytest.fixture(scope="session")
def dvc_test_data():
    """Pull the DVC target ``TEST_DATA_DVC`` once per test session.

    Logs before and after the pull; returns nothing.
    """
    # noinspection PyCallingNonCallable
    logger.info("Pulling data with DVC...")
    repo = Repo(REPO_ROOT_PATH)
    pull_targets = [str(TEST_DATA_DVC)]
    repo.pull(targets=pull_targets)
    logger.info("Finished pulling data.")
@pytest.fixture
def expected_table_annotation(test_file_index):
    """Load the expected annotation for the test file selected by ``test_file_index``.

    Reads ``test{test_file_index}.json`` from the configured test data
    directory and returns the parsed JSON content.
    """
    json_path = join(settings.paths.test_data_dir, f"test{test_file_index}.json")
    # Decode explicitly as UTF-8 (the standard JSON encoding) so the result
    # does not depend on the platform's locale default.
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def page_with_table(background, table_shape, table_style, n_tables, line_thickness, line_type):
    """Draw one table onto ``background``, or two when ``n_tables == 2``.

    The first table is always drawn at ``(100, 100)``; the second, if
    requested, at ``(200, 2000)``. Both use the same shape, style, line
    thickness and line type.
    """
    positions = [(100, 100)]
    if n_tables == 2:
        positions.append((200, 2000))

    page = background
    for position in positions:
        page = draw_table(page, position, table_shape, table_style, line_thickness, line_type)
    return page
@pytest.fixture
def page_with_patchy_table(page_with_table, background_color):
    """Overdraw ``page_with_table`` with thin lines in the background color.

    Vertical strokes (thickness 2) are drawn every 325 units along x and
    horizontal strokes (thickness 1) every 515 units along y, over a fixed
    2480 x 3508 extent. Presumably this punches gaps into the table lines;
    verify against the tests that use this fixture.
    """
    width, height = 2480, 3508
    # Same value on all three channels.
    patch_color = (background_color,) * 3

    page = page_with_table
    for x in range(0, width, 325):
        page = cv2.line(page, (x, 0), (x, height), patch_color, 2, cv2.LINE_AA)
    for y in range(0, height, 515):
        page = cv2.line(page, (0, y), (width, y), patch_color, 1, cv2.LINE_AA)
    return page
@pytest.fixture
def page_with_table_and_text(page_with_table):
    """Return the result of ``paste_text`` applied to ``page_with_table``."""
    # Forwarded positionally and unchanged; the meaning of each value is
    # defined by ``paste_text`` in test.fixtures.figure_detection.
    text_args = ((50, 1500), 1, cv2.FONT_HERSHEY_COMPLEX, 1700)
    return paste_text(page_with_table, *text_args)
@pytest.fixture
def expected_gold_page_with_table(page_with_table, n_tables):
    """Return the expected cell rectangles for the synthetic table page.

    The result is a list of 4-tuples of ints, listed row by row for a grid of
    5 rows by 8 columns per table. The tuples appear to be
    ``(x, y, width, height)``; confirm against the consumer of this fixture.
    When ``n_tables == 2`` the cells of the second table follow those of the
    first. ``page_with_table`` is requested as a fixture dependency but is not
    read here.
    """

    def grid(columns, rows):
        # columns: (x, width) pairs; rows: (y, height) pairs.
        # Rows vary slowest, so the output is ordered row by row.
        return [(x, y, w, h) for y, h in rows for x, w in columns]

    first_columns = [
        (103, 185),
        (291, 185),
        (479, 185),
        (667, 185),
        (855, 185),
        (1043, 185),
        (1231, 185),
        (1419, 181),
    ]
    first_rows = [(103, 198), (304, 198), (505, 198), (706, 198), (907, 193)]
    result = grid(first_columns, first_rows)

    if n_tables == 2:
        second_columns = [
            (203, 186),
            (390, 187),
            (578, 187),
            (766, 187),
            (954, 187),
            (1142, 187),
            (1330, 187),
            (1518, 182),
        ]
        second_rows = [(2003, 199), (2203, 200), (2404, 200), (2605, 200), (2806, 194)]
        result = result + grid(second_columns, second_rows)

    return result
def draw_table(page, table_position, table_shape, table_style, line_thickness, line_type):
    """Draw a table with a fixed 1500 x 1000 bounding box onto ``page``.

    Args:
        page: Image to draw on.
        table_position: Position of the table; it forms the first two
            entries of the bounding box.
        table_shape: ``(n_rows, n_columns)`` of the grid.
        table_style: Container checked with ``in``. It is passed on to
            ``draw_grid_lines`` to select the inner lines; if it contains
            ``"closed"``, the bounding box is also drawn via
            ``draw_rectangles``.
        line_thickness: Thickness of the inner grid lines.
        line_type: Line type of the inner grid lines.

    Returns:
        The page with the table drawn on it.
    """
    table_bbox = tuple(table_position) + (1500, 1000)
    page = draw_grid_lines(
        page,
        table_shape,
        table_bbox,
        table_style,
        thickness=line_thickness,
        line_type=line_type,
    )
    if "closed" not in table_style:
        return page
    return draw_rectangles(page, [table_bbox], (0, 0, 0))
def draw_grid_lines(image, table_shape, bbox, visible_lines, thickness, line_type):
    """Draw the inner grid lines of a table in black onto ``image``.

    Args:
        image: Image handed to ``cv2.line``.
        table_shape: ``(n_rows, n_columns)`` of the grid.
        bbox: ``(x, y, w, h)`` of the table.
        visible_lines: Container checked with ``in``; ``"horizontal"`` and
            ``"vertical"`` enable the respective set of lines.
        thickness: Line thickness passed to ``cv2.line``.
        line_type: Line type passed to ``cv2.line``.

    Returns:
        The image after drawing; returned unchanged if neither direction is
        enabled.
    """
    left, top, width, height = bbox
    n_rows, n_columns = table_shape

    # Floor division plus one; lines start one step in from the top/left
    # edge and stop before the bottom/right edge (range end is exclusive).
    column_step = width // n_columns + 1
    row_step = height // n_rows + 1
    right, bottom = left + width, top + height
    black = (0, 0, 0)

    if "horizontal" in visible_lines:
        for row_y in range(top + row_step, bottom, row_step):
            image = cv2.line(
                image, (left, row_y), (right, row_y), color=black, thickness=thickness, lineType=line_type
            )

    if "vertical" in visible_lines:
        for column_x in range(left + column_step, right, column_step):
            image = cv2.line(
                image, (column_x, top), (column_x, bottom), color=black, thickness=thickness, lineType=line_type
            )

    return image