From 482673f92740638c3b2053ccea629ec8a70ca800 Mon Sep 17 00:00:00 2001
From: Isaac Riley
Date: Fri, 8 Mar 2024 10:42:54 +0100
Subject: [PATCH] Table lines

---
Review notes (text between the "---" line and the diffstat is ignored by
`git am`):

* The line structure of this patch was restored from a whitespace-collapsed
  copy. Indentation and blank context lines were inferred from the hunk line
  counts and may need adjusting if the patch does not apply cleanly.
* test/unit_tests/server_pipeline_test.py: the "table_cells" case now imports
  and returns table_parsing_cells_formatter. After this change
  table_parsing_formatter returns "tableLines", so the "tableCells"
  expectation could not match it.
* src/cv_analysis/table_parsing.py: the return annotations of parse_lines and
  format_quad now say dict[str, float], which is what the code returns, and
  are_collinear receives the int `index` instead of bool(is_horizontal).
* The diffstat was updated for the extra changed line in the test file; the
  blob ids on the `index` lines were not recomputed.

 .gitignore                              |   1 +
 src/cv_analysis/server/pipeline.py      |  54 ++++++---
 src/cv_analysis/table_parsing.py        | 139 +++++++++++++++++++++---
 test/unit_tests/server_pipeline_test.py |  41 +++++--
 4 files changed, 196 insertions(+), 39 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1e1e7e2..9c552ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ venv/
 .DS_Store
 
 # Project folders
+scratch/
 *.vscode/
 .idea
 *_app
diff --git a/src/cv_analysis/server/pipeline.py b/src/cv_analysis/server/pipeline.py
index 5d36b33..6d57911 100644
--- a/src/cv_analysis/server/pipeline.py
+++ b/src/cv_analysis/server/pipeline.py
@@ -2,31 +2,42 @@ import sys
 from dataclasses import asdict
 from operator import truth
 
-from funcy import lmap, flatten
+from funcy import flatten, lmap
+from pdf2img.conversion import convert_pages_to_images
+from pdf2img.default_objects.image import ImageInfo, ImagePlus
+from pdf2img.default_objects.rectangle import RectanglePlus
 
 from cv_analysis.figure_detection.figure_detection import detect_figures
-from cv_analysis.table_parsing import parse_tables
+from cv_analysis.table_parsing import parse_lines, parse_tables
 from cv_analysis.utils.structures import Rectangle
-from pdf2img.conversion import convert_pages_to_images
-from pdf2img.default_objects.image import ImagePlus, ImageInfo
-from pdf2img.default_objects.rectangle import RectanglePlus
 
 
 def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=True):
     if operation == "table":
         return make_analysis_pipeline(
-            parse_tables,
+            parse_lines,
             table_parsing_formatter,
             dpi=200,
             skip_pages_without_images=table_parsing_skip_pages_without_images,
         )
-    elif operation == "figure":
-        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
+    if operation == "table_cells":
+        return make_analysis_pipeline(
+            parse_tables,
+            table_parsing_cells_formatter,
+            dpi=200,
+            skip_pages_without_images=table_parsing_skip_pages_without_images,
+        )
+    if operation == "figure":
+        return make_analysis_pipeline(
+            detect_figures, figure_detection_formatter, dpi=200
+        )
     else:
         raise
 
 
-def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
+def make_analysis_pipeline(
+    analysis_fn, formatter, dpi, skip_pages_without_images=False
+):
     def analyse_pipeline(pdf: bytes, index=None):
         def parse_page(page: ImagePlus):
             image = page.asarray()
@@ -36,7 +47,12 @@ def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_image
             infos = formatter(rects, page, dpi)
             return infos
 
-        pages = convert_pages_to_images(pdf, index=index, dpi=dpi, skip_pages_without_images=skip_pages_without_images)
+        pages = convert_pages_to_images(
+            pdf,
+            index=index,
+            dpi=dpi,
+            skip_pages_without_images=skip_pages_without_images,
+        )
         results = map(parse_page, pages)
 
         yield from flatten(filter(truth, results))
@@ -44,9 +60,15 @@ def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_image
     return analyse_pipeline
 
 
-def table_parsing_formatter(rects, page: ImagePlus, dpi):
+def table_parsing_formatter(lines: list[dict[str, float]], page: ImagePlus, dpi):
+    return {"pageInfo": page.asdict(natural_index=True), "tableLines": lines}
+
+
+def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
     def format_rect(rect: Rectangle):
-        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
+        rect_plus = RectanglePlus.from_pixels(
+            *rect.xyxy(), page.info, alpha=False, dpi=dpi
+        )
         return rect_plus.asdict(derotate=True)
 
     bboxes = lmap(format_rect, rects)
@@ -56,7 +78,11 @@ def table_parsing_formatter(rects, page: ImagePlus, dpi):
 
 def figure_detection_formatter(rects, page, dpi):
     def format_rect(rect: Rectangle):
-        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
-        return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))
+        rect_plus = RectanglePlus.from_pixels(
+            *rect.xyxy(), page.info, alpha=False, dpi=dpi
+        )
+        return asdict(
+            ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha)
+        )
 
     return lmap(format_rect, rects)
diff --git a/src/cv_analysis/table_parsing.py b/src/cv_analysis/table_parsing.py
index 37c50e6..9e35f99 100644
--- a/src/cv_analysis/table_parsing.py
+++ b/src/cv_analysis/table_parsing.py
@@ -1,20 +1,19 @@
-from functools import partial
-from itertools import chain, starmap
-from operator import attrgetter
-
 import cv2
 import numpy as np
-from funcy import lmap, lfilter
+from funcy import lfilter, lmap
 
 from cv_analysis.layout_parsing import parse_layout
-from cv_analysis.utils.postprocessing import remove_isolated  # xywh_to_vecs, xywh_to_vec_rect, adjacent1d
+from cv_analysis.utils.postprocessing import (
+    remove_isolated,
+)  # xywh_to_vecs, xywh_to_vec_rect, adjacent1d
 from cv_analysis.utils.structures import Rectangle
 from cv_analysis.utils.visual_logging import vizlogger
 
 
 def add_external_contours(image, image_h_w_lines_only):
-
-    contours, _ = cv2.findContours(image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+    contours, _ = cv2.findContours(
+        image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
+    )
     for cnt in contours:
         x, y, w, h = cv2.boundingRect(cnt)
         cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
@@ -78,8 +77,10 @@ def isolate_vertical_and_horizontal_components(img_bin):
 
     img_bin_extended = img_bin_h | img_bin_v
 
-    th1, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY)
-    img_bin_final = cv2.dilate(img_bin_extended, np.ones((1, 1), np.uint8), iterations=1)
+    _, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY)
+    img_bin_final = cv2.dilate(
+        img_bin_extended, np.ones((1, 1), np.uint8), iterations=1
+    )
     # add contours before lines are extended by blurring
     img_bin_final = add_external_contours(img_bin_final, img_lines_raw)
 
@@ -88,7 +89,7 @@ def isolate_vertical_and_horizontal_components(img_bin):
 
 def find_table_layout_boxes(image: np.array):
     def is_large_enough(box):
-        (x, y, w, h) = box
+        (_, _, w, h) = box
         if w * h >= 100000:
             return Rectangle.from_xywh(box)
 
@@ -108,7 +109,9 @@ def turn_connected_components_into_rects(image: np.array):
         x1, y1, w, h, area = stat
         return area > 2000 and w > 35 and h > 25
 
-    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)
+    _, _, stats, _ = cv2.connectedComponentsWithStats(
+        ~image, connectivity=8, ltype=cv2.CV_32S
+    )
 
     stats = lfilter(is_large_enough, stats)
     if stats:
@@ -130,10 +133,114 @@ def parse_tables(image: np.array, show=False):
     image = preprocess(image)
     image = isolate_vertical_and_horizontal_components(image)
     rects = turn_connected_components_into_rects(image)
-    #print(rects, "\n\n")
     rects = list(map(Rectangle.from_xywh, rects))
-    #print(rects, "\n\n")
     rects = remove_isolated(rects)
-    #print(rects, "\n\n")
-    
+
     return rects
+
+
+# def make_lines(image: np.array, horizontal=True, kernel_length=40)
+
+
+def detect_horizontal_lines(image_bin: np.array, kernel_length=40):
+    line_min_width = 48
+    kernel_h = np.ones((1, line_min_width), np.uint8)
+    img_bin_h = cv2.morphologyEx(image_bin, cv2.MORPH_OPEN, kernel_h)
+    kernel_h = np.ones((1, 30), np.uint8)
+    img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2)
+    img_bin_h = apply_motion_blur(img_bin_h, 0)
+    _, img_bin_h = cv2.threshold(img_bin_h, 120, 255, cv2.THRESH_BINARY)
+    # img_bin_h = cv2.dilate(img_bin_h, np.ones((1, 1), np.uint8), iterations=1)
+    return img_bin_h
+
+
+def detect_vertical_lines(image_bin: np.array, kernel_length=40):
+    line_min_width = 48
+    kernel_v = np.ones((line_min_width, 1), np.uint8)
+    img_bin_v = cv2.morphologyEx(image_bin, cv2.MORPH_OPEN, kernel_v)
+    kernel_v = np.ones((30, 1), np.uint8)
+    img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
+    img_bin_v = apply_motion_blur(img_bin_v, 90)
+    _, img_bin_v = cv2.threshold(img_bin_v, 120, 255, cv2.THRESH_BINARY)
+    # img_bin_v = cv2.dilate(img_bin_v, np.ones((1, 1), np.uint8), iterations=1)
+    return img_bin_v
+
+
+def detect_endpoints(
+    image: np.array, is_horizontal: bool
+) -> list[tuple[int, int, int, int]]:
+    def are_collinear(
+        quad1: tuple[int, int, int, int], quad2: tuple[int, int, int, int], index: int
+    ) -> bool:
+        dist_a = abs(quad1[index] - quad2[index])
+        dist_b = abs(quad1[index + 2] - quad2[index + 2])
+        overlap = True if index else (quad1[1] >= quad2[3] or quad1[3] >= quad2[1])
+
+        return (dist_a < 15) and (dist_b < 15) and overlap
+
+    points = cv2.HoughLinesP(
+        image,  # Input edge image
+        1,  # Distance resolution in pixels
+        np.pi / 180,  # Angle resolution in radians
+        threshold=100,  # Min number of votes for valid line
+        minLineLength=200,  # Min allowed length of line
+        maxLineGap=10,  # Max allowed gap between line for joining them
+    )
+    points = points if points is not None else []
+
+    lines = list(map(lambda x: tuple(x[0]), points))
+    if not lines:
+        return lines
+    index = int(is_horizontal)
+    lines.sort(key=lambda q: q[index])
+    corrected = [lines[0]]
+    for quad in lines[1:]:
+        if are_collinear(corrected[-1], quad, index):
+            prev = corrected.pop(-1)
+            corrected.append(
+                (
+                    min(prev[0], quad[0]),
+                    min(prev[1], quad[1]),
+                    max(prev[2], quad[2]),
+                    min(prev[3], quad[3]),
+                )
+                if is_horizontal
+                else (
+                    min(prev[0], quad[0]),
+                    max(prev[1], quad[1]),
+                    min(prev[2], quad[2]),
+                    min(prev[3], quad[3]),
+                )
+            )
+        else:
+            corrected.append(quad)
+    return corrected
+
+
+def parse_lines(image: np.array, show=False) -> list[dict[str, float]]:
+    image = preprocess(image)
+    # kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2, 2))
+    # image = cv2.dilate(image, kernel, iterations=4)
+    horizontal_line_img = detect_horizontal_lines(image)
+    vertical_line_img = detect_vertical_lines(image)
+
+    horizontal_endpoints = detect_endpoints(horizontal_line_img, is_horizontal=True)
+    vertical_endpoints = detect_endpoints(vertical_line_img, is_horizontal=False)
+
+    def format_quad(
+        quad: tuple[int, int, int, int], max_x: int, max_y: int
+    ) -> dict[str, float]:
+        x1, y1, x2, y2 = quad
+        if x1 > (x2 + 5):
+            x1, y1, x2, y2 = x2, y2, x1, y1
+        elif y1 > (y2 + 5):
+            x1, y1, x2, y2 = x2, y2, x1, y1
+        return {"x1": x1 / max_x, "y1": y1 / max_y, "x2": x2 / max_x, "y2": y2 / max_y}
+
+    ymax, xmax = image.shape
+    return list(
+        map(
+            lambda quad: format_quad(quad, xmax, ymax),
+            horizontal_endpoints + vertical_endpoints,
+        )
+    )
diff --git a/test/unit_tests/server_pipeline_test.py b/test/unit_tests/server_pipeline_test.py
index b0f9a28..39e685a 100644
--- a/test/unit_tests/server_pipeline_test.py
+++ b/test/unit_tests/server_pipeline_test.py
@@ -2,7 +2,11 @@ import fitz
 import numpy as np
 import pytest
 
-from cv_analysis.server.pipeline import table_parsing_formatter, figure_detection_formatter, make_analysis_pipeline
+from cv_analysis.server.pipeline import (
+    figure_detection_formatter,
+    make_analysis_pipeline,
+    table_parsing_cells_formatter,
+)
 from cv_analysis.utils.structures import Rectangle
 
 
@@ -21,10 +25,15 @@ def empty_pdf():
 
 @pytest.fixture
 def expected_formatted_analysis_result(operation):
-    if operation == "table":
+    if operation == "table_cells":
         return [
             {
-                "pageInfo": {"number": 1, "rotation": 0, "width": 595.0, "height": 842.0},
+                "pageInfo": {
+                    "number": 1,
+                    "rotation": 0,
+                    "width": 595.0,
+                    "height": 842.0,
+                },
                 "tableCells": [
                     {
                         "x0": 0.0,
@@ -40,8 +49,20 @@ def expected_formatted_analysis_result(operation):
     if operation == "figure":
         return [
             {
-                "pageInfo": {"number": 0, "rotation": 0, "width": 595.0, "height": 842.0},
-                "boundingBox": {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12},
+                "pageInfo": {
+                    "number": 0,
+                    "rotation": 0,
+                    "width": 595.0,
+                    "height": 842.0,
+                },
+                "boundingBox": {
+                    "x0": 0.0,
+                    "y0": 0.0,
+                    "x1": 15.12,
+                    "y1": 15.12,
+                    "width": 15.12,
+                    "height": 15.12,
+                },
                 "alpha": False,
             }
         ]
@@ -49,7 +70,7 @@ def expected_formatted_analysis_result(operation):
 
 @pytest.fixture
 def formatter(operation):
-    if operation == "table":
-        return table_parsing_formatter
+    if operation == "table_cells":
+        return table_parsing_cells_formatter
     elif operation == "figure":
         return figure_detection_formatter
@@ -57,8 +78,10 @@ def formatter(operation):
         raise
 
 
-@pytest.mark.parametrize("operation", ["table", "figure"])
+@pytest.mark.parametrize("operation", ["table_cells", "figure"])
 def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
-    analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False)
+    analysis_pipeline = make_analysis_pipeline(
+        analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False
+    )
     results = list(analysis_pipeline(empty_pdf))
     assert list(results) == expected_formatted_analysis_result