"""Heuristics that label layout rectangles as header, footer, table or text."""
from typing import Iterable

import cv2
import numpy as np

from cv_analysis.figure_detection.text import remove_primary_text_regions, apply_threshold_to_image
from cv_analysis.table_parsing import (
    preprocess,
    isolate_vertical_and_horizontal_components,
    turn_connected_components_into_rects,
)
from cv_analysis.utils.display import show_image_mpl


def area_is_bigger_than(rect: tuple, maxarea=100000):
    """Return True when the area of *rect* is at least *maxarea*.

    Args:
        rect: An ``(x, y, w, h)`` tuple; only ``w`` and ``h`` are used.
        maxarea: Area threshold. Despite the name it acts as a lower bound:
            the comparison is ``w * h >= maxarea``.
    """
    _, _, w, h = rect
    return w * h >= maxarea


def define_rect(rect_img, original_position):
    """Classify one layout rectangle.

    The checks run in a fixed order (header, footer, table, text) and the
    first one that matches wins. The crop is displayed via ``show_image_mpl``
    before classification, and the decision is printed.

    Args:
        rect_img: Image crop belonging to the rectangle.
        original_position: ``(x, y, w, h)`` of the crop in the source image.

    Returns:
        ``"header"``, ``"footer"``, ``"table"`` or ``"text"``; ``None`` when
        none of the checks match.
    """
    show_image_mpl(rect_img)
    _, yo, _, ho = original_position

    # Header is decided by the bottom edge of the rectangle, footer by its top edge.
    if is_header(yo + ho):
        print(original_position, " is header")
        return "header"
    if is_footer(yo):
        print(original_position, " is footer")
        return "footer"
    if is_table(rect_img):
        print(original_position, " is table")
        return "table"
    if is_text(rect_img):
        print(original_position, " is text")
        return "text"
    return None


def is_table(rect_img):
    """Return True when *rect_img* looks like a table that fills the crop.

    A crop counts as a table when its line grid has exactly one external
    contour, more than one rectangle is extracted from that grid, and the
    contour's bounding box covers at least 75% of the crop area. The outcome
    of each decision is printed.

    Args:
        rect_img: Image crop as a NumPy array; only its first two dimensions
            are used for the area computation.
    """
    rect_img_inv = preprocess(rect_img)
    grid_inv = isolate_vertical_and_horizontal_components(rect_img_inv)
    cnts, _ = cv2.findContours(image=grid_inv, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)

    if not cnts:
        print("not a table, but text?")
        return False

    cells = [r.xywh() for r in turn_connected_components_into_rects(grid_inv)]
    bbox = [cv2.boundingRect(c) for c in cnts]

    if len(cells) <= 1 or len(bbox) != 1:
        print("single cell or no connected components, maybe figure?")
        return False

    _, _, w, h = bbox[0]
    # Slice to the first two dimensions so a multi-channel crop does not break
    # the unpacking; NumPy image arrays are laid out as (rows, cols[, channels]).
    h_img, w_img = rect_img.shape[:2]

    if w * h / (w_img * h_img) >= 0.75:
        print("is table")
        return True

    print(" table detected but to small for layout rect, so cant be table, maybe figure?")
    return False


def is_header(y):
    """Return True when *y* lies above the header limit (``y < 200``).

    NOTE(review): the limit is presumably in pixels of the page image - confirm.
    """
    return y < 200


def is_footer(y):
    """Return True when *y* lies below the footer limit (``y > 2150``).

    NOTE(review): the limit is presumably in pixels of the page image - confirm.
    """
    return y > 2150


def is_text(img):
    """Return True when little is left of *img* after text removal.

    The image is passed through ``remove_primary_text_regions``; it is
    considered text when fewer than 5% of the remaining pixels are non-zero.
    Both the input and the cleaned image are displayed via ``show_image_mpl``.
    """
    show_image_mpl(img)
    cleaned = remove_primary_text_regions(img)
    show_image_mpl(cleaned)
    return pixel_density(cleaned) < 0.05


def pixel_density(img):
    """Return the fraction of non-zero elements in *img*."""
    return np.count_nonzero(img) / img.size


def annotate_rect(rect, rect_img):
    """Placeholder; not implemented yet and currently does nothing."""
    pass


def label_rects(rects: Iterable[tuple], image: np.ndarray) -> dict:
    """Classify every rectangle in *rects* against *image*.

    Args:
        rects: ``(x, y, w, h)`` tuples; each must be hashable because it is
            used as a key of the result.
        image: Source image the rectangles are cropped from.

    Returns:
        A dict mapping each rectangle to the label returned by ``define_rect``.
    """
    labeled_rects = {}
    for rect in rects:
        x, y, w, h = rect
        labeled_rects[rect] = define_rect(image[y:y + h, x:x + w], rect)
    # The mapping used to be built and then discarded; hand it back to the caller.
    return labeled_rects