# 90 lines, 2.5 KiB, Python (file listing metadata, not code)

from typing import Iterable
import cv2
import numpy as np
from cv_analysis.figure_detection.text import remove_primary_text_regions, apply_threshold_to_image
from cv_analysis.table_parsing import preprocess, isolate_vertical_and_horizontal_components, \
turn_connected_components_into_rects
from cv_analysis.utils.display import show_image_mpl
def area_is_bigger_than(rect: tuple, maxarea=100000) -> bool:
    """Report whether a rectangle's area reaches a size threshold.

    Args:
        rect: Four-element ``(x, y, w, h)`` tuple. Only ``w`` and ``h`` are
            used; the position is ignored.
        maxarea: Area threshold. Despite its name it acts as an inclusive
            lower bound, because the comparison is ``w * h >= maxarea``.

    Returns:
        True when ``w * h`` is at least ``maxarea``, otherwise False.
    """
    # The position is irrelevant for the area; unpacking still enforces that
    # ``rect`` has exactly four elements.
    _, _, w, h = rect
    return w * h >= maxarea
def define_rect(rect_img, original_position):
    """Classify a layout rectangle as header, footer, table or text.

    The checks run in a fixed order and the first match wins: header (tested
    on the rectangle's bottom edge), footer (tested on its top edge), table,
    then text. The crop is displayed via ``show_image_mpl`` before any check,
    and the matching label is printed together with the position.

    Args:
        rect_img: Image crop belonging to the rectangle.
        original_position: ``(x, y, w, h)`` of the rectangle.

    Returns:
        ``"header"``, ``"footer"``, ``"table"`` or ``"text"``; ``None`` when
        none of the checks match.
    """
    show_image_mpl(rect_img)
    _left, top, _width, height = original_position
    bottom = top + height

    if is_header(bottom):
        print(original_position, " is header")
        return "header"

    if is_footer(top):
        print(original_position, " is footer")
        return "footer"

    if is_table(rect_img):
        print(original_position, " is table")
        return "table"

    if is_text(rect_img):
        print(original_position, " is text")
        return "text"

    # Nothing matched: the rectangle stays unlabeled.
    return None
def is_table(rect_img, min_area_ratio=0.75):
    """Decide whether a cropped layout rectangle looks like a table.

    The crop is passed through ``preprocess`` and
    ``isolate_vertical_and_horizontal_components``. It is accepted as a table
    only if all of the following hold:

    * at least one external contour is found in the result,
    * ``turn_connected_components_into_rects`` yields more than one rect,
    * there is exactly one external contour, and
    * that contour's bounding box covers at least ``min_area_ratio`` of the
      crop's area.

    A diagnostic line is printed on every path.

    Args:
        rect_img: Image crop as a NumPy array. Only the first two entries of
            ``shape`` are read here, so a trailing channel axis does not
            break the size computation.
        min_area_ratio: Minimum fraction of the crop area that the bounding
            box must cover. Defaults to 0.75.

    Returns:
        True if the crop is considered a table, otherwise False.
    """
    rect_img_inv = preprocess(rect_img)
    grid_inv = isolate_vertical_and_horizontal_components(rect_img_inv)
    cnts, _ = cv2.findContours(image=grid_inv, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)
    if not cnts:
        print("not a table, but text?")
        return False

    cells = [r.xywh() for r in turn_connected_components_into_rects(grid_inv)]
    bboxes = [cv2.boundingRect(c) for c in cnts]
    if len(cells) <= 1 or len(bboxes) != 1:
        print("single cell or no connected components, maybe figure?")
        return False

    _, _, w, h = bboxes[0]
    # ndarray.shape is (rows, cols[, channels]): height comes first, and the
    # slice ignores a possible channel axis.
    h_img, w_img = rect_img.shape[:2]
    if w * h / (w_img * h_img) >= min_area_ratio:
        print("is table")
        return True

    print(" table detected but to small for layout rect, so cant be table, maybe figure?")
    return False
def is_header(y, header_limit=200):
    """Report whether a vertical coordinate lies in the header zone.

    Args:
        y: Vertical coordinate to test. ``define_rect`` passes the bottom
            edge (``y + h``) of the rectangle.
        header_limit: Exclusive upper bound of the header zone. Defaults to
            200 (presumably pixels at the page resolution this module was
            tuned for; confirm before reusing at other resolutions).

    Returns:
        True when ``y`` is strictly less than ``header_limit``.
    """
    return y < header_limit
def is_footer(y, footer_start=2150):
    """Report whether a vertical coordinate lies in the footer zone.

    Args:
        y: Vertical coordinate to test. ``define_rect`` passes the top edge
            of the rectangle.
        footer_start: Exclusive lower bound of the footer zone. Defaults to
            2150 (presumably pixels at the page resolution this module was
            tuned for; confirm before reusing at other resolutions).

    Returns:
        True when ``y`` is strictly greater than ``footer_start``.
    """
    return y > footer_start
def is_text(img):
    """Decide whether an image crop is text.

    The crop is passed to ``remove_primary_text_regions`` and the share of
    non-zero pixels in the result is measured with ``pixel_density``. The
    crop counts as text when that share is below 0.05. The image is shown
    via ``show_image_mpl`` both before and after the removal step.

    Args:
        img: Image crop to inspect.

    Returns:
        True when the density of the processed crop is below 0.05.
    """
    show_image_mpl(img)
    without_text = remove_primary_text_regions(img)
    show_image_mpl(without_text)

    remaining_density = pixel_density(without_text)
    return remaining_density < 0.05
def pixel_density(img):
    """Return the fraction of non-zero elements in an array.

    Args:
        img: NumPy array (any shape).

    Returns:
        ``count_nonzero(img) / img.size`` as a float in ``[0, 1]``. An empty
        array yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if img.size == 0:
        # Nothing to count in an empty array; avoid dividing by zero.
        return 0.0
    return np.count_nonzero(img) / img.size
def annotate_rect(rect, rect_img):
    """Placeholder with no implementation yet.

    Both arguments are accepted and ignored; the function always returns
    ``None``.
    """
def label_rects(rects: Iterable[tuple], image: np.ndarray) -> dict:
    """Label every rectangle by classifying its crop of ``image``.

    Args:
        rects: Iterable of ``(x, y, w, h)`` tuples. Each tuple is used as a
            dictionary key, so it must be hashable.
        image: Array the rectangles refer to. It is sliced as
            ``image[y:y + h, x:x + w]`` to obtain each crop.

    Returns:
        Dict mapping each rect to the label returned by ``define_rect`` for
        its crop. Empty when ``rects`` is empty.
    """
    labeled_rects = {}
    for rect in rects:
        x, y, w, h = rect
        labeled_rects[rect] = define_rect(image[y:y + h, x:x + w], rect)
    # Hand the mapping back; without this the labels would be discarded.
    return labeled_rects