from functools import partial
from typing import Iterable, List

import cv2
import numpy as np
from funcy import compose, rcompose, lkeep

from cv_analysis.utils import lstarkeep
from cv_analysis.utils.common import (
    find_contours_and_hierarchies,
    dilate_page_components,
    normalize_to_gray_scale,
    threshold_image,
    invert_image,
    fill_rectangles,
)
from cv_analysis.utils.conversion import contour_to_rectangle
from cv_analysis.utils.merging import merge_related_rectangles
from cv_analysis.utils.postprocessing import remove_included, has_no_parent
from cv_analysis.utils.rectangle import Rectangle


def parse_layout(image: np.ndarray) -> List[Rectangle]:
    """Parse the layout of a page.

    The image is run through ``find_segments``; the resulting rectangles are
    then passed through ``remove_included``, ``merge_related_rectangles`` and
    ``remove_included`` again, in that order.

    Args:
        image: Image of the page.

    Returns:
        List of rectangles representing the layout of the page as identified page elements.
    """
    # rcompose applies its functions left to right.
    return rcompose(
        find_segments,
        remove_included,
        merge_related_rectangles,
        remove_included,
    )(image)


def find_segments(image: np.ndarray) -> List[Rectangle]:
    """Find segments in a page.

    Segments are structural elements of a page, such as text blocks, tables, etc.

    Detection runs in two passes: contours are first searched on the image
    returned by ``prepare_for_initial_detection``; the rectangles found are
    then handed, together with a copy of the original image, to
    ``prepare_for_meta_detection``, and contours are searched a second time on
    its output.

    Args:
        image: Image of the page.

    Returns:
        Rectangles found by the second detection pass.
    """
    # The second stage works on a copy of the input.
    # NOTE(review): presumably because fill_rectangles draws in place - confirm.
    prepare_second_pass = partial(prepare_for_meta_detection, image.copy())

    return rcompose(
        prepare_for_initial_detection,
        __find_segments,
        prepare_second_pass,
        __find_segments,
    )(image)


def prepare_for_initial_detection(image: np.ndarray) -> np.ndarray:
    """Prepare a page image for the first contour detection pass.

    Args:
        image: Image of the page.

    Returns:
        The image after ``normalize_to_gray_scale`` followed by
        ``dilate_page_components``.
    """
    # compose applies right to left (unlike rcompose used elsewhere in this
    # module): gray-scale normalization runs first, dilation second.
    return compose(dilate_page_components, normalize_to_gray_scale)(image)


def __find_segments(image: np.ndarray) -> List[Rectangle]:
    """Convert the qualifying contours of an image into rectangles.

    A contour qualifies when ``is_likely_segment`` accepts it and
    ``has_no_parent`` accepts its hierarchy entry.

    Args:
        image: Image to search for contours.

    Returns:
        One rectangle per qualifying contour.
    """

    def to_rectangle_if_valid(contour, hierarchy):
        # None marks a rejected contour.
        # NOTE(review): lstarkeep is assumed to unpack each pair into this
        # callback and drop the None results - confirm in cv_analysis.utils.
        if is_likely_segment(contour) and has_no_parent(hierarchy):
            return contour_to_rectangle(contour)
        return None

    contours_with_hierarchies = zip(*find_contours_and_hierarchies(image))
    return lstarkeep(to_rectangle_if_valid, contours_with_hierarchies)


def prepare_for_meta_detection(image: np.ndarray, rectangles: Iterable[Rectangle]) -> np.ndarray:
    """Prepare an image for the second contour detection pass.

    Args:
        image: Image of the page.
        rectangles: Rectangles found by the first detection pass.

    Returns:
        The result of ``fill_rectangles(image, rectangles)`` passed through
        ``threshold_image``, ``invert_image`` and ``normalize_to_gray_scale``,
        in that order.
    """
    # Only the first function of the chain receives both arguments; each later
    # one receives the previous function's result.
    return rcompose(
        fill_rectangles,
        threshold_image,
        invert_image,
        normalize_to_gray_scale,
    )(image, rectangles)


def is_likely_segment(rectangle: np.ndarray, min_area: float = 100) -> bool:
    """Tell whether a contour is large enough to count as a segment.

    Args:
        rectangle: Contour to measure. Despite the parameter name (kept so that
            existing keyword callers do not break), the value is handed straight
            to ``cv2.contourArea``, so it has to be a contour point array rather
            than a ``Rectangle`` object.
        min_area: Area the contour must strictly exceed.

    Returns:
        True if the contour's area is greater than ``min_area``.
    """
    # FIXME: Parameterize via factory
    # oriented=False: take the absolute area, independent of point order.
    return cv2.contourArea(rectangle, False) > min_area