from functools import partial
from itertools import starmap
from operator import truth
from typing import Callable, Iterator

from funcy import lmap

from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.structures import Rectangle
from pdf2img.conversion import convert_pdf_to_image_and_metadata_stream


def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
    """Make an end-to-end pipeline that analyses a PDF with the given analysis function.

    The returned pipeline streams dicts containing the page metadata and the
    analysis results under the key ``"bboxes"``.

    Note:
        If there are no results on a page, the page is skipped in the result stream.

    Steps:
        1. Convert the PDF to a stream of (page image, page metadata) tuples.
        2. Analyse each page:
            - get the list of bounding boxes per page (e.g. table cells),
            - rescale the pixel coordinates (see ``convert_pixel_rect_to_inches_rect``),
            - format the results.

    Args:
        analysis_fn: Callable that takes a page image and returns an iterable of
            rectangles in pixel coordinates.
        dpi: Resolution used both for rendering the pages and for rescaling the
            resulting coordinates.

    Returns:
        A generator function ``analysis_pipeline(pdf, index=None)``.
    """

    def analyse_image_metadata_pair(image, metadata) -> dict:
        """Analyse a single page; return an empty dict if nothing was found."""
        rectangles = analysis_fn(image)
        rectangles = map(partial(convert_pixel_rect_to_inches_rect, dpi=dpi), rectangles)
        bboxes = lmap(lambda rect: rect.json_full(), rectangles)
        if not bboxes:
            # Falsy result: filtered out of the stream by the pipeline below.
            return {}
        return {**metadata, "bboxes": bboxes}

    def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
        """Lazily yield one result dict per page that has at least one bounding box."""
        image_metadata_stream = convert_pdf_to_image_and_metadata_stream(pdf, index=index, dpi=dpi)
        results = starmap(analyse_image_metadata_pair, image_metadata_stream)
        # Drop the empty dicts produced for pages without results.
        yield from filter(truth, results)

    return analysis_pipeline


def get_analysis_fn(analysis_type):
    """Return the analysis function registered for ``analysis_type``.

    Args:
        analysis_type: One of ``"table"``, ``"layout"`` or ``"figure"``.

    Returns:
        The callable to pass to ``make_analysis_pipeline``.

    Raises:
        ValueError: If ``analysis_type`` is not one of the supported values.
    """
    if analysis_type == "table":
        return parse_tables
    if analysis_type == "layout":
        return parse_layout
    if analysis_type == "figure":
        # Built on demand so the figure pipeline is only created when requested.
        return make_figure_detection_pipeline()
    # A bare `raise` here would only produce "RuntimeError: No active exception
    # to reraise", hiding the actual cause from the caller.
    raise ValueError(
        f"Unknown analysis type {analysis_type!r}; expected 'table', 'layout' or 'figure'."
    )


def convert_pixel_rect_to_inches_rect(rect, dpi):
    """Rescale a rectangle from pixel coordinates using the rendering resolution.

    Each coordinate is computed as ``pixel / dpi * 72``.

    NOTE(review): ``pixel / dpi`` is inches, so the additional factor of 72 means
    the result is in units of 1/72 inch, not inches as the function name
    suggests. Presumably this matches the PDF coordinate system - confirm before
    relying on the unit.

    Args:
        rect: Rectangle exposing ``xyxy()`` with pixel coordinates.
        dpi: Resolution (dots per inch) the page image was rendered with.

    Returns:
        A new non-discrete ``Rectangle`` with the rescaled coordinates.
    """

    def pixel_to_inch(pixel):
        return pixel / dpi * 72

    bbox_inches = tuple(map(pixel_to_inch, rect.xyxy()))
    return Rectangle.from_xyxy(bbox_inches, discrete=False)