2022-08-10 14:17:57 +02:00

52 lines
1.7 KiB
Python

from operator import truth
from typing import Callable, Iterator
from funcy import flatten
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.structures import Rectangle
from pdf2img.conversion import convert_pages_to_images
from pdf2img.default_objects.image import ImagePlus
from pdf2img.default_objects.rectangle import RectanglePlus
def get_analysis_pipeline(operation):
    """Return the analysis pipeline for the named ``operation``.

    Args:
        operation: Name of the analysis. ``"figure"`` is wired directly to
            ``detect_figures`` with ``reduced=False``; any other name is
            resolved through ``get_analysis_fn`` and uses the default
            settings of ``make_analysis_pipeline``.

    Returns:
        The callable produced by ``make_analysis_pipeline``.
    """
    if operation != "figure":
        analysis_fn = get_analysis_fn(operation)
        return make_analysis_pipeline(analysis_fn)

    # Figure detection is the only operation that asks for non-reduced output.
    return make_analysis_pipeline(detect_figures, reduced=False)
def get_analysis_fn(operation):
    """Return the analysis function associated with ``operation``.

    Args:
        operation: Name of the analysis; one of ``"table"``, ``"layout"`` or
            ``"figure"``.

    Returns:
        ``parse_tables``, ``parse_layout`` or ``detect_figures`` respectively.

    Raises:
        ValueError: If ``operation`` is not one of the supported names.
    """
    if operation == "table":
        return parse_tables
    if operation == "layout":
        return parse_layout
    if operation == "figure":
        return detect_figures

    # A bare ``raise`` has no active exception to re-raise at this point and
    # would only surface as an opaque RuntimeError; name the bad value instead.
    raise ValueError(
        f"Unknown operation {operation!r}; expected 'table', 'layout' or 'figure'."
    )
def make_analysis_pipeline(analysis_fn: Callable, dpi=200, reduced=True):
    """Build a generator function that applies ``analysis_fn`` to each PDF page.

    Args:
        analysis_fn: Callable invoked with ``page.asarray()`` for every page.
            Its result is iterated as rectangles exposing ``xyxy()``; a falsy
            result means the page contributes nothing.
        dpi: Resolution handed to both ``convert_pages_to_images`` and
            ``RectanglePlus.from_pixel``.
        reduced: Passed as the sole argument to ``asdict`` on each result's
            image info.

    Returns:
        ``analysis_pipeline(pdf, index=None)``, a generator function
        (annotated ``Iterator[dict]``) yielding the formatted results of all
        pages as one flat stream.
    """

    def to_result(rect: Rectangle, page_info):
        # Convert one pixel-space rectangle into its image-info dict form.
        located = RectanglePlus.from_pixel(rect.xyxy(), page_info, alpha=False, dpi=dpi)
        return located.to_image_info().asdict(reduced)

    def analyse(page: ImagePlus):
        # Generator: emits one formatted result per rectangle found on the page.
        image, page_info = page.asarray(), page.info
        found = analysis_fn(image)
        if found:
            for rect in found:
                yield to_result(rect, page_info)

    def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
        page_images = convert_pages_to_images(pdf, index=index, dpi=dpi)
        per_page_results = (analyse(page_image) for page_image in page_images)
        # flatten() merges the per-page generators into a single stream.
        yield from flatten(filter(truth, per_page_results))

    return analysis_pipeline