52 lines
1.7 KiB
Python
52 lines
1.7 KiB
Python
from operator import truth
|
|
from typing import Callable, Iterator
|
|
|
|
from funcy import flatten
|
|
|
|
from cv_analysis.figure_detection.figure_detection import detect_figures
|
|
from cv_analysis.layout_parsing import parse_layout
|
|
from cv_analysis.table_parsing import parse_tables
|
|
from cv_analysis.utils.structures import Rectangle
|
|
from pdf2img.conversion import convert_pages_to_images
|
|
from pdf2img.default_objects.image import ImagePlus
|
|
from pdf2img.default_objects.rectangle import RectanglePlus
|
|
|
|
|
|
def get_analysis_pipeline(operation):
    """Return a ready-to-use analysis pipeline for the given operation name.

    Figure detection is wired up directly with ``detect_figures`` and asks for
    non-reduced result dicts (``reduced=False``); every other operation is
    resolved through ``get_analysis_fn`` and uses the pipeline defaults.

    Args:
        operation: Name of the analysis to run (e.g. ``"figure"``).

    Returns:
        The pipeline callable produced by ``make_analysis_pipeline``.
    """
    if operation != "figure":
        analysis_fn = get_analysis_fn(operation)
        return make_analysis_pipeline(analysis_fn)

    return make_analysis_pipeline(detect_figures, reduced=False)
|
|
|
|
|
|
def get_analysis_fn(operation):
    """Return the analysis function that implements ``operation``.

    Args:
        operation: One of ``"table"``, ``"layout"`` or ``"figure"``.

    Returns:
        ``parse_tables``, ``parse_layout`` or ``detect_figures`` respectively.

    Raises:
        ValueError: If ``operation`` is not one of the supported names.
    """
    if operation == "table":
        return parse_tables
    if operation == "layout":
        return parse_layout
    if operation == "figure":
        return detect_figures

    # A bare ``raise`` has nothing to re-raise here (or, worse, re-raises an
    # unrelated exception the caller happens to be handling), so report the
    # offending value explicitly instead.
    raise ValueError(
        f"Unknown operation {operation!r}; expected 'table', 'layout' or 'figure'."
    )
|
|
|
|
|
|
def make_analysis_pipeline(analysis_fn: Callable, dpi=200, reduced=True):
    """Build a pipeline that applies ``analysis_fn`` to every page of a PDF.

    Args:
        analysis_fn: Callable invoked with each page's array; it returns the
            rectangles found on that page. A falsy result means the page
            contributes nothing to the output.
        dpi: Resolution handed to ``convert_pages_to_images`` and to
            ``RectanglePlus.from_pixel``.
        reduced: Forwarded to ``asdict`` when each result is serialised.

    Returns:
        A generator function ``analysis_pipeline(pdf, index=None)`` that yields
        one formatted result per detected rectangle.
    """

    def analyse(page: ImagePlus):
        """Yield the formatted results for a single page image."""
        pixels = page.asarray()
        info = page.info

        found = analysis_fn(pixels)
        if found:
            for rect in found:
                # Convert the pixel-space rectangle using this page's info,
                # then serialise it.
                converted = RectanglePlus.from_pixel(rect.xyxy(), info, alpha=False, dpi=dpi)
                yield converted.to_image_info().asdict(reduced)

    def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
        """Render the requested pages of ``pdf`` and stream all page results."""
        page_images = convert_pages_to_images(pdf, index=index, dpi=dpi)
        per_page_results = (analyse(page_image) for page_image in page_images)
        # ``flatten`` merges the per-page generators into one flat stream.
        yield from flatten(filter(truth, per_page_results))

    return analysis_pipeline
|