Julius Unverfehrt 954c2f17a5 Pull request #26: adjust response for table parsing to be per page based
Merge in RR/cv-analysis from adjust-signature-to-per-page to master

Squashed commit of the following:

commit 1142a350d537453a7ac35d97bfa00bd2a64c4871
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Aug 12 11:09:04 2022 +0200

    adjust response for table parsing to be per page based
2022-08-12 11:46:24 +02:00

59 lines
2.0 KiB
Python

from operator import truth
from funcy import lmap
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.structures import Rectangle
from pdf2img.conversion import convert_pages_to_images
from pdf2img.default_objects.image import ImagePlus, ImageInfo
from pdf2img.default_objects.rectangle import RectanglePlus
def get_analysis_pipeline(operation):
    """Return the analysis pipeline registered for ``operation``.

    Args:
        operation: Name of the analysis to run. ``"table"`` selects
            ``parse_tables`` with ``table_parsing_formatter``; ``"figure"``
            selects ``detect_figures`` with ``figure_detection_formatter``.
            Both pipelines are built with ``dpi=200``.

    Returns:
        The callable produced by ``make_analysis_pipeline`` for the
        selected analysis function and formatter.

    Raises:
        ValueError: If ``operation`` is neither ``"table"`` nor ``"figure"``.
    """
    if operation == "table":
        return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200)
    if operation == "figure":
        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
    # A bare `raise` here has no active exception to re-raise and would only
    # surface as "RuntimeError: No active exception to reraise" (or, worse,
    # re-raise an unrelated exception the caller is currently handling).
    # Raise an explicit error that names the offending value instead.
    raise ValueError(
        f"Unsupported operation {operation!r}; expected 'table' or 'figure'."
    )
def make_analysis_pipeline(analysis_fn, formatter, dpi):
    """Build a per-page analysis generator function for PDF input.

    Args:
        analysis_fn: Callable applied to each page's array representation
            (``page.asarray()``); its result is treated as the detected
            rectangles.
        formatter: Callable invoked as ``formatter(rects, page, dpi)`` to
            turn the detections of one page into the response payload.
        dpi: Resolution passed both to the page conversion and to the
            formatter.

    Returns:
        A generator function ``analyse_pipeline(pdf, index=None)`` that
        lazily yields one formatted result per page, skipping pages whose
        detections or formatted result are falsy.
    """

    def analyse_pipeline(pdf: bytes, index=None):
        def parse_page(page: ImagePlus):
            detections = analysis_fn(page.asarray())
            # Pages without detections produce None and are dropped below.
            return formatter(detections, page, dpi) if detections else None

        for page in convert_pages_to_images(pdf, index=index, dpi=dpi):
            formatted = parse_page(page)
            if formatted:
                yield formatted

    return analyse_pipeline
def table_parsing_formatter(rects, page, dpi):
    """Format the table cells detected on one page as a per-page dict.

    Args:
        rects: Iterable of detected rectangles; each must provide ``xyxy()``.
        page: Page object providing ``info`` and ``asdict(reduced=True)``.
        dpi: Resolution forwarded to ``RectanglePlus.from_pixels``.

    Returns:
        The reduced page dict extended with a ``"tableCells"`` entry that
        holds the list of reduced cell dicts.
    """

    def to_cell_dict(rect: Rectangle):
        cell = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        # Both calls are used for their effect on `cell`; results are discarded.
        cell.derotate()  # TODO: see if derotate is necessary
        cell.transform()
        return cell.asdict(reduced=True)

    # Format the cells first, then merge them into the page description.
    table_cells = [to_cell_dict(rect) for rect in rects]
    page_fields = page.asdict(reduced=True)
    return {**page_fields, "tableCells": table_cells}
def figure_detection_formatter(rects, page, dpi):
    """Format the figures detected on one page as a list of image-info dicts.

    Args:
        rects: Iterable of detected rectangles; each must provide ``xyxy()``.
        page: Page object providing ``info``.
        dpi: Resolution forwarded to ``RectanglePlus.from_pixels``.

    Returns:
        A list with one ``ImageInfo(...).asdict(reduced=False)`` dict per
        detected rectangle, in input order.
    """

    def to_image_info(rect: Rectangle):
        figure_box = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        figure_box.derotate()  # TODO: see if derotate is necessary
        info = ImageInfo(page.info, figure_box.asbbox(), figure_box.alpha)
        return info.asdict(reduced=False)

    return [to_image_info(rect) for rect in rects]