Merge in RR/cv-analysis from adjust-signature-to-per-page to master
Squashed commit of the following:
commit 1142a350d537453a7ac35d97bfa00bd2a64c4871
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Aug 12 11:09:04 2022 +0200
Adjust the table-parsing response to be grouped per page
59 lines
2.0 KiB
Python
59 lines
2.0 KiB
Python
from operator import truth
|
|
|
|
from funcy import lmap
|
|
|
|
from cv_analysis.figure_detection.figure_detection import detect_figures
|
|
from cv_analysis.table_parsing import parse_tables
|
|
from cv_analysis.utils.structures import Rectangle
|
|
from pdf2img.conversion import convert_pages_to_images
|
|
from pdf2img.default_objects.image import ImagePlus, ImageInfo
|
|
from pdf2img.default_objects.rectangle import RectanglePlus
|
|
|
|
|
|
def get_analysis_pipeline(operation):
    """Return the analysis pipeline registered for ``operation``.

    Args:
        operation: Name of the analysis to run. ``"table"`` selects
            ``parse_tables`` with ``table_parsing_formatter``; ``"figure"``
            selects ``detect_figures`` with ``figure_detection_formatter``.

    Returns:
        The callable built by ``make_analysis_pipeline`` for the selected
        analysis function and formatter, configured with ``dpi=200``.

    Raises:
        ValueError: If ``operation`` is neither ``"table"`` nor ``"figure"``.
    """
    if operation == "table":
        return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200)
    if operation == "figure":
        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
    # A bare ``raise`` outside an ``except`` block only yields an opaque
    # "No active exception to re-raise" RuntimeError, so report the rejected
    # value explicitly instead.
    raise ValueError(f"Unsupported operation {operation!r}; expected 'table' or 'figure'.")
|
|
|
|
|
|
def make_analysis_pipeline(analysis_fn, formatter, dpi):
    """Build a lazy, page-wise analysis pipeline for a PDF document.

    Args:
        analysis_fn: Callable applied to the array of each page image; a falsy
            result causes the page to be skipped.
        formatter: Callable invoked as ``formatter(rects, page, dpi)`` to turn
            the analysis result of one page into its output structure.
        dpi: Resolution passed to ``convert_pages_to_images`` and forwarded to
            ``formatter``.

    Returns:
        A generator function ``analyse_pipeline(pdf, index=None)`` that yields
        the formatted result of every page for which both the analysis result
        and the formatted result are truthy.
    """

    def analyse_pipeline(pdf: bytes, index=None):
        """Yield the truthy formatted results for the pages of ``pdf``."""
        for page in convert_pages_to_images(pdf, index=index, dpi=dpi):
            detections = analysis_fn(page.asarray())
            if not detections:
                # Nothing was found on this page, so nothing is emitted for it.
                continue
            formatted = formatter(detections, page, dpi)
            # Falsy formatter output is dropped as well.
            if formatted:
                yield formatted

    return analyse_pipeline
|
|
|
|
|
|
def table_parsing_formatter(rects, page, dpi):
    """Format the table cells detected on one page as a per-page dictionary.

    Args:
        rects: Iterable of ``Rectangle`` objects exposing ``xyxy()``;
            presumably pixel coordinates on the page image -- TODO confirm.
        page: Page object providing ``info`` and ``asdict(reduced=True)``.
        dpi: Resolution forwarded to ``RectanglePlus.from_pixels``.

    Returns:
        A dict holding the entries of ``page.asdict(reduced=True)`` plus a
        ``"tableCells"`` list with one reduced rectangle dict per entry of
        ``rects``.
    """

    def to_cell_dict(cell: Rectangle):
        converted = RectanglePlus.from_pixels(*cell.xyxy(), page.info, alpha=False, dpi=dpi)
        converted.derotate()  # TODO: see if derotate is necessary
        converted.transform()
        return converted.asdict(reduced=True)

    table_cells = [to_cell_dict(cell) for cell in rects]

    page_fields = page.asdict(reduced=True)
    return {**page_fields, "tableCells": table_cells}
|
|
|
|
|
|
def figure_detection_formatter(rects, page, dpi):
    """Format the figures detected on one page as a list of dictionaries.

    Args:
        rects: Iterable of ``Rectangle`` objects exposing ``xyxy()``;
            presumably pixel coordinates on the page image -- TODO confirm.
        page: Page object providing ``info``.
        dpi: Resolution forwarded to ``RectanglePlus.from_pixels``.

    Returns:
        A list with one non-reduced ``ImageInfo`` dict per entry of ``rects``.
    """

    def to_figure_dict(figure: Rectangle):
        bounds = RectanglePlus.from_pixels(*figure.xyxy(), page.info, alpha=False, dpi=dpi)
        bounds.derotate()  # TODO: see if derotate is necessary
        figure_info = ImageInfo(page.info, bounds.asbbox(), bounds.alpha)
        return figure_info.asdict(reduced=False)

    return [to_figure_dict(figure) for figure in rects]
|