Julius Unverfehrt 0a11471191 feat(opentel,dynaconf): adapt new pyinfra
This commit also disables a broken test that cannot be fixed. There are
also many scripts that didn't work anyway (and are not needed in my
eyes) that were not updated. The scripts that are needed to run the
service processing locally still work.
2024-02-08 11:19:33 +01:00

63 lines
2.2 KiB
Python

import sys
from dataclasses import asdict
from operator import truth
from funcy import lmap, flatten
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.structures import Rectangle
from pdf2img.conversion import convert_pages_to_images
from pdf2img.default_objects.image import ImagePlus, ImageInfo
from pdf2img.default_objects.rectangle import RectanglePlus
def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=True):
    """Build the analysis pipeline for the requested operation.

    Both supported pipelines render pages at 200 dpi.

    Args:
        operation: ``"table"`` selects ``parse_tables`` together with
            ``table_parsing_formatter``; ``"figure"`` selects ``detect_figures``
            together with ``figure_detection_formatter``.
        table_parsing_skip_pages_without_images: Forwarded as
            ``skip_pages_without_images`` for the ``"table"`` operation only.
            The ``"figure"`` pipeline relies on the default of
            ``make_analysis_pipeline``.

    Returns:
        The callable produced by ``make_analysis_pipeline``.

    Raises:
        ValueError: If ``operation`` is neither ``"table"`` nor ``"figure"``.
    """
    if operation == "table":
        return make_analysis_pipeline(
            parse_tables,
            table_parsing_formatter,
            dpi=200,
            skip_pages_without_images=table_parsing_skip_pages_without_images,
        )
    if operation == "figure":
        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
    # A bare ``raise`` outside an ``except`` block only yields
    # "RuntimeError: No active exception to reraise" (or re-raises an unrelated
    # exception the caller happens to be handling), so report the bad argument
    # explicitly instead.
    raise ValueError(f"Unknown operation {operation!r}; expected 'table' or 'figure'.")
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
    """Create a generator function that analyses the pages of a PDF.

    Args:
        analysis_fn: Called with each page's array (``page.asarray()``); a
            falsy result means the page is skipped.
        formatter: Called as ``formatter(rects, page, dpi)`` for every page
            with a truthy analysis result.
        dpi: Passed to ``convert_pages_to_images`` and to ``formatter``.
        skip_pages_without_images: Passed through to
            ``convert_pages_to_images``.

    Returns:
        ``analyse_pipeline(pdf, index=None)``, a generator function.
    """

    def analyse_pipeline(pdf: bytes, index=None):
        """Lazily yield the flattened, non-empty formatter results per page."""

        def analyse_single_page(page_image: ImagePlus):
            detected = analysis_fn(page_image.asarray())
            # Pages without detections contribute None and are filtered out below.
            return formatter(detected, page_image, dpi) if detected else None

        page_images = convert_pages_to_images(
            pdf,
            index=index,
            dpi=dpi,
            skip_pages_without_images=skip_pages_without_images,
        )
        per_page = (analyse_single_page(page_image) for page_image in page_images)
        # Drop falsy results, then let funcy's ``flatten`` unpack the rest.
        yield from flatten(formatted for formatted in per_page if formatted)

    return analyse_pipeline
def table_parsing_formatter(rects, page: ImagePlus, dpi):
    """Format the table-cell rectangles found on one page.

    Args:
        rects: Iterable of rectangles exposing ``xyxy()`` pixel coordinates.
        page: The page the rectangles were found on; provides ``info`` and
            ``asdict``.
        dpi: Resolution passed to ``RectanglePlus.from_pixels``.

    Returns:
        A dict with ``"pageInfo"`` (``page.asdict(natural_index=True)``) and
        ``"tableCells"`` (one derotated ``RectanglePlus.asdict`` per rectangle).
    """
    cells = []
    for rect in rects:
        positioned = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        cells.append(positioned.asdict(derotate=True))
    return {"pageInfo": page.asdict(natural_index=True), "tableCells": cells}
def figure_detection_formatter(rects, page, dpi):
    """Format the figure rectangles found on one page.

    Args:
        rects: Iterable of rectangles exposing ``xyxy()`` pixel coordinates.
        page: The page the rectangles were found on; provides ``info``.
        dpi: Resolution passed to ``RectanglePlus.from_pixels``.

    Returns:
        A list with one ``dataclasses.asdict`` dump of an ``ImageInfo`` per
        rectangle, built from ``page.info``, the non-derotated bounding box
        and the rectangle's ``alpha`` value.
    """
    figures = []
    for rect in rects:
        placed = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        figure_info = ImageInfo(page.info, placed.asbbox(derotate=False), placed.alpha)
        figures.append(asdict(figure_info))
    return figures