cv-analysis-service/scripts/pyinfra_mock.py
Julius Unverfehrt e7b28f5bda Pull request #18: Remove pil
Merge in RR/cv-analysis from remove_pil to master

Squashed commit of the following:

commit 83c8d88f3d48404251470176c70979ee75ae068b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Jul 21 10:51:51 2022 +0200

    remove deprecated server tests

commit cebc03b5399ac257a74036b41997201f882f5b74
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Jul 21 10:51:08 2022 +0200

    remove deprecated server tests

commit ce2845b0c51f001b7b5b8b195d6bf7e034ec4e39
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Jul 20 17:05:00 2022 +0200

    repair tests to work without pillow WIP

commit 023fdab8322f28359a24c63e32635a3d0deccbe4
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Wed Jul 20 16:40:36 2022 +0200

    fixed typo

commit 33850ca83a175f74789ae6b9bebd057ed84b7fb3
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Wed Jul 20 16:38:37 2022 +0200

    fixed import from refactored open_img.py

commit dbc6d345f074e538948e2c4f94ebed8a5ef520bc
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Wed Jul 20 16:32:42 2022 +0200

    removed PIL from production code, now only in scripts
2022-07-21 13:25:00 +02:00

65 lines
1.9 KiB
Python

import argparse
import gzip
from operator import itemgetter
from typing import List
import fitz
import pdf2image
from funcy import lmap, compose, pluck
from pyinfra.default_objects import get_component_factory
from cv_analysis.config import CONFIG
from incl.pyinfra.test.utils.image import image_to_bytes
def parse_args():
    """Parse the command-line options of this script.

    Three options are defined, all of them required:

    * ``--pdf_path`` / ``-p``
    * ``--operation`` / ``-o`` (either ``figure_detection`` or ``table_parsing``)
    * ``--result_path`` / ``-r``

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--pdf_path", "-p", required=True)
    cli.add_argument(
        "--operation",
        "-o",
        choices=["figure_detection", "table_parsing"],
        required=True,
    )
    cli.add_argument("--result_path", "-r", required=True)
    return cli.parse_args()
def request_metadatas(dpi, n_metadata):
    """Build ``n_metadata`` independent metadata dicts carrying the given dpi.

    Args:
        dpi: Value stored under the ``"dpi"`` key of every dict.
        n_metadata: Number of dicts to create.

    Returns:
        A list of exactly ``n_metadata`` distinct ``{"dpi": dpi}`` dicts
        (empty when ``n_metadata`` is 0).
    """
    # range(n_metadata), not range(1, n_metadata): the latter produced one
    # entry too few, leaving the last data item without metadata.
    return [{"dpi": dpi} for _ in range(n_metadata)]
def draw_cells_on_page(cells: List[dict], page):
    """Draw one rectangle per cell onto ``page``.

    Each cell is read through its ``"x"``, ``"y"``, ``"width"`` and
    ``"height"`` entries and converted to an ``(x0, y0, x1, y1)`` tuple,
    which is passed to ``page.draw_rect``.

    Args:
        cells: Mappings providing the four keys named above.
        page: Object exposing ``draw_rect(rect, color=, width=, overlay=)``.
    """
    read_xywh = itemgetter("x", "y", "width", "height")
    for cell in cells:
        left, top, box_width, box_height = read_xywh(cell)
        # Origin plus size -> the two opposite corners of the rectangle.
        corners = (left, top, left + box_width, top + box_height)
        page.draw_rect(corners, color=(0.3, 0.7, 0.1), width=2, overlay=True)
def annotate_results_on_pdf(results, pdf_path, result_path):
    """Draw the cells of each result onto the matching PDF page and save a copy.

    Results are paired with pages positionally via ``zip``, so iteration
    stops at the shorter of the two. A result whose ``"metadata"`` entry is
    falsy leaves its page untouched.

    Args:
        results: Iterable of mappings, each with a ``"metadata"`` entry; a
            truthy metadata must contain a ``"cells"`` entry.
        pdf_path: Path of the PDF opened with ``fitz.open``.
        result_path: Path the annotated document is saved to.
    """
    # The context manager closes the document even when drawing or saving
    # raises; previously the handle was never closed.
    with fitz.open(pdf_path) as opened_pdf:
        metadata_per_page = pluck("metadata", results)
        for page, metadata in zip(opened_pdf, metadata_per_page):
            if metadata:
                draw_cells_on_page(metadata["cells"], page)
        opened_pdf.save(result_path)
def main(args):
    """Render a PDF to images, submit them to the service, and annotate the PDF.

    Steps:
        1. Convert ``args.pdf_path`` to page images at 200 dpi and turn each
           into gzip-compressed bytes.
        2. Build the submit endpoint from ``CONFIG.webserver`` and
           ``args.operation`` and obtain a pipeline for it.
        3. Run the pipeline on the images plus the metadata produced by
           ``request_metadatas(dpi, len(images))``.
        4. Write the annotated PDF to ``args.result_path``.

    Args:
        args: Namespace with ``pdf_path``, ``operation`` and ``result_path``.
    """
    dpi = 200

    rendered_pages = pdf2image.convert_from_path(args.pdf_path, dpi=dpi)
    images = [gzip.compress(image_to_bytes(rendered)) for rendered in rendered_pages]

    submit_endpoint = f"http://{CONFIG.webserver.host}:{CONFIG.webserver.port}/{args.operation}"
    factory = get_component_factory(CONFIG)
    pipeline = factory.get_pipeline(submit_endpoint)

    metadatas = request_metadatas(dpi, len(images))
    # Materialize the pipeline output before annotating.
    results = list(pipeline(data=images, metadata=metadatas))

    annotate_results_on_pdf(results, args.pdf_path, args.result_path)
if __name__ == "__main__":
    # Script entry point: parse the command line, then run the workflow.
    cli_args = parse_args()
    main(cli_args)