Merge in RR/cv-analysis from add-pdf-coord-conversion to master
Squashed commit of the following:
commit f56b7b45feb78142b032ef0faae2ca8dd020e6c5
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jul 7 11:26:46 2022 +0200
update pyinfra
commit 9086ef0a2059688fb8dd5559cda831bbbd36362b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jul 7 11:21:53 2022 +0200
update input metadata keys
commit 55f147a5848e22ea62242ea883a0ce53ef1c04a5
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jul 7 09:16:16 2022 +0200
update to new input metadata signature
commit df4652fb027f734f2613e4adb7bc5b17edee62e9
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 6 16:55:36 2022 +0200
refactor
commit e52c674085a9c7411c55a2e0993aa34622284317
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 6 16:15:21 2022 +0200
update build script, refactor
commit 1f874aea591f25544aaa3f39a4e38fa50a24615e
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jul 5 17:01:15 2022 +0200
add rotation formatter
commit b78a69741287a4cd38a90ace98f67e8f1b803737
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jul 5 09:26:27 2022 +0200
refactor
commit b3155b8e072530f99114f3ee9135e73afc8f85cb
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 1 15:06:45 2022 +0200
made assertion robust to floating point precision
commit 4169102a6b5053500a3db2d789d265c2c77d56a4
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 1 15:06:01 2022 +0200
improve banner
commit dea74593d925c802489e5400297b48a9729038f0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 1 14:28:08 2022 +0200
introduce derotation logic for rectangles from rotated pdfs, introduce continuous option for coordinates in Rectangle class
commit d07e1dc2731ea7ae9887cc02bb98155bf1565a0d
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 1 10:39:38 2022 +0200
introduce table parsing formatter to convert pixel values to inches
commit 67ff6730dd7073a0fc9e9698904325dea9537c5b
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jul 1 08:06:42 2022 +0200
fixed duplicate logging
commit 6c025409415329028f697bb99986cd0912c7ed54
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jun 30 17:10:32 2022 +0200
add pyinfra mock script
70 lines
2.0 KiB
Python
70 lines
2.0 KiB
Python
import argparse
|
|
import base64
|
|
import gzip
|
|
import io
|
|
import json
|
|
from operator import itemgetter
|
|
from typing import List
|
|
|
|
import fitz
|
|
import pdf2image
|
|
from PIL import Image
|
|
from funcy import lmap, compose, pluck
|
|
from funcy import lpluck
|
|
|
|
from pyinfra.default_objects import get_component_factory
|
|
|
|
from cv_analysis.config import CONFIG
|
|
from incl.pyinfra.test.utils.image import image_to_bytes
|
|
|
|
|
|
def parse_args():
    """Parse the command-line options of this script.

    Options (all required):
        --pdf_path / -p: value stored as ``pdf_path``.
        --operation / -o: either "figure_detection" or "table_parsing".
        --result_path / -r: value stored as ``result_path``.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--pdf_path", "-p", required=True)
    cli.add_argument(
        "--operation",
        "-o",
        choices=["figure_detection", "table_parsing"],
        required=True,
    )
    cli.add_argument("--result_path", "-r", required=True)
    return cli.parse_args()
|
|
|
|
|
|
def request_metadatas(dpi, n_metadata):
    """Build the list of per-request metadata dicts.

    Args:
        dpi: Value stored under the "dpi" key of every entry.
        n_metadata: Number of metadata entries to create.

    Returns:
        A list of exactly ``n_metadata`` independent ``{"dpi": dpi}`` dicts
        (an empty list when ``n_metadata`` is 0).
    """
    # range(n_metadata) rather than range(1, n_metadata): the latter produced
    # one entry too few, although the caller passes the number of page images
    # and needs one metadata dict per image.
    return [{"dpi": dpi} for _ in range(n_metadata)]
|
|
|
|
|
|
def draw_cells_on_page(cells: List[dict], page):
    """Draw one rectangle outline on ``page`` for every cell.

    Args:
        cells: Dicts providing the keys "x", "y", "width" and "height".
        page: Object exposing ``draw_rect(rect, color=..., width=..., overlay=...)``.

    Each cell is converted from (x, y, width, height) to corner form
    (x0, y0, x1, y1) before being passed to ``page.draw_rect``.
    """
    outline_color = (0.3, 0.7, 0.1)

    for cell in cells:
        left = cell["x"]
        top = cell["y"]
        cell_width = cell["width"]
        cell_height = cell["height"]
        corners = (left, top, left + cell_width, top + cell_height)
        page.draw_rect(corners, color=outline_color, width=2, overlay=True)
|
|
|
|
|
|
def annotate_results_on_pdf(results, pdf_path, result_path):
    """Draw the cells found in ``results`` onto the PDF and save a copy.

    Args:
        results: Iterable of dicts, each with a "metadata" entry. A truthy
            metadata value must provide a "cells" list; falsy metadata leaves
            the corresponding page untouched.
        pdf_path: Path of the PDF to open.
        result_path: Path the annotated PDF is saved to.

    Pages and results are paired in order with ``zip``, so pairing stops at
    the shorter of the two sequences.
    """
    metadata_per_page = pluck("metadata", results)

    # The context manager closes the document even if drawing or saving
    # raises; previously the opened document was never closed.
    with fitz.open(pdf_path) as open_pdf:
        for page, metadata in zip(open_pdf, metadata_per_page):
            if metadata:
                draw_cells_on_page(metadata["cells"], page)
        open_pdf.save(result_path)
|
|
|
|
|
|
def main(args):
    """Send every page of a PDF through the analysis pipeline and annotate it.

    Args:
        args: Namespace with ``pdf_path``, ``operation`` and ``result_path``.

    The PDF is rendered to page images at a fixed 200 dpi, each image is
    serialised with ``image_to_bytes`` and gzip-compressed, and the batch is
    submitted to the webserver endpoint named after ``args.operation``. The
    returned results are then drawn onto the PDF and written to
    ``args.result_path``.
    """
    dpi = 200
    page_images = pdf2image.convert_from_path(args.pdf_path, dpi=dpi)
    images = [gzip.compress(image_to_bytes(page_image)) for page_image in page_images]

    submit_endpoint = f"http://{CONFIG.webserver.host}:{CONFIG.webserver.port}/{args.operation}"
    pipeline = get_component_factory(CONFIG).get_pipeline(submit_endpoint)
    metadata = request_metadatas(dpi, len(images))
    results = list(pipeline(data=images, metadata=metadata))

    annotate_results_on_pdf(results, args.pdf_path, args.result_path)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the command-line options, then run the workflow.
    cli_args = parse_args()
    main(cli_args)
|