Pull request #27: Image service compat

Merge in RR/cv-analysis from image-service-compat to master

Squashed commit of the following:

commit 397d12a96a6b78de762f7b3a80a72427f5f51e97
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Aug 16 16:14:40 2022 +0200

    update pdf2image, adjust response format for table-parsing & figure-detection

commit f2061bda8d25d64de974e97f36148dea29af50d9
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Mon Aug 15 08:56:39 2022 +0200

    add script that saves figure-detection data for use by the image-service pipeline script
This commit is contained in:
Julius Unverfehrt 2022-08-16 17:04:05 +02:00
parent 20267f2715
commit 309ae0d57b
6 changed files with 36 additions and 98 deletions

View File

@ -1,3 +1,4 @@
from dataclasses import asdict
from operator import truth
from funcy import lmap, flatten
@ -40,19 +41,16 @@ def make_analysis_pipeline(analysis_fn, formatter, dpi):
def table_parsing_formatter(rects, page, dpi):
def format_rect(rect: Rectangle):
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
rect_plus.derotate() # TODO: see if derotate is necessary
rect_plus.transform()
return rect_plus.asdict(reduced=True)
return rect_plus.asdict(derotate=True)
bboxes = lmap(format_rect, rects)
return {**page.asdict(reduced=True), "tableCells": bboxes}
return {"pageInfo": page.asdict(), "tableCells": bboxes}
def figure_detection_formatter(rects, page, dpi):
def format_rect(rect: Rectangle):
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
rect_plus.derotate() # TODO: see if derotate is necessary
return ImageInfo(page.info, rect_plus.asbbox(), rect_plus.alpha).asdict(reduced=False)
return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))
return lmap(format_rect, rects)

@ -1 +1 @@
Subproject commit fee87964cb7da0ea0c19410ca418849744474302
Subproject commit 4753c005ee926b09238977d524542f12ec4c4847

View File

@ -1,52 +0,0 @@
import argparse
import json
from operator import itemgetter
from pathlib import Path
import fitz
from funcy import lmap
from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline, get_analysis_pipeline
from cv_analysis.utils.display import show_image_mpl
from pdf2img.extraction import extract_images
from pdf2img.get_rectangles import parse_to_image_rectangles
def parse_args():
    """Parse the command-line arguments of the annotation script.

    Returns:
        argparse.Namespace with ``pdf_path``, ``output_folder``, ``type``
        (one of "table", "layout", "figure") and the boolean ``verbose``.
    """
    cli = argparse.ArgumentParser()

    # Positional arguments: the input PDF and the folder for the result.
    for positional in ("pdf_path", "output_folder"):
        cli.add_argument(positional)

    cli.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True)

    # --verbose and --silent write to the same destination; quiet is the default.
    cli.add_argument("--verbose", action="store_true")
    cli.add_argument("--silent", dest="verbose", action="store_false")
    cli.set_defaults(verbose=False)

    return cli.parse_args()
def analyse_annotate_save(pdf, analysis_type, output_path, verbose):
    """Run an analysis pipeline on a PDF and save a copy with the results drawn in.

    Args:
        pdf: PDF content; handed to the pipeline and to ``fitz.open(stream=...)``.
        analysis_type: Name passed to ``get_analysis_pipeline``.
        output_path: Destination of the annotated PDF.
        verbose: When truthy, the results are printed as indented JSON.
    """
    analyse = get_analysis_pipeline(analysis_type)
    results = list(analyse(pdf))

    if verbose:
        print(json.dumps(results, indent=2))

    corner_keys = ("x0", "y0", "x1", "y1")
    with fitz.open(stream=pdf) as document:
        for entry in results:
            target_page = document[entry["pageInfo"]["number"]]
            screen_box = entry["boundingBoxScreen"]
            outline = fitz.Rect(*(screen_box[key] for key in corner_keys))
            target_page.draw_rect(outline, color=(0.5, 0.7, 0.2), width=2)
        # Saved once, after every rectangle is drawn and while the document is still open.
        document.save(output_path)
if __name__ == "__main__":
    # Script entry point: read the PDF, make sure the output folder exists,
    # then analyse, annotate and save the result.
    args = parse_args()

    with open(args.pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    output_folder = Path(args.output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    output_path = f"{args.output_folder}/{Path(args.pdf_path).stem}_annotated_{args.type}.pdf"
    analyse_annotate_save(
        pdf=pdf_bytes,
        analysis_type=args.type,
        output_path=output_path,
        verbose=args.verbose,
    )

View File

@ -0,0 +1,28 @@
import argparse
import json
from pathlib import Path
from cv_analysis.server.pipeline import get_analysis_pipeline
def parse_args():
    """Parse the command line, which consists of a single positional ``pdf`` argument.

    Returns:
        argparse.Namespace whose ``pdf`` attribute holds the given value.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("pdf")
    arguments = cli.parse_args()
    return arguments
if __name__ == "__main__":
    # Script entry point: run figure detection on the given PDF and write the
    # results as "<stem>_figures.json" into the PDF's own folder.
    args = parse_args()
    pdf_location = Path(args.pdf)

    detect_figures = get_analysis_pipeline("figure")

    with open(args.pdf, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    results = list(detect_figures(pdf_bytes))

    folder = pdf_location.parent
    file_stem = pdf_location.stem
    with open(f"{folder}/{file_stem}_figures.json", "w+") as json_file:
        json.dump(results, json_file, indent=2)

View File

@ -1,24 +0,0 @@
import argparse
import gzip
import json
def parse_args():
    """Parse the command line, which takes the path of the compressed JSON file.

    Returns:
        argparse.Namespace with the attribute ``compressed_json_path``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("compressed_json_path", help="Path to compressed JSON file")
    arguments = cli.parse_args()
    return arguments
def main(fp):
    """Print the JSON stored in a gzip-compressed file, indented by two spaces.

    Args:
        fp: Path of the gzip-compressed JSON file.
    """
    with open(fp, "rb") as compressed_file:
        compressed_bytes = compressed_file.read()

    document = json.loads(gzip.decompress(compressed_bytes))
    pretty = json.dumps(document, indent=2)
    print(pretty)
if __name__ == "__main__":
    # Script entry point: pretty-print the file named on the command line.
    main(parse_args().compressed_json_path)

View File

@ -24,26 +24,14 @@ def expected_formatted_analysis_result(operation):
if operation == "table":
return [
{
"pageNumber": 0,
"pageRotation": 0,
"pageWidth": 595.0,
"pageHeight": 842.0,
"tableCells": [
{"x0": 0.0, "y0": 826.8800048828125, "width": 15.119999885559082, "height": 15.1199951171875}
],
"pageInfo": {"number": 0, "rotation": 0, "width": 595.0, "height": 842.0},
"tableCells": [{"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12}],
}
]
if operation == "figure":
return [
{
"pageInfo": {
"pageNumber": 0,
"pageRotation": 0,
"pageWidth": 595.0,
"pageHeight": 842.0,
"deRotationMatrix": (1.0, -0.0, -0.0, 1.0, 0.0, 0.0),
"transformationMatrix": (1.0, 0.0, 0.0, -1.0, -0.0, 842.0),
},
"pageInfo": {"number": 0, "rotation": 0, "width": 595.0, "height": 842.0},
"boundingBox": {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12},
"alpha": False,
}