Pull request #27: Image service compat

Merge in RR/cv-analysis from image-service-compat to master. Squashed commit of the following: (1) commit 397d12a96a6b78de762f7b3a80a72427f5f51e97, Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>, Date: Tue Aug 16 16:14:40 2022 +0200 — update pdf2image, adjust response format for table-parsing & figure-detection; (2) commit f2061bda8d25d64de974e97f36148dea29af50d9, Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>, Date: Mon Aug 15 08:56:39 2022 +0200 — add script to save figure detection data that can be used for image-service pipeline script.
2022-08-16 17:04:05 +02:00 · 2022-08-16 17:04:05 +02:00 · 309ae0d57b
commit 309ae0d57b
parent 20267f2715
6 changed files with 36 additions and 98 deletions
--- a/cv_analysis/server/pipeline.py
+++ b/cv_analysis/server/pipeline.py
@@ -1,3 +1,4 @@
+from dataclasses import asdict
 from operator import truth

 from funcy import lmap, flatten
@@ -40,19 +41,16 @@ def make_analysis_pipeline(analysis_fn, formatter, dpi):
 def table_parsing_formatter(rects, page, dpi):
    def format_rect(rect: Rectangle):
        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
-        rect_plus.derotate()  # TODO: see if derotate is necessary
-        rect_plus.transform()
-        return rect_plus.asdict(reduced=True)
+        return rect_plus.asdict(derotate=True)

    bboxes = lmap(format_rect, rects)

-    return {**page.asdict(reduced=True), "tableCells": bboxes}
+    return {"pageInfo": page.asdict(), "tableCells": bboxes}


 def figure_detection_formatter(rects, page, dpi):
    def format_rect(rect: Rectangle):
        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
-        rect_plus.derotate()  # TODO: see if derotate is necessary
-        return ImageInfo(page.info, rect_plus.asbbox(), rect_plus.alpha).asdict(reduced=False)
+        return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))

    return lmap(format_rect, rects)
--- a/incl/pdf2image
+++ b/incl/pdf2image
@@ -1 +1 @@
-Subproject commit fee87964cb7da0ea0c19410ca418849744474302
+Subproject commit 4753c005ee926b09238977d524542f12ec4c4847
--- a/scripts/annotate_pdf.py
+++ b/scripts/annotate_pdf.py
@@ -1,52 +0,0 @@
-import argparse
-import json
-from operator import itemgetter
-from pathlib import Path
-
-import fitz
-from funcy import lmap
-
-from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline, get_analysis_pipeline
-from cv_analysis.utils.display import show_image_mpl
-from pdf2img.extraction import extract_images
-from pdf2img.get_rectangles import parse_to_image_rectangles
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("pdf_path")
-    parser.add_argument("output_folder")
-    parser.add_argument("--type", "-t", choices=["table", "layout", "figure"], required=True)
-    parser.add_argument("--verbose", action="store_true")
-    parser.add_argument("--silent", dest="verbose", action="store_false")
-    parser.set_defaults(verbose=False)
-    return parser.parse_args()
-
-
-def analyse_annotate_save(pdf, analysis_type, output_path, verbose):
-    pipe = get_analysis_pipeline(analysis_type)
-    results = list(pipe(pdf))
-
-    if verbose:
-        print(json.dumps(results, indent=2))
-
-    with fitz.open(stream=pdf) as pdf_handle:
-        for result in results:
-            page = pdf_handle[result["pageInfo"]["number"]]
-            bbox = result["boundingBoxScreen"]
-            x1, y1, x2, y2 = itemgetter("x0", "y0", "x1", "y1")(bbox)
-            rect = fitz.Rect(x1, y1, x2, y2)
-            page.draw_rect(rect, color=(0.5, 0.7, 0.2), width=2)
-        pdf_handle.save(output_path)
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    with open(args.pdf_path, "rb") as f:
-        pdf_bytes = f.read()
-
-    Path(args.output_folder).mkdir(parents=True, exist_ok=True)
-    output_path = f"{args.output_folder}/{Path(args.pdf_path).stem}_annotated_{args.type}.pdf"
-
-    analyse_annotate_save(pdf_bytes, args.type, output_path, args.verbose)
--- a/scripts/save_figure_detection_data.py
+++ b/scripts/save_figure_detection_data.py
@@ -0,0 +1,28 @@
+import argparse
+import json
+from pathlib import Path
+
+from cv_analysis.server.pipeline import get_analysis_pipeline
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("pdf")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    detect_figures = get_analysis_pipeline("figure")
+
+    with open(args.pdf, "rb") as f:
+        pdf_bytes = f.read()
+
+    results = list(detect_figures(pdf_bytes))
+
+    folder = Path(args.pdf).parent
+    file_stem = Path(args.pdf).stem
+
+    with open(f"{folder}/{file_stem}_figures.json", "w+") as f:
+        json.dump(results, f, indent=2)
--- a/scripts/show_compressed_json.py
+++ b/scripts/show_compressed_json.py
@@ -1,24 +0,0 @@
-import argparse
-import gzip
-import json
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("compressed_json_path", help="Path to compressed JSON file")
-    return parser.parse_args()
-
-
-def main(fp):
-    with open(fp, "rb") as f:
-        compressed_json_path = f.read()
-
-    json_str = gzip.decompress(compressed_json_path)
-    parsed = json.loads(json_str)
-
-    print(json.dumps(parsed, indent=2))
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args.compressed_json_path)
--- a/test/unit_tests/server_pipeline_test.py
+++ b/test/unit_tests/server_pipeline_test.py
@@ -24,26 +24,14 @@ def expected_formatted_analysis_result(operation):
    if operation == "table":
        return [
            {
-                "pageNumber": 0,
-                "pageRotation": 0,
-                "pageWidth": 595.0,
-                "pageHeight": 842.0,
-                "tableCells": [
-                    {"x0": 0.0, "y0": 826.8800048828125, "width": 15.119999885559082, "height": 15.1199951171875}
-                ],
+                "pageInfo": {"number": 0, "rotation": 0, "width": 595.0, "height": 842.0},
+                "tableCells": [{"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12}],
            }
        ]
    if operation == "figure":
        return [
            {
-                "pageInfo": {
-                    "pageNumber": 0,
-                    "pageRotation": 0,
-                    "pageWidth": 595.0,
-                    "pageHeight": 842.0,
-                    "deRotationMatrix": (1.0, -0.0, -0.0, 1.0, 0.0, 0.0),
-                    "transformationMatrix": (1.0, 0.0, 0.0, -1.0, -0.0, 842.0),
-                },
+                "pageInfo": {"number": 0, "rotation": 0, "width": 595.0, "height": 842.0},
                "boundingBox": {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12},
                "alpha": False,
            }