Pull request #26: adjust response for table parsing to be per page based

Merge in RR/cv-analysis from adjust-signature-to-per-page to master

Squashed commit of the following:

commit 1142a350d537453a7ac35d97bfa00bd2a64c4871
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Aug 12 11:09:04 2022 +0200

    adjust response for table parsing to be per page based
2022-08-12 11:46:24 +02:00 · 2022-08-12 11:46:24 +02:00 · 954c2f17a5
commit 954c2f17a5
parent ea25b57dd9
3 changed files with 80 additions and 79 deletions
--- a/cv_analysis/server/pipeline.py
+++ b/cv_analysis/server/pipeline.py
@@ -1,51 +1,58 @@
 from operator import truth
-from typing import Callable, Iterator

-from funcy import flatten
+from funcy import lmap

 from cv_analysis.figure_detection.figure_detection import detect_figures
-from cv_analysis.layout_parsing import parse_layout
 from cv_analysis.table_parsing import parse_tables
 from cv_analysis.utils.structures import Rectangle
 from pdf2img.conversion import convert_pages_to_images
-from pdf2img.default_objects.image import ImagePlus
+from pdf2img.default_objects.image import ImagePlus, ImageInfo
 from pdf2img.default_objects.rectangle import RectanglePlus


 def get_analysis_pipeline(operation):
-    if operation == "figure":
-        return make_analysis_pipeline(detect_figures, reduced=False)
-    return make_analysis_pipeline(get_analysis_fn(operation))
-
-
-def get_analysis_fn(operation):
    if operation == "table":
-        return parse_tables
-    elif operation == "layout":
-        return parse_layout
+        return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200)
    elif operation == "figure":
-        return detect_figures
+        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
    else:
        raise


-def make_analysis_pipeline(analysis_fn: Callable, dpi=200, reduced=True):
-    def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]:
+def make_analysis_pipeline(analysis_fn, formatter, dpi):
+    def analyse_pipeline(pdf: bytes, index=None):
+        def parse_page(page: ImagePlus):
+            image = page.asarray()
+            rects = analysis_fn(image)
+            if not rects:
+                return
+            infos = formatter(rects, page, dpi)
+            return infos
+
        pages = convert_pages_to_images(pdf, index=index, dpi=dpi)
-        yield from flatten(filter(truth, map(analyse, pages)))
+        results = map(parse_page, pages)

-    def analyse(page: ImagePlus):
-        def format_rect(rect: Rectangle):
-            rect_plus = RectanglePlus.from_pixel(rect.xyxy(), page_info, alpha=False, dpi=dpi)
-            image_info = rect_plus.to_image_info()
-            return image_info.asdict(reduced)
+        yield from filter(truth, results)

-        image, page_info = page.asarray(), page.info
-        rects = analysis_fn(image)
-        if not rects:
-            return
-        formatted_results = map(format_rect, rects)
+    return analyse_pipeline

-        yield from formatted_results

-    return analysis_pipeline
+def table_parsing_formatter(rects, page, dpi):
+    def format_rect(rect: Rectangle):
+        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
+        rect_plus.derotate()  # TODO: see if derotate is necessary
+        rect_plus.transform()
+        return rect_plus.asdict(reduced=True)
+
+    bboxes = lmap(format_rect, rects)
+
+    return {**page.asdict(reduced=True), "tableCells": bboxes}
+
+
+def figure_detection_formatter(rects, page, dpi):
+    def format_rect(rect: Rectangle):
+        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
+        rect_plus.derotate()  # TODO: see if derotate is necessary
+        return ImageInfo(page.info, rect_plus.asbbox(), rect_plus.alpha).asdict(reduced=False)
+
+    return lmap(format_rect, rects)
--- a/incl/pdf2image
+++ b/incl/pdf2image
@@ -1 +1 @@
-Subproject commit 20fe8ba2ba1b73d2049bdc00117b0e37f150f15a
+Subproject commit fee87964cb7da0ea0c19410ca418849744474302
--- a/test/unit_tests/server_pipeline_test.py
+++ b/test/unit_tests/server_pipeline_test.py
@@ -2,7 +2,7 @@ import fitz
 import numpy as np
 import pytest

-from cv_analysis.server.pipeline import make_analysis_pipeline
+from cv_analysis.server.pipeline import table_parsing_formatter, figure_detection_formatter, make_analysis_pipeline
 from cv_analysis.utils.structures import Rectangle


@@ -12,64 +12,58 @@ def analysis_fn_mock(image: np.ndarray):


@pytest.fixture
-def empty_pdf(n_pages):
+def empty_pdf():
    doc = fitz.open()
-    for n in range(n_pages):
+    for n in range(1):
        doc.new_page()
    return doc.write()


@pytest.fixture
-def expected_formatted_analysis_result(n_pages, reduced):
-    if reduced:
+def expected_formatted_analysis_result(operation):
+    if operation == "table":
        return [
            {
-                "pageInfo": {
-                    "number": page_number,
-                    "rotation": 0,
-                    "width": 595.0,
-                    "height": 842.0,
-                },
-                "boundingBox": {
-                    "x0": 0.0,
-                    "y0": 826.8800048828125,
-                    "x1": 15.119999885559082,
-                    "y1": 842.0,
-                    "width": 15.119999885559082,
-                    "height": 15.1199951171875,
-                },
-                "alpha": False,
+                "pageNumber": 0,
+                "pageRotation": 0,
+                "pageWidth": 595.0,
+                "pageHeight": 842.0,
+                "tableCells": [
+                    {"x0": 0.0, "y0": 826.8800048828125, "width": 15.119999885559082, "height": 15.1199951171875}
+                ],
            }
-            for page_number in range(n_pages)
        ]
-    return [
-        {
-            "pageInfo": {
-                "number": page_number,
-                "rotation": 0,
-                "width": 595.0,
-                "height": 842.0,
-                "deRotationMatrix": (1.0, -0.0, -0.0, 1.0, 0.0, 0.0),
-                "transformationMatrix": (1.0, 0.0, 0.0, -1.0, -0.0, 842.0),
-            },
-            "boundingBox": {
-                "x0": 0.0,
-                "y0": 826.8800048828125,
-                "x1": 15.119999885559082,
-                "y1": 842.0,
-                "width": 15.119999885559082,
-                "height": 15.1199951171875,
-            },
-            "boundingBoxScreen": {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12},
-            "alpha": False,
-        }
-        for page_number in range(n_pages)
-    ]
+    if operation == "figure":
+        return [
+            [
+                {
+                    "pageInfo": {
+                        "pageNumber": 0,
+                        "pageRotation": 0,
+                        "pageWidth": 595.0,
+                        "pageHeight": 842.0,
+                        "deRotationMatrix": (1.0, -0.0, -0.0, 1.0, 0.0, 0.0),
+                        "transformationMatrix": (1.0, 0.0, 0.0, -1.0, -0.0, 842.0),
+                    },
+                    "boundingBox": {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12},
+                    "alpha": False,
+                }
+            ]
+        ]


-@pytest.mark.parametrize("n_pages", [1, 2])
-@pytest.mark.parametrize("reduced", [True, False])
-def test_analysis_pipeline(empty_pdf, expected_formatted_analysis_result, reduced):
-    analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, reduced=reduced)
-    results = analysis_pipeline(empty_pdf)
+@pytest.fixture
+def formatter(operation):
+    if operation == "table":
+        return table_parsing_formatter
+    elif operation == "figure":
+        return figure_detection_formatter
+    else:
+        raise
+
+
+@pytest.mark.parametrize("operation", ["table", "figure"])
+def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
+    analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200)
+    results = list(analysis_pipeline(empty_pdf))
    assert list(results) == expected_formatted_analysis_result