From 954c2f17a51cfc6f30e58270cd747da75bee0c9d Mon Sep 17 00:00:00 2001 From: Julius Unverfehrt Date: Fri, 12 Aug 2022 11:46:24 +0200 Subject: [PATCH] Pull request #26: adjust response for table parsing to be per page based Merge in RR/cv-analysis from adjust-signature-to-per-page to master Squashed commit of the following: commit 1142a350d537453a7ac35d97bfa00bd2a64c4871 Author: Julius Unverfehrt Date: Fri Aug 12 11:09:04 2022 +0200 adjust response for table parsing to be per page based --- cv_analysis/server/pipeline.py | 65 +++++++++-------- incl/pdf2image | 2 +- test/unit_tests/server_pipeline_test.py | 92 ++++++++++++------------- 3 files changed, 80 insertions(+), 79 deletions(-) diff --git a/cv_analysis/server/pipeline.py b/cv_analysis/server/pipeline.py index f029195..8409a35 100644 --- a/cv_analysis/server/pipeline.py +++ b/cv_analysis/server/pipeline.py @@ -1,51 +1,58 @@ from operator import truth -from typing import Callable, Iterator -from funcy import flatten +from funcy import lmap from cv_analysis.figure_detection.figure_detection import detect_figures -from cv_analysis.layout_parsing import parse_layout from cv_analysis.table_parsing import parse_tables from cv_analysis.utils.structures import Rectangle from pdf2img.conversion import convert_pages_to_images -from pdf2img.default_objects.image import ImagePlus +from pdf2img.default_objects.image import ImagePlus, ImageInfo from pdf2img.default_objects.rectangle import RectanglePlus def get_analysis_pipeline(operation): - if operation == "figure": - return make_analysis_pipeline(detect_figures, reduced=False) - return make_analysis_pipeline(get_analysis_fn(operation)) - - -def get_analysis_fn(operation): if operation == "table": - return parse_tables - elif operation == "layout": - return parse_layout + return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200) elif operation == "figure": - return detect_figures + return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200) else: raise -def make_analysis_pipeline(analysis_fn: Callable, dpi=200, reduced=True): - def analysis_pipeline(pdf: bytes, index=None) -> Iterator[dict]: +def make_analysis_pipeline(analysis_fn, formatter, dpi): + def analyse_pipeline(pdf: bytes, index=None): + def parse_page(page: ImagePlus): + image = page.asarray() + rects = analysis_fn(image) + if not rects: + return + infos = formatter(rects, page, dpi) + return infos + pages = convert_pages_to_images(pdf, index=index, dpi=dpi) - yield from flatten(filter(truth, map(analyse, pages))) + results = map(parse_page, pages) - def analyse(page: ImagePlus): - def format_rect(rect: Rectangle): - rect_plus = RectanglePlus.from_pixel(rect.xyxy(), page_info, alpha=False, dpi=dpi) - image_info = rect_plus.to_image_info() - return image_info.asdict(reduced) + yield from filter(truth, results) - image, page_info = page.asarray(), page.info - rects = analysis_fn(image) - if not rects: - return - formatted_results = map(format_rect, rects) + return analyse_pipeline - yield from formatted_results - return analysis_pipeline +def table_parsing_formatter(rects, page, dpi): + def format_rect(rect: Rectangle): + rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi) + rect_plus.derotate() # TODO: see if derotate is necessary + rect_plus.transform() + return rect_plus.asdict(reduced=True) + + bboxes = lmap(format_rect, rects) + + return {**page.asdict(reduced=True), "tableCells": bboxes} + + +def figure_detection_formatter(rects, page, dpi): + def format_rect(rect: Rectangle): + rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi) + rect_plus.derotate() # TODO: see if derotate is necessary + return ImageInfo(page.info, rect_plus.asbbox(), rect_plus.alpha).asdict(reduced=False) + + return lmap(format_rect, rects) diff --git a/incl/pdf2image b/incl/pdf2image index 20fe8ba..fee8796 160000 --- a/incl/pdf2image +++ b/incl/pdf2image @@ -1 +1 @@ -Subproject commit 20fe8ba2ba1b73d2049bdc00117b0e37f150f15a +Subproject commit fee87964cb7da0ea0c19410ca418849744474302 diff --git a/test/unit_tests/server_pipeline_test.py b/test/unit_tests/server_pipeline_test.py index 3da67dc..c51b761 100644 --- a/test/unit_tests/server_pipeline_test.py +++ b/test/unit_tests/server_pipeline_test.py @@ -2,7 +2,7 @@ import fitz import numpy as np import pytest -from cv_analysis.server.pipeline import make_analysis_pipeline +from cv_analysis.server.pipeline import table_parsing_formatter, figure_detection_formatter, make_analysis_pipeline from cv_analysis.utils.structures import Rectangle @@ -12,64 +12,58 @@ def analysis_fn_mock(image: np.ndarray): @pytest.fixture -def empty_pdf(n_pages): +def empty_pdf(): doc = fitz.open() - for n in range(n_pages): + for n in range(1): doc.new_page() return doc.write() @pytest.fixture -def expected_formatted_analysis_result(n_pages, reduced): - if reduced: +def expected_formatted_analysis_result(operation): + if operation == "table": return [ { - "pageInfo": { - "number": page_number, - "rotation": 0, - "width": 595.0, - "height": 842.0, - }, - "boundingBox": { - "x0": 0.0, - "y0": 826.8800048828125, - "x1": 15.119999885559082, - "y1": 842.0, - "width": 15.119999885559082, - "height": 15.1199951171875, - }, - "alpha": False, + "pageNumber": 0, + "pageRotation": 0, + "pageWidth": 595.0, + "pageHeight": 842.0, + "tableCells": [ + {"x0": 0.0, "y0": 826.8800048828125, "width": 15.119999885559082, "height": 15.1199951171875} + ], } - for page_number in range(n_pages) ] - return [ - { - "pageInfo": { - "number": page_number, - "rotation": 0, - "width": 595.0, - "height": 842.0, - "deRotationMatrix": (1.0, -0.0, -0.0, 1.0, 0.0, 0.0), - "transformationMatrix": (1.0, 0.0, 0.0, -1.0, -0.0, 842.0), - }, - "boundingBox": { - "x0": 0.0, - "y0": 826.8800048828125, - "x1": 15.119999885559082, - "y1": 842.0, - "width": 15.119999885559082, - "height": 15.1199951171875, - }, - "boundingBoxScreen": {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12}, - "alpha": False, - } - for page_number in range(n_pages) - ] + if operation == "figure": + return [ + [ + { + "pageInfo": { + "pageNumber": 0, + "pageRotation": 0, + "pageWidth": 595.0, + "pageHeight": 842.0, + "deRotationMatrix": (1.0, -0.0, -0.0, 1.0, 0.0, 0.0), + "transformationMatrix": (1.0, 0.0, 0.0, -1.0, -0.0, 842.0), + }, + "boundingBox": {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12}, + "alpha": False, + } + ] + ] -@pytest.mark.parametrize("n_pages", [1, 2]) -@pytest.mark.parametrize("reduced", [True, False]) -def test_analysis_pipeline(empty_pdf, expected_formatted_analysis_result, reduced): - analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, reduced=reduced) - results = analysis_pipeline(empty_pdf) +@pytest.fixture +def formatter(operation): + if operation == "table": + return table_parsing_formatter + elif operation == "figure": + return figure_detection_formatter + else: + raise + + +@pytest.mark.parametrize("operation", ["table", "figure"]) +def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result): + analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200) + results = list(analysis_pipeline(empty_pdf)) assert list(results) == expected_formatted_analysis_result