Pull request #26: adjust response for table parsing to be per page based

Merge in RR/cv-analysis from adjust-signature-to-per-page to master

Squashed commit of the following:

commit 1142a350d537453a7ac35d97bfa00bd2a64c4871
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Aug 12 11:09:04 2022 +0200

    adjust response for table parsing to be per page based
This commit is contained in:
Julius Unverfehrt 2022-08-12 11:46:24 +02:00
parent ea25b57dd9
commit 954c2f17a5
3 changed files with 80 additions and 79 deletions

View File

@ -1,51 +1,58 @@
from operator import truth
from typing import Callable, Iterator
from funcy import flatten
from funcy import lmap
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.structures import Rectangle
from pdf2img.conversion import convert_pages_to_images
from pdf2img.default_objects.image import ImagePlus
from pdf2img.default_objects.image import ImagePlus, ImageInfo
from pdf2img.default_objects.rectangle import RectanglePlus
def get_analysis_pipeline(operation):
    """Return the per-page analysis pipeline for the requested operation.

    Args:
        operation: Name of the analysis to run. Supported values are
            ``"table"`` and ``"figure"``.

    Returns:
        The callable built by ``make_analysis_pipeline`` from the matching
        analysis function and result formatter, configured with ``dpi=200``.

    Raises:
        ValueError: If ``operation`` is not one of the supported names.
    """
    if operation == "table":
        return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200)
    if operation == "figure":
        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
    # A bare ``raise`` outside an ``except`` block has nothing to re-raise: it
    # either fails with an unrelated RuntimeError or re-raises whatever
    # exception a caller happens to be handling. Raise an explicit error.
    raise ValueError(f"Unsupported operation: {operation!r}")
def make_analysis_pipeline(analysis_fn, formatter, dpi):
    """Build a generator function that analyses a PDF page by page.

    Args:
        analysis_fn: Callable invoked with ``page.asarray()`` for every page;
            its return value is treated as the detected rectangles.
        formatter: Callable invoked as ``formatter(rects, page, dpi)`` whenever
            ``analysis_fn`` returned something truthy.
        dpi: Passed to ``convert_pages_to_images`` and forwarded to
            ``formatter``.

    Returns:
        A generator function ``analyse_pipeline(pdf, index=None)`` that yields
        one formatted result per page, skipping pages with a falsy result.
    """

    def analyse_pipeline(pdf: bytes, index=None):
        def parse_page(page: ImagePlus):
            rects = analysis_fn(page.asarray())
            # Pages without detections produce None and are dropped below.
            return formatter(rects, page, dpi) if rects else None

        for page in convert_pages_to_images(pdf, index=index, dpi=dpi):
            page_result = parse_page(page)
            if page_result:
                yield page_result

    return analyse_pipeline
def table_parsing_formatter(rects, page, dpi):
    """Format the table cells detected on one page as a single dict.

    Args:
        rects: Iterable of ``Rectangle`` objects given in pixel coordinates.
        page: Page object providing ``info`` and ``asdict(reduced=True)``.
        dpi: Resolution forwarded to ``RectanglePlus.from_pixels``.

    Returns:
        The reduced page dict extended with a ``"tableCells"`` list holding
        one reduced rectangle dict per entry of ``rects``.
    """

    def to_cell_dict(rect: Rectangle):
        cell = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        cell.derotate()  # TODO: see if derotate is necessary
        cell.transform()
        return cell.asdict(reduced=True)

    table_cells = [to_cell_dict(rect) for rect in rects]
    page_fields = page.asdict(reduced=True)
    return {**page_fields, "tableCells": table_cells}
def figure_detection_formatter(rects, page, dpi):
    """Format the figures detected on one page as a list of image-info dicts.

    Args:
        rects: Iterable of ``Rectangle`` objects given in pixel coordinates.
        page: Page object providing ``info``.
        dpi: Resolution forwarded to ``RectanglePlus.from_pixels``.

    Returns:
        A list with one non-reduced ``ImageInfo`` dict per entry of ``rects``.
    """

    def to_image_info_dict(rect: Rectangle):
        figure_box = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        figure_box.derotate()  # TODO: see if derotate is necessary
        image_info = ImageInfo(page.info, figure_box.asbbox(), figure_box.alpha)
        return image_info.asdict(reduced=False)

    return [to_image_info_dict(rect) for rect in rects]

@ -1 +1 @@
Subproject commit 20fe8ba2ba1b73d2049bdc00117b0e37f150f15a
Subproject commit fee87964cb7da0ea0c19410ca418849744474302

View File

@ -2,7 +2,7 @@ import fitz
import numpy as np
import pytest
from cv_analysis.server.pipeline import make_analysis_pipeline
from cv_analysis.server.pipeline import table_parsing_formatter, figure_detection_formatter, make_analysis_pipeline
from cv_analysis.utils.structures import Rectangle
@ -12,64 +12,58 @@ def analysis_fn_mock(image: np.ndarray):
@pytest.fixture
def empty_pdf():
    """Return ``doc.write()`` of a new document holding a single new page.

    The result is what the pipeline under test receives as its ``pdf``
    argument (presumably the serialized PDF bytes; confirm against PyMuPDF).
    """
    doc = fitz.open()
    try:
        # Exactly one page is needed, so no loop is required.
        doc.new_page()
        return doc.write()
    finally:
        # Release the document even if page creation or writing fails.
        doc.close()
@pytest.fixture
def expected_formatted_analysis_result(operation):
    """Return the expected pipeline output for a one-page PDF.

    Args:
        operation: ``"table"`` or ``"figure"``; selects which formatter's
            output shape is expected.

    Returns:
        For ``"table"``: a list with one per-page dict carrying the page
        fields and a ``"tableCells"`` list. For ``"figure"``: a list with one
        per-page list of image-info dicts. ``None`` for any other value.
    """
    if operation == "table":
        table_cell = {"x0": 0.0, "y0": 826.8800048828125, "width": 15.119999885559082, "height": 15.1199951171875}
        page_result = {
            "pageNumber": 0,
            "pageRotation": 0,
            "pageWidth": 595.0,
            "pageHeight": 842.0,
            "tableCells": [table_cell],
        }
        return [page_result]

    if operation == "figure":
        page_info = {
            "pageNumber": 0,
            "pageRotation": 0,
            "pageWidth": 595.0,
            "pageHeight": 842.0,
            "deRotationMatrix": (1.0, -0.0, -0.0, 1.0, 0.0, 0.0),
            "transformationMatrix": (1.0, 0.0, 0.0, -1.0, -0.0, 842.0),
        }
        bounding_box = {"x0": 0.0, "y0": 0.0, "x1": 15.12, "y1": 15.12, "width": 15.12, "height": 15.12}
        figure = {
            "pageInfo": page_info,
            "boundingBox": bounding_box,
            "alpha": False,
        }
        # One inner list per page, one dict per detected figure.
        return [[figure]]

    return None
@pytest.mark.parametrize("n_pages", [1, 2])
@pytest.mark.parametrize("reduced", [True, False])
def test_analysis_pipeline(empty_pdf, expected_formatted_analysis_result, reduced):
analysis_pipeline = make_analysis_pipeline(analysis_fn_mock, reduced=reduced)
results = analysis_pipeline(empty_pdf)
@pytest.fixture
def formatter(operation):
    """Return the result formatter that belongs to the given operation.

    Args:
        operation: ``"table"`` or ``"figure"``.

    Returns:
        ``table_parsing_formatter`` or ``figure_detection_formatter``.

    Raises:
        ValueError: If ``operation`` is neither ``"table"`` nor ``"figure"``.
    """
    if operation == "table":
        return table_parsing_formatter
    if operation == "figure":
        return figure_detection_formatter
    # A bare ``raise`` has no active exception here and would only surface as
    # an unrelated RuntimeError; report the bad parameter explicitly instead.
    raise ValueError(f"Unsupported operation: {operation!r}")
@pytest.mark.parametrize("operation", ["table", "figure"])
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
    """Check that the pipeline yields the expected per-page results.

    The pipeline is built from the mocked analysis function and the formatter
    selected by the ``operation`` parameter, then run on the one-page PDF.
    """
    pipeline = make_analysis_pipeline(analysis_fn_mock, formatter, dpi=200)
    # The pipeline is a generator; materialize it once for the comparison.
    assert list(pipeline(empty_pdf)) == expected_formatted_analysis_result