# Patch summary
#   extract_images_from_pdf() now returns a third, index-aligned list holding one PageInfo
#   per extracted table; the server pipeline passes it to transform_table_lines_by_page_info()
#   so that the "tableLines" of each analysis result are mapped to PDF coordinates.
#
# Review notes
#   * The patch arrived with all whitespace collapsed. Line breaks, indentation and the position
#     of blank context lines were reconstructed from Python syntax and the hunk counts; verify
#     them against the target tree before applying.
#   * Changes relative to the patch as received: PageInfo is frozen (one instance is shared by
#     every table of a page), transform_table_lines_by_page_info() returns a new dict instead of
#     writing into its argument, the named lambda became a nested function, the implicit-Optional
#     `dpi: int = None` became `int | None`, and docstrings were added. Hunk line counts were
#     recomputed accordingly; the `index` blob ids are the ones from the patch as received.
#   * NOTE(review): pipeline.py calls `lmap`, but no import of it is visible in these hunks -
#     confirm it is already in scope (the replaced code used `list(map(...))`).
#   * NOTE(review): the FIXME in transform_table_lines_by_page_info() is still open - the
#     transformed lines reportedly do not fall inside the table bbox.
diff --git a/src/cv_analysis/server/pipeline.py b/src/cv_analysis/server/pipeline.py
index 1642e8b..08a29f9 100644
--- a/src/cv_analysis/server/pipeline.py
+++ b/src/cv_analysis/server/pipeline.py
@@ -10,7 +10,7 @@ from pdf2img.default_objects.rectangle import RectanglePlus
 from cv_analysis.figure_detection.figure_detection import detect_figures
 from cv_analysis.table_inference import infer_lines
 from cv_analysis.table_parsing import parse_lines, parse_tables
-from cv_analysis.utils.image_extraction import extract_images_from_pdf
+from cv_analysis.utils.image_extraction import extract_images_from_pdf, transform_table_lines_by_page_info
 from cv_analysis.utils.structures import Rectangle
 
 
@@ -45,8 +45,9 @@ def make_image_analysis_pipeline(
     def analyse_pipeline(data: dict) -> Generator[dict, bytes, None]:
         pdf_bytes = data["pdf"]
         vlp_output = data["vlp_output"]
-        images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
-        img_results = list(map(analysis_fn, images))
+        images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output)
+        img_results = lmap(analysis_fn, images)
+        img_results = lmap(transform_table_lines_by_page_info, img_results, page_info)
         results = map(lambda i: info[i] | img_results[i], range(len(info)))
         yield from results
 
diff --git a/src/cv_analysis/utils/image_extraction.py b/src/cv_analysis/utils/image_extraction.py
index d5fefc4..4ff32e6 100644
--- a/src/cv_analysis/utils/image_extraction.py
+++ b/src/cv_analysis/utils/image_extraction.py
@@ -1,26 +1,83 @@
+from dataclasses import dataclass
+from functools import partial
 from operator import itemgetter
 from typing import Iterable
 from typing import Tuple
 
 import fitz
 import numpy as np
+from funcy import compose
 from numpy import ndarray as Array
 
 
+@dataclass(frozen=True)
+class PageInfo:
+    """Per-page data needed to map table-image coordinates back onto the PDF page.
+
+    One instance is shared by every table extracted from the same page, hence frozen.
+    """
+
+    page_num: int  # page index used to look the page up in the opened document
+    rotation_matrix: fitz.Matrix  # value of ``page.rotation_matrix`` at extraction time
+    transformation_matrix: fitz.Matrix  # value of ``page.transformation_matrix`` at extraction time
+    dpi: int  # resolution the table clip was rendered at
+
+
 def transform_image_coordinates_to_pdf_coordinates(
-    bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: int = None
+    bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: int | None = None
 ) -> Tuple:
+    """Map a bbox from image space to PDF page coordinates.
+
+    If ``dpi`` is truthy, the four values are first scaled by ``72 / dpi`` (pixels to points).
+    The rect is then multiplied by ``rotation_matrix`` and ``transformation_matrix``, in that order.
+    """
     x1, y1, x2, y2 = map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox  # Convert to points, can be done before
     rect = fitz.Rect(x1, y1, x2, y2)
-    rect = rect.transform(rotation_matrix).transform(transformation_matrix)
+    rect = rect * rotation_matrix * transformation_matrix
 
     return rect.x0, rect.y0, rect.x1, rect.y1
 
 
-def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200) -> tuple[list[Array], dict]:
+def transform_table_lines_by_page_info(bboxes: dict, page_info: PageInfo) -> dict:
+    """Return a copy of ``bboxes`` whose "tableLines" are mapped to PDF coordinates.
+
+    Every entry of ``bboxes["tableLines"]`` (a mapping with x1/y1/x2/y2) is passed through
+    ``transform_image_coordinates_to_pdf_coordinates`` using the matrices and dpi stored in
+    ``page_info``. A missing "tableLines" key yields an empty list. The top-level dict is not mutated.
+    """
+    # FIXME: Also convert image info? Is image info necessary?
+    # Also, the resulting lines are not in the table bbox, is this okay?
+    transform = partial(
+        transform_image_coordinates_to_pdf_coordinates,
+        rotation_matrix=page_info.rotation_matrix,
+        transformation_matrix=page_info.transformation_matrix,
+        dpi=page_info.dpi,
+    )
+    unpack = itemgetter("x1", "y1", "x2", "y2")
+
+    def pack(coords: Tuple) -> dict:
+        x1, y1, x2, y2 = coords
+        return {"x1": x1, "y1": y1, "x2": x2, "y2": y2}
+
+    convert = compose(pack, transform, unpack)
+    transformed_lines = [convert(line) for line in bboxes.get("tableLines", [])]
+
+    # Build a new dict instead of writing into the caller's analysis result.
+    return {**bboxes, "tableLines": transformed_lines}
+
+
+def extract_images_from_pdf(
+    pdf_bytes: bytes, vlp_output: dict, dpi: int = 200
+) -> tuple[list[Array], list[dict], list[PageInfo]]:
+    """Render every box labelled "table" in ``vlp_output`` to a grayscale image.
+
+    Returns three index-aligned lists: the images, the per-table info dicts and the
+    ``PageInfo`` of the page each table was taken from.
+    """
     with fitz.open(stream=pdf_bytes) as fh:
-        images = []
-        info = []
+        table_images = []
+        table_info = []
+        page_info = []
 
         vlp_output = vlp_output["data"] if isinstance(vlp_output, dict) else vlp_output
 
@@ -30,17 +87,25 @@ def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200)
             boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
 
             page = fh[page_num]  # pages[int(page_num)]
+            # TODO: Workaround to be able to transform the image coordinates to pdf coordinates in a later step.
+            current_page_info = PageInfo(page_num, page.rotation_matrix, page.transformation_matrix, dpi)
 
             for box_obj in boxes:
                 bbox = box_obj["box"]
                 x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(bbox)
                 rect = fitz.Rect((x1, y1), (x2, y2))
-                rect = rect * page.transformation_matrix
+                # FIXME: Check if de-rotation works as intended and is necessary at all.
+                # Note that there exists also a derotation_matrix. If changing this, also change the
+                # current_page_info object to include the derotation_matrix.
+                rect = rect * page.transformation_matrix * page.rotation_matrix
                 pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                 shape = (pixmap.h, pixmap.w, pixmap.n) if pixmap.n > 1 else (pixmap.h, pixmap.w)
                 image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
 
-                images.append(image)
-                info.append({"pageNum": page_num, "bbox": bbox, "uuid": box_obj["uuid"], "label": box_obj["label"]})
+                table_images.append(image)
+                table_info.append(
+                    {"pageNum": page_num, "bbox": bbox, "uuid": box_obj["uuid"], "label": box_obj["label"]}
+                )
+                page_info.append(current_page_info)
 
-    return images, info
+    return table_images, table_info, page_info