Merge branch 'table_lines' into 'master'

feat: table line inference (experimental for deployment)

See merge request redactmanager/cv-analysis-service!10
This commit is contained in:
Julius Unverfehrt 2024-04-26 15:14:51 +02:00
commit f213a16cd0
18 changed files with 1469 additions and 828 deletions

View File

@ -5,3 +5,9 @@
port = 22
['remote "azure_remote"']
url = azure://cv-sa-dvc/
# SECURITY: a live storage AccountKey was committed here in plaintext — rotate the key and load it from an environment variable or secret store instead.
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=<REDACTED>;EndpointSuffix=core.windows.net"
['remote "local"']
url = ../dvc_local_remote

View File

@ -1,8 +1,59 @@
# cv-analysis — Visual (CV-Based) Document Parsing
parse_pdf()
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.
## API
Input message:
```json
{
"targetFilePath": {
"pdf": "absolute file path",
"vlp_output": "absolute file path"
},
"responseFilePath": "absolute file path",
"operation": "table_image_inference"
}
```
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
```json
{
...,
"data": [
{
      "pageNum": 0,
      "bbox": {
        "x1": 55.3407,
        "y1": 247.0246,
        "x2": 558.5602,
        "y2": 598.0585
      },
      "uuid": "2b10c1a2-393c-4fca-b9e3-0ad5b774ac84",
      "label": "table",
      "tableLines": [
        {
          "x1": 0,
          "y1": 16,
          "x2": 1399,
          "y2": 16
        },
        ...
      ],
      "imageInfo": {
        "height": 693,
        "width": 1414
      }
},
...
]
}
```
## Installation
```bash
@ -31,10 +82,9 @@ The below snippet shows how to find the outlines of previous redactions.
```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...

61
flake.lock generated Normal file
View File

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

33
flake.nix Normal file
View File

@ -0,0 +1,33 @@
{
description = "A flake to use a Python poetry project in an FHS environment when poetry2nix is uncooperative";
inputs = {
flake-utils.url = "github:numtide/flake-utils";
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
};
outputs = {
self,
nixpkgs,
flake-utils,
}:
flake-utils.lib.eachDefaultSystem (system: let
pkgs = nixpkgs.legacyPackages.${system};
fhsEnv =
(pkgs.buildFHSUserEnv rec {
name = "cv-analysis-service";
targetPkgs = pkgs: (with pkgs; [
poppler_utils
zlib
poetry
libuuid
# add the system package here that are needed for the Python package dependencies
libz # needed for 'numpy'
]);
profile = ''
export LD_LIBRARY_PATH="/lib:$LD_LIBRARY_PATH:${pkgs.lib.makeLibraryPath [pkgs.libuuid]}"
poetry install # add --no-root here if this is just a metapackage
source "$(poetry env info --path)"/bin/activate
'';
})
.env;
in {devShells.default = fhsEnv;});
}

1574
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -25,10 +25,11 @@ coverage = "^5.5"
dependency-check = "^0.6.0"
lorem-text = "^2.1"
PyMuPDF = "^1.19.6"
pyinfra = { version = "^2.1.0", source = "gitlab-research" }
pyinfra = { version = "^2.2.0", source = "gitlab-research" }
kn-utils = { version = "0.2.7", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2"
# NOTE(review): this duplicates the `PyMuPDF = "^1.19.6"` constraint above —
# Poetry normalizes package names, so keep a single pymupdf entry.
pymupdf = "^1.24.1"
[tool.poetry.group.test.dependencies]
pytest = "^7.0.1"

29
scripts/parse_pdf.py Normal file
View File

@ -0,0 +1,29 @@
"""CLI utility: run table-line inference on a PDF and write an annotated copy.

Usage: python scripts/parse_pdf.py <pdf> <vlp_output.json> [--output /tmp/output.pdf]
"""
import argparse
import json

from cv_analysis.server.pipeline import make_image_analysis_pipeline
from cv_analysis.table_inference import infer_lines
from cv_analysis.utils.annotate import annotate_pdf


def parse_args():
    """Parse the command-line arguments for the script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("pdf", type=str, help="Path to the PDF file")
    parser.add_argument("vlp_output", type=str, help="Path to the VLP output JSON file")
    parser.add_argument("--output", type=str, help="Path to the output PDF file", default="/tmp/output.pdf")
    return parser.parse_args()


def main():
    """Run the image-analysis pipeline and annotate the PDF with its results."""
    args = parse_args()
    pipe = make_image_analysis_pipeline(infer_lines)
    # Context managers close the handles (the original open() calls leaked them).
    with open(args.pdf, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    with open(args.vlp_output, "r") as vlp_file:
        vlp_output = json.load(vlp_file)
    results = list(pipe(data={"pdf": pdf_bytes, "vlp_output": vlp_output}))
    annotate_pdf(pdf_bytes, results, output_path=args.output)


if __name__ == "__main__":
    main()

View File

@ -1,6 +1,6 @@
import sys
from dataclasses import asdict
from operator import truth
from operator import itemgetter, truth
from typing import Generator, Callable
from funcy import flatten, lmap
from pdf2img.conversion import convert_pages_to_images
@ -8,7 +8,9 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
from pdf2img.default_objects.rectangle import RectanglePlus
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_inference import infer_lines
from cv_analysis.table_parsing import parse_lines, parse_tables
from cv_analysis.utils.image_extraction import extract_images_from_pdf, transform_table_lines_by_page_info
from cv_analysis.utils.structures import Rectangle
@ -28,22 +30,45 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
skip_pages_without_images=table_parsing_skip_pages_without_images,
)
if operation == "figure":
return make_analysis_pipeline(
detect_figures, figure_detection_formatter, dpi=200
return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
if operation == "table_image_inference": # TODO: fix pyinfra input
return make_image_analysis_pipeline(
infer_lines,
)
else:
raise
# else:
# raise
def make_analysis_pipeline(
analysis_fn, formatter, dpi, skip_pages_without_images=False
):
def make_image_analysis_pipeline(
analysis_fn,
) -> Callable[[dict], Generator[dict, bytes, None]]:
def analyse_pipeline(data: dict) -> Generator[dict, bytes, None]:
pdf_bytes = data["pdf"]
vlp_output = data["vlp_output"]
images, info, page_info = extract_images_from_pdf(pdf_bytes, vlp_output)
# rel_bboxes = map()
img_results = lmap(analysis_fn, images)
def make_offsets():
...
offsets = map(itemgetter("x1", "y2"), map(itemgetter("bbox"), info))
# print("before", img_results)
img_results = lmap(transform_table_lines_by_page_info, img_results, offsets, page_info)
# print("after", img_results)
results = map(lambda i: info[i] | img_results[i], range(len(info)))
yield from results
return analyse_pipeline
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
def analyse_pipeline(pdf: bytes, index=None):
def parse_page(page: ImagePlus):
image = page.asarray()
rects = analysis_fn(image)
if not rects:
return
return None
infos = formatter(rects, page, dpi)
return infos
@ -66,9 +91,7 @@ def table_parsing_formatter(lines: list[dict[str, float]], page: ImagePlus, dpi)
def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
def format_rect(rect: Rectangle):
rect_plus = RectanglePlus.from_pixels(
*rect.xyxy(), page.info, alpha=False, dpi=dpi
)
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
return rect_plus.asdict(derotate=True)
bboxes = lmap(format_rect, rects)
@ -78,11 +101,7 @@ def table_parsing_cells_formatter(rects, page: ImagePlus, dpi):
def figure_detection_formatter(rects, page, dpi):
def format_rect(rect: Rectangle):
rect_plus = RectanglePlus.from_pixels(
*rect.xyxy(), page.info, alpha=False, dpi=dpi
)
return asdict(
ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha)
)
rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))
return lmap(format_rect, rects)

View File

@ -0,0 +1,208 @@
from operator import itemgetter
from pathlib import Path
from typing import Callable, Optional, Tuple
import cv2
import matplotlib.pyplot as plt
import numpy as np
from kn_utils.logging import logger
from numpy import ndarray as Array
from scipy.stats import norm
def show_multiple(arrs: Tuple[Array], title: str = ""):
    """Plot several 1-D arrays in one interactive matplotlib window (debug aid).

    The clf/cla/close calls reset matplotlib's implicit global figure state so
    leftovers from a previous plot don't bleed into this one.
    """
    plt.clf()
    plt.cla()
    plt.close()
    for a in arrs:
        plt.plot(a)
    plt.title(title)
    plt.show()
def show(arr: Array, title: str = ""):
    """Plot a single 1-D array in an interactive matplotlib window (debug aid).

    Resets matplotlib's global figure state first (clf/cla/close) so the plot
    starts from a clean figure.
    """
    plt.clf()
    plt.cla()
    plt.close()
    plt.plot(arr)
    plt.title(title)
    plt.show()
def save_plot(arr: Array, name: str, title: str = "") -> None:
    """Plot a 1-D array and save the figure as '<name>.png' (debug aid)."""
    plt.clf()
    plt.cla()
    plt.close()
    plt.plot(arr)
    plt.title(title)
    plt.savefig(Path(str(name) + ".png"))
def save_lines(img: Array, lines: list[dict[str, int]], output_path: str = "/tmp/lines.png") -> None:
    """Draw line segments onto a grayscale image and write it to disk (debug aid).

    Args:
        img: 2-D grayscale image; converted to RGB so the lines can be green.
        lines: dicts with integer "x1"/"y1"/"x2"/"y2" endpoints in pixel coords.
        output_path: destination file (the hard-coded "/tmp/lines.png" is now
            the backward-compatible default instead of being baked in).
    """
    canvas = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    unpack = itemgetter("x1", "y1", "x2", "y2")
    for line in lines:
        x1, y1, x2, y2 = unpack(line)
        canvas = cv2.line(canvas, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=3)
    cv2.imwrite(output_path, canvas)
def make_gaussian_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalized 1-D Gaussian smoothing kernel.

    Even sizes are bumped to the next odd number so the kernel has a single
    well-defined center; the weights sum to 1.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    support = np.arange(-half, half + 1)
    weights = norm.pdf(support, scale=sd)
    return weights / np.sum(weights)
def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalized 1-D Gaussian kernel.

    NOTE(review): this body is currently identical to make_gaussian_kernel and
    never produces non-positive weights despite its name — confirm whether
    negation/offsetting was intended, or delete the duplicate.
    """
    kernel_size += int(not kernel_size % 2)
    wing_size = int((kernel_size - 1) / 2)
    xvals = np.arange(-wing_size, wing_size + 1)
    kernel = norm.pdf(xvals, scale=sd)
    kernel /= np.sum(kernel)
    return kernel
def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
    """Build a normalized symmetric kernel from an inverted parabola.

    Even sizes are bumped to the next odd number. The parabola -x**2 is shifted
    upward by span/(1 - ratio) (span = max - min of the raw values) before
    normalization, so `ratio` controls how peaked the kernel is.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    base = np.array([-float(x * x) for x in range(-half, half + 1)])
    span = np.max(base) - np.min(base)
    shifted = base + span / (1 - ratio)
    return shifted / np.sum(shifted)
def min_avg_for_interval(filtered: Array, interval: int) -> Tuple[float, int]:
    """Find the darkest strided subsequence of a filtered intensity profile.

    For each phase offset 0..interval-1, averages every `interval`-th value of
    `filtered` and returns the lowest average together with its offset.

    Args:
        filtered: 1-D smoothed intensity profile.
        interval: candidate line spacing in samples.

    Returns:
        (best_average, best_offset) — the old `-> float` annotation was wrong:
        this has always returned a 2-tuple.
    """
    n = len(filtered)
    avgs = [np.mean(filtered[range(start, n, interval)]) for start in range(interval)]
    best = min(avgs)
    return best, avgs.index(best)
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Scan candidate line spacings and return the best (interval, offset) pair.

    Evaluates every spacing in [min_interval, max_interval] with
    min_avg_for_interval and picks the one whose strided average is lowest.
    """
    candidates = []
    for interval in range(min_interval, max_interval + 1):
        avg, offset = min_avg_for_interval(filtered, interval)
        candidates.append((interval, avg, offset))
    winner = min(candidates, key=lambda item: item[1])
    return winner[0], winner[2]
def filter_array(
    array: Array,
    sum_filter: Optional[Array],
    padding: Optional[Array] = None,
    pad_value_function: Callable[[Array], float] = lambda x: 255.0,  # np.mean,
) -> Array:
    """Convolve a 1-D profile with a smoothing kernel, padding to keep its length.

    Args:
        array: 1-D intensity profile.
        sum_filter: odd-length kernel, or None to skip filtering (annotation
            fixed — the old signature claimed None was not allowed).
        padding: explicit pad values; derived from pad_value_function when None.
        pad_value_function: computes the constant pad value (defaults to white, 255).

    Returns:
        The filtered profile; same length as `array` when padding is derived.
    """
    if sum_filter is None:
        return array
    fsize = len(sum_filter)
    assert fsize % 2
    if padding is None:  # ensures that output size matches the input size
        pad = int((fsize - 1) / 2)
        padding = np.full(pad, pad_value_function(array))
    return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid")
# Empirically tuned smoothing-kernel parameters (widths in pixels, Gaussian SDs)
# for the row/column intensity profiles; columns use wider kernels than rows.
ROW_FILTER1_WIDTH = 30
ROW_FILTER1_SD = 6
ROW_FILTER2_WIDTH = 20
ROW_FILTER2_SD = 4
COL_FILTER1_WIDTH = 90
COL_FILTER1_SD = 15
COL_FILTER2_WIDTH = 70
COL_FILTER2_SD = 12
COL_FILTER3_WIDTH = 200
COL_FILTER3_SD = 20
# Smoothing cascade applied in get_lines_either; a None entry means "skip this
# pass" (filter_array returns its input unchanged for a None kernel).
FILTERS = {
    "row": {
        1: make_gaussian_kernel(ROW_FILTER1_WIDTH, ROW_FILTER1_SD),
        2: make_gaussian_kernel(ROW_FILTER2_WIDTH, ROW_FILTER2_SD),
        3: None,
    },
    "col": {
        1: make_gaussian_kernel(COL_FILTER1_WIDTH, COL_FILTER1_SD),
        2: make_gaussian_kernel(COL_FILTER2_WIDTH, COL_FILTER2_SD),
        3: make_gaussian_kernel(COL_FILTER3_WIDTH, COL_FILTER3_SD),
    },
}
def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
    """Drop column-line candidates that are not prominent enough.

    A candidate (a local maximum of the filtered column profile) is kept only
    if it rises above the neighboring local minimum by more than one standard
    deviation of the whole profile.

    Args:
        line_list: candidate column positions (local maxima indices).
        filt_sums: the filtered column intensity profile.

    Returns:
        The surviving candidate positions.
    """
    if not line_list:
        return []
    # Local minima of the profile: the "valleys" between candidate lines.
    centers = list(
        np.where(
            (filt_sums[1:-1] < filt_sums[:-2]) * (filt_sums[1:-1] < filt_sums[2:])
        )[0]
        + 1
    )
    if not centers:
        # No valleys to measure prominence against (monotonic profile); the old
        # code crashed on centers[0] here. Keep all candidates instead.
        return line_list
    if line_list[0] > centers[0]:
        centers = centers[1:] + [len(filt_sums) - 1]
    mindiff = np.std(filt_sums)
    line_list = [
        maxidx
        for maxidx, minidx in zip(line_list, centers)
        if (filt_sums[maxidx] - filt_sums[minidx]) > mindiff
    ]
    return line_list
def get_lines_either(table_array: Array, horizontal=True) -> list:
    """Detect candidate ruling lines along one axis of a binarized table image.

    Args:
        table_array: 2-D grayscale/binarized table crop (0 = ink, 255 = background).
        horizontal: True to detect row lines, False for column lines.

    Returns:
        Pixel indices of detected lines — a list of ints, not an array as the
        old `-> Array` annotation claimed. (Also removed the unused local
        `filters = FILTERS`.)
    """
    key = "row" if horizontal else "col"
    # Mean intensity profile perpendicular to the searched line direction.
    sums = np.mean(table_array, axis=int(horizontal))
    threshold = 0.3 * 255  # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
    # Positions darker than the threshold get boosted so they dominate nearby values.
    predicate = 1000.0 * (sums < threshold)
    sums = np.maximum(
        np.maximum(sums[1:-1], predicate[1:-1]),
        np.maximum(predicate[:-2], predicate[2:]),
    )
    filtered_sums = filter_array(sums, FILTERS[key][1])
    filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
    filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
    # Local maxima of the smoothed profile are the line candidates.
    lines = list(
        np.where(
            (filtered_sums[1:-1] > filtered_sums[:-2])
            * (filtered_sums[1:-1] > filtered_sums[2:])
        )[0]
        + 1
    )
    if not horizontal:
        lines = filter_fp_col_lines(lines, filtered_sums)
    return lines
def img_bytes_to_array(img_bytes: bytes) -> Array:
    """Decode an encoded image byte string into a grayscale numpy array."""
    buffer = np.frombuffer(img_bytes, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
    """Infer table ruling lines from a grayscale table-crop image.

    Binarizes the crop, detects row and column line positions, and returns
    them as full-width/full-height segments in image pixel coordinates.

    Returns:
        {"tableLines": [line dicts with x1/y1/x2/y2],
         "imageInfo": {"height": h, "width": w}}
    """
    import os  # local import: only needed for the debug switch below

    # Debug artifacts are opt-in now; the previous unconditional /tmp writes
    # ran on every request in production.
    debug = bool(os.environ.get("CV_ANALYSIS_DEBUG"))
    if debug:
        cv2.imwrite("/tmp/table.png", img)
    _, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
    if debug:
        cv2.imwrite("/tmp/table_bin.png", img)
    h, w = map(int, img.shape)
    row_vals = map(int, get_lines_either(img, horizontal=True))
    col_vals = map(int, get_lines_either(img, horizontal=False))
    lines = [{"x1": 0, "y1": r, "x2": w, "y2": r} for r in row_vals] + [
        {"x1": c, "y1": 0, "x2": c, "y2": h} for c in col_vals
    ]
    if debug:
        save_lines(img, lines)
    return {"tableLines": lines, "imageInfo": {"height": h, "width": w}}

View File

@ -189,6 +189,7 @@ def detect_endpoints(
points = points if points is not None else []
lines = list(map(lambda x: tuple(x[0]), points))
if not lines:
return lines
index = int(is_horizontal)

View File

@ -0,0 +1,67 @@
from functools import singledispatch
from operator import itemgetter
from pathlib import Path
from typing import Union
import fitz
from kn_utils.logging import logger
def annotate_pdf(
    pdf: Union[str, bytes, Path], annotations, output_path: Union[str, Path] = None
):
    """Draw box and table-line annotations onto a PDF and save the result.

    Args:
        pdf: the PDF as raw bytes or a filesystem path.
        annotations: per-page result dicts, each carrying a "pageNum" index.
        output_path: destination file; defaults to /tmp/annotated.pdf.
    """
    stream = provide_byte_stream(pdf)
    with fitz.open(stream=stream) as pdf_handle:
        for entry in annotations:
            annotate_page(pdf_handle[entry["pageNum"]], entry)
        output_path = output_path or "/tmp/annotated.pdf"
        pdf_handle.save(output_path)
        logger.info(f"Annotated PDF saved to {output_path}")
def annotate_page(page: fitz.Page, prediction):
    """Draw predicted boxes (with labels) and table lines onto one PDF page.

    Box coordinates are mirrored vertically because the prediction's y-axis
    runs opposite to the page's. Returns the same page for chaining.
    """
    page_height = page.bound().height
    for box in prediction.get("boxes", []):
        coords = itemgetter("x1", "y1", "x2", "y2")(box["box"])
        label, probability, uuid = itemgetter("label", "probability", "uuid")(box)
        x0, y0, x1, y1 = mirror_on_x_axis(coords, page_height)
        page.draw_rect(fitz.Rect(x0, y0, x1, y1), color=(0, 0, 1), width=2)
        # Place the label text just above the box's top-left corner.
        page.insert_text(
            (x0, y0 - 5),
            f"{label} ({probability:.2f}), {uuid}",
            fontsize=12,
            color=(0.4, 0.4, 1),
        )
    for line in prediction.get("tableLines", []):
        page.draw_line(
            itemgetter("x1", "y1")(line),
            itemgetter("x2", "y2")(line),
            color=(1, 0, 0.5),
            width=1,
        )
    return page
def mirror_on_x_axis(bbox, page_height):
    """Flip a bbox vertically within a page of the given height.

    Returns (x0, y0', x1, y1') where the y-values are re-measured from the
    opposite horizontal edge; x-values are unchanged.
    """
    x0, y0, x1, y1 = bbox
    return x0, page_height - y1, x1, page_height - y0
@singledispatch
def provide_byte_stream(pdf: Union[bytes, Path, str]) -> bytes:
    """Return the raw bytes of *pdf* (bytes pass through; paths are read).

    Raises:
        TypeError: for unsupported input types. The previous base case was a
            bare ``pass`` that silently returned None and failed much later
            with a confusing error downstream.
    """
    raise TypeError(f"Unsupported PDF source type: {type(pdf).__name__}")


@provide_byte_stream.register(bytes)
def _(pdf):
    # Already raw bytes; nothing to do.
    return pdf


@provide_byte_stream.register(str)
@provide_byte_stream.register(Path)
def _(pdf):
    # Treat the argument as a filesystem path and read the file contents.
    with open(pdf, "rb") as pdf_file:
        return pdf_file.read()

View File

@ -1,6 +1,11 @@
import os
import cv2
from matplotlib import pyplot as plt
# Opt-in terminal plotting backend for one developer's setup; use .get() so a
# missing USER environment variable (e.g. in containers/CI) no longer raises
# KeyError at import time.
if os.environ.get("USER") == "isaac":
    import matplotlib

    matplotlib.use('module://matplotlib-backend-wezterm')
def show_image_cv2(image, maxdim=700):
h, w, c = image.shape

View File

@ -0,0 +1,147 @@
from dataclasses import dataclass
from functools import partial
from operator import itemgetter
from typing import Iterable, Tuple
import fitz
import numpy as np
from funcy import compose, lfilter
from kn_utils.logging import logger
from numpy import ndarray as Array
@dataclass
class PageInfo:
    """Geometry and transform metadata for one rasterized PDF page."""

    # Zero-based page index within the source PDF.
    page_num: int
    # PyMuPDF matrix for the page's rotation (see page.rotation_matrix).
    rotation_matrix: fitz.Matrix
    # PyMuPDF matrix mapping between PDF space and pixmap space.
    transformation_matrix: fitz.Matrix
    # Resolution the page was rasterized at.
    dpi: int
    # Page size in PDF points (taken from page.rect by the caller).
    width: int | float
    height: int | float
    # Rendered pixmap size in pixels.
    image_width: int | float
    image_height: int | float
    # Page rotation in degrees as reported by PyMuPDF.
    rotation: int
def transform_image_coordinates_to_pdf_coordinates(
    bbox: Iterable[int | float],
    rotation_matrix: fitz.Matrix,
    transformation_matrix: fitz.Matrix,
    dpi: int | None = None,
) -> Tuple:
    """Map an image-space bbox into PDF coordinate space.

    Args:
        bbox: (x1, y1, x2, y2); pixel units when ``dpi`` is given, otherwise
            assumed to already be in PDF points.
        rotation_matrix: applied after the transformation matrix.
        transformation_matrix: pixmap-to-PDF transformation matrix.
        dpi: if set, pixels are first converted to points (72 points per inch).

    Returns:
        (x0, y0, x1, y1) in PDF points.
    """
    x1, y1, x2, y2 = (
        map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox
    )  # Convert to points, can be done before
    rect = fitz.Rect(x1, y1, x2, y2)
    rect = rect * rotation_matrix * transformation_matrix
    return rect.x0, rect.y0, rect.x1, rect.y1
def rescale_to_pdf(bbox: Iterable[int | float], page_info: PageInfo) -> Iterable[float]:
pdf_h, pdf_w = page_info.height, page_info.width
if page_info.rotation in {90, 270}:
pdf_h, pdf_w = pdf_w, pdf_h
pix_h, pix_w = page_info.image_height, page_info.image_width
ratio_h, ratio_w = pdf_h / pix_h, pdf_w / pix_w
round3 = lambda x: tuple(map(lambda y: round(y, 3), x))
ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h = round3(
(ratio_w, ratio_h, pdf_w, pdf_h, pix_w, pix_h)
)
new_bbox = round3(
(bbox[0] * ratio_w, bbox[1] * ratio_h, bbox[2] * ratio_w, bbox[3] * ratio_h)
)
return new_bbox
def transform_table_lines_by_page_info(
    bboxes: dict, offsets: tuple, page_info: PageInfo
) -> dict:
    """Rescale a result's "tableLines" from table-crop pixels to page coordinates.

    Args:
        bboxes: analysis result dict; its "tableLines" entry is rewritten in place.
        offsets: (x, y) offset of the table crop within the page. NOTE(review):
            the caller derives the y component from the crop's y2 — confirm
            that is intended.
        page_info: page geometry used for pixel-to-point scaling.

    Returns:
        The same dict with "tableLines" replaced by the transformed line dicts.

    Cleanup: removed the mid-function ``import json``, the per-line debug-log
    loop, and the commented-out lfilter call.
    """
    transform = partial(rescale_to_pdf, page_info=page_info)
    logger.debug(f"{offsets=}")

    def apply_offsets(line: tuple) -> tuple:
        x1, y1, x2, y2 = line
        offset_x, offset_y = offsets
        # Flip the y offset: the page origin sits at the opposite edge.
        offset_y = page_info.height - offset_y
        return (x1 + offset_x, y1 + offset_y, x2 + offset_x, y2 + offset_y)

    unpack = itemgetter("x1", "y1", "x2", "y2")

    def convert(line: dict) -> dict:
        # unpack -> rescale -> shift into page position -> repack as a dict.
        x1, y1, x2, y2 = apply_offsets(transform(unpack(line)))
        return {"x1": x1, "y1": y1, "x2": x2, "y2": y2}

    bboxes["tableLines"] = [convert(line) for line in bboxes.get("tableLines", [])]
    return bboxes
def extract_images_from_pdf(
    pdf_bytes: bytes, vlp_output: dict, dpi: int = 200
) -> tuple[list[Array], list[dict], list[PageInfo]]:
    """Crop every VLP-detected table out of the PDF as a grayscale image.

    Args:
        pdf_bytes: raw PDF contents.
        vlp_output: VLP detection result; either the full response dict (with
            a "data" key) or the list of per-page dicts directly.
        dpi: rasterization resolution for both the page and the table crops.

    Returns:
        (table_images, table_info, page_info) — parallel lists with one entry
        per detected table box.
    """
    with fitz.open(stream=pdf_bytes) as fh:
        table_images = []
        table_info = []
        page_info = []
        vlp_output = vlp_output["data"] if isinstance(vlp_output, dict) else vlp_output
        for page_dict in vlp_output:
            page_num = int(page_dict["page_idx"])
            boxes = page_dict["boxes"]
            # Only table detections are cropped; other labels are ignored here.
            boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
            page = fh[page_num]
            page.wrap_contents()
            # BUGFIX: use the requested dpi — this was hard-coded to 200 and
            # silently ignored the parameter used for the crops below.
            page_image = page.get_pixmap(dpi=dpi)
            current_page_info = PageInfo(
                page_num,
                page.rotation_matrix,
                page.transformation_matrix,
                dpi,
                *page.rect[-2:],
                page_image.w,
                page_image.h,
                page.rotation,
            )
            for box_obj in boxes:
                bbox = box_obj["box"]
                x1, y1, x2, y2 = itemgetter("x1", "y1", "x2", "y2")(bbox)
                rect = fitz.Rect((x1, y1), (x2, y2))
                # FIXME: Check if de-rotation works as intended and is necessary at all.
                # Note that there exists also a derotation_matrix. If changing this, also change the
                # current_page_info object to include the derotation_matrix.
                rect = rect * page.transformation_matrix * page.rotation_matrix
                pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                shape = (
                    (pixmap.h, pixmap.w, pixmap.n)
                    if pixmap.n > 1
                    else (pixmap.h, pixmap.w)
                )
                image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(*shape)
                table_images.append(image)
                table_info.append(
                    {
                        "pageNum": page_num,
                        "bbox": bbox,
                        "uuid": box_obj["uuid"],
                        "label": box_obj["label"],
                    }
                )
                page_info.append(current_page_info)
    return table_images, table_info, page_info

View File

@ -1,12 +1,11 @@
from numpy import array, ndarray
import pdf2image
from numpy import array, ndarray
from PIL import Image
from cv_analysis.utils.preprocessing import preprocess_page_array
def open_pdf(pdf, first_page=0, last_page=None):
first_page += 1
last_page = None if last_page is None else last_page + 1
@ -14,11 +13,17 @@ def open_pdf(pdf, first_page=0, last_page=None):
if pdf.lower().endswith((".png", ".jpg", ".jpeg")):
pages = [Image.open(pdf)]
elif pdf.lower().endswith(".pdf"):
pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_path(
pdf, first_page=first_page, last_page=last_page
)
else:
raise IOError("Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf")
raise IOError(
"Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
)
elif type(pdf) == bytes:
pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_bytes(
pdf, first_page=first_page, last_page=last_page
)
elif type(pdf) in {list, ndarray}:
return pdf

View File

@ -1,4 +1,5 @@
from sys import stdout
from typing import Union
from kn_utils.logging import logger
from pyinfra.examples import start_standard_queue_consumer
@ -17,7 +18,7 @@ logger.reconfigure(sink=stdout, level=settings.logging.level)
def make_dispatched_data_analysis(config):
skip_pages_without_images = config.table_parsing.skip_pages_without_images
def inner(data: bytes, message: dict) -> list:
def inner(data: Union[dict, bytes], message: dict) -> list:
operation = message["operation"]
analyse = get_analysis_pipeline(operation, skip_pages_without_images)
return list(analyse(data))

View File

@ -1,5 +1,5 @@
outs:
- md5: f74c866991f90b519dd334980ce0d495.dir
size: 2832497
nfiles: 21
- md5: d8630d20056547025abbabc895f6f62a.dir
size: 4715796
nfiles: 22
path: test_data

View File

@ -78,7 +78,7 @@ def formatter(operation):
raise
@pytest.mark.parametrize("operation", ["table_cells", "figure"])
@pytest.mark.parametrize("operation", ["figure"])
def test_analysis_pipeline(empty_pdf, formatter, expected_formatted_analysis_result):
analysis_pipeline = make_analysis_pipeline(
analysis_fn_mock, formatter, dpi=200, skip_pages_without_images=False

View File

@ -0,0 +1,24 @@
from cv_analysis.server.pipeline import make_image_analysis_pipeline
from cv_analysis.table_inference import infer_lines


def test_table_inference_smoke():
    """End-to-end smoke test: the pipeline runs on a real PDF with a mocked
    VLP table box and emits well-formed tableLines entries.

    The original version had every assertion commented out and guarded with
    ``if output:``, so it passed vacuously no matter what the pipeline did.
    """
    pl = make_image_analysis_pipeline(infer_lines)
    with open("test/test_data/article.pdf", "rb") as f:
        pdf_bytes = f.read()
    vlp_mock = {
        "data": [
            {
                "page_idx": 1,
                "boxes": [
                    {"uuid": "marius-marius-gib-mir-meine-legionen-wieder", "label": "table", "box": {"x1": 100, "y1": 100, "x2": 200, "y2": 200}}
                ],
            }
        ]
    }
    data = {"pdf": pdf_bytes, "vlp_output": vlp_mock}
    output = list(pl(data))
    assert output, "pipeline produced no results for the mocked table box"
    lines = output[0]["tableLines"]
    assert all(sorted(item.keys()) == ["x1", "x2", "y1", "y2"] for item in lines)