feat: adapt pipeline for new table inference + pyinfra

This commit is contained in:
iriley 2024-04-22 10:08:24 +02:00
parent ddd680bb4c
commit e264c948cf
9 changed files with 1122 additions and 803 deletions

View File

@ -5,3 +5,9 @@
port = 22
['remote "azure_remote"']
url = azure://cv-sa-dvc/
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=KOuTAQ6Mp00ePTT5ObYmgaHlxwS1qukY4QU4Kuk7gy/vldneA+ZiKjaOpEFtqKA6Mtym2gQz8THy+ASts/Y1Bw==;EndpointSuffix=core.windows.net"
['remote "local"']
url = ../dvc_local_remote

61
flake.lock generated Normal file
View File

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

33
flake.nix Normal file
View File

@ -0,0 +1,33 @@
{
  # Dev-shell flake: wraps the poetry project in an FHS user environment so
  # native wheels can find their system libraries when poetry2nix fails.
  description = "An flake to use a Python poetry project in an FHS environment when poetry2nix is uncooperative";
  inputs = {
    flake-utils.url = "github:numtide/flake-utils";
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
  };
  outputs = {
    self,
    nixpkgs,
    flake-utils,
  }:
    flake-utils.lib.eachDefaultSystem (system: let
      pkgs = nixpkgs.legacyPackages.${system};
      # FHS environment; entering it runs `poetry install` and activates the
      # resulting virtualenv (see `profile` below).
      fhsEnv =
        (pkgs.buildFHSUserEnv rec {
          name = "cv-analysis-service";
          targetPkgs = pkgs: (with pkgs; [
            poppler_utils
            zlib
            poetry
            libuuid
            # Add any system packages needed by the Python dependencies here.
            libz # needed for 'numpy'
          ]);
          profile = ''
            export LD_LIBRARY_PATH="/lib:$LD_LIBRARY_PATH:${pkgs.lib.makeLibraryPath [pkgs.libuuid]}"
            poetry install # add --no-root here if this is just a metapackage
            source "$(poetry env info --path)"/bin/activate
          '';
        })
        .env;
    in {devShells.default = fhsEnv;});
}

1574
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -29,6 +29,7 @@ pyinfra = { version = "^2.1.0", source = "gitlab-research" }
kn-utils = { version = "0.2.7", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2"
pymupdf = "^1.24.1"
[tool.poetry.group.test.dependencies]
pytest = "^7.0.1"

View File

@ -1,6 +1,7 @@
import sys
from dataclasses import asdict
from operator import truth
from typing import Generator
from funcy import flatten, lmap
from pdf2img.conversion import convert_pages_to_images
@ -8,6 +9,7 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
from pdf2img.default_objects.rectangle import RectanglePlus
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_inference import extract_images_from_pdf, infer_lines
from cv_analysis.table_parsing import parse_lines, parse_tables
from cv_analysis.utils.structures import Rectangle
@ -31,8 +33,27 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
return make_analysis_pipeline(
detect_figures, figure_detection_formatter, dpi=200
)
else:
raise
if (
operation == "table_image_inference"
): # TODO: fix pyinfra input
return make_image_analysis_pipeline(
infer_lines,
)
# else:
# raise
def make_image_analysis_pipeline(
    analysis_fn,
) -> Generator[dict, bytes, None]:
    """Build a pipeline applying ``analysis_fn`` to each extracted table image.

    Args:
        analysis_fn: callable taking a greyscale image array and returning
            the per-image analysis result (e.g. ``infer_lines``).

    Returns:
        A generator function ``analyse_pipeline(pdf_bytes, vlp_output)``
        that yields, for each table image, the image's metadata dict merged
        with a "tableLines" entry.  NOTE(review): the ``-> Generator``
        annotation describes the inner generator, not this factory —
        confirm and adjust the annotation with the callers.
    """

    def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict):
        images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
        # FIX: map() returns a lazy iterator which is not subscriptable —
        # the previous ``img_results[i]`` raised TypeError.  Materialize the
        # results and pair them with their metadata via zip instead.
        img_results = list(map(analysis_fn, images))
        for meta, result in zip(info, img_results):
            yield meta | {"tableLines": result}

    return analyse_pipeline
def make_analysis_pipeline(
@ -43,7 +64,7 @@ def make_analysis_pipeline(
image = page.asarray()
rects = analysis_fn(image)
if not rects:
return
return None
infos = formatter(rects, page, dpi)
return infos

View File

@ -0,0 +1,161 @@
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple
from typing import Tuple
import cv2
import matplotlib.pyplot as plt
import numpy as np
from numpy import ndarray as Array
from scipy.signal import argrelextrema
from scipy.stats import norm
import fitz
from pdf2img.conversion import convert_pages_to_images
def show_multiple(arrs: Tuple[Array], title: str = ""):
    """Display several 1-D arrays as overlaid line plots in one figure."""
    # Wipe matplotlib's implicit global figure/axes state before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    for series in arrs:
        plt.plot(series)
    plt.title(title)
    plt.show()
def show(arr: Array, title: str = ""):
    """Display a single 1-D array as a line plot."""
    # Wipe matplotlib's implicit global figure/axes state before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    plt.show()
def save_plot(arr: Array, name: str, title: str = "") -> None:
    """Plot *arr* and write the figure to ``<name>.png``."""
    # Wipe matplotlib's implicit global figure/axes state before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    plt.savefig(Path(f"{name}.png"))
def make_gaussian_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalised 1-D Gaussian smoothing kernel.

    An even ``kernel_size`` is bumped to the next odd number so the
    kernel always has a well-defined centre tap.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    taps = norm.pdf(np.arange(-half, half + 1), scale=sd)
    # Normalise so the kernel sums to 1 (preserves overall signal level).
    return taps / np.sum(taps)
def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalised 1-D Gaussian kernel.

    NOTE(review): despite the name, this currently performs exactly the
    same computation as ``make_gaussian_kernel`` — no non-positive shift
    is applied.  Confirm the intended behaviour before relying on the name.
    """
    size = kernel_size + (0 if kernel_size % 2 else 1)  # force odd length
    half = (size - 1) // 2
    weights = norm.pdf(np.arange(-half, half + 1), scale=sd)
    return weights / np.sum(weights)
def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
    """Build a normalised inverted-parabola (quadratic) kernel.

    An even ``kernel_size`` is bumped to the next odd number.  The parabola
    is shifted upward by ``span / (1 - ratio)`` — larger ``ratio`` gives a
    flatter kernel relative to its edges — then normalised to sum to 1.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    xs = np.arange(-half, half + 1, dtype=float)
    kernel = -(xs**2)
    # Lift the curve so all taps are positive before normalising.
    span = kernel.max() - kernel.min()
    kernel = kernel + span / (1 - ratio)
    return kernel / np.sum(kernel)
def min_avg_for_interval(filtered: Array, interval: int) -> Tuple[float, int]:
    """Find the stride phase with the lowest mean over ``filtered``.

    For each start offset in ``[0, interval)``, averages the values of
    ``filtered`` sampled every ``interval`` elements.

    Args:
        filtered: 1-D signal to sample.
        interval: stride length (number of candidate phases).

    Returns:
        ``(best_average, best_start_offset)`` — the lowest strided mean and
        the first offset achieving it.
    """
    n = len(filtered)
    # One candidate mean per possible phase offset of the stride.
    avgs = [np.mean(filtered[range(start, n, interval)]) for start in range(interval)]
    best = min(avgs)
    # FIX: the annotation previously claimed ``-> float``, but a
    # (value, index) pair has always been returned.
    return best, avgs.index(best)
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Scan candidate stride lengths for the lowest strided mean.

    Tries every interval in ``[min_interval, max_interval]`` and returns
    the ``(interval, start_offset)`` pair whose strided samples of
    ``filtered`` have the lowest average (first minimum wins on ties).
    """
    best_interval = best_avg = best_start = None
    for interval in range(min_interval, max_interval + 1):
        avg, start = min_avg_for_interval(filtered, interval)
        # Strict '<' keeps the earliest minimum, matching min() semantics.
        if best_avg is None or avg < best_avg:
            best_interval, best_avg, best_start = interval, avg, start
    return best_interval, best_start
def filter_array(
    array: Array,
    sum_filter: Optional[Array] = None,
    padding: Optional[Array] = None,
    pad_value_function: Callable[[Array], float] = np.mean,
) -> Array:
    """Convolve ``array`` with ``sum_filter``, padding to keep the length.

    Args:
        array: 1-D signal to filter.
        sum_filter: odd-length convolution kernel, or ``None`` to skip
            filtering and return ``array`` unchanged.
        padding: explicit pad values prepended/appended before convolving;
            when ``None``, a pad of ``pad_value_function(array)`` values is
            generated so the output length equals the input length.
        pad_value_function: reducer used to compute the default pad value.

    Returns:
        The convolved signal ('valid' mode over the padded input).
    """
    # FIX: ``if not sum_filter`` raises "truth value of an array is
    # ambiguous" for any kernel longer than one element (every kernel in
    # FILTERS); test for None explicitly instead.
    if sum_filter is None:
        return array
    fsize = len(sum_filter)
    assert fsize % 2  # an even kernel cannot be centred on a sample
    if padding is None:  # ensures that output size matches the input size
        pad = (fsize - 1) // 2
        padding = np.full(pad, pad_value_function(array))
    return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid")
# Two-pass smoothing kernels for the row/column intensity profiles used by
# get_lines_either.  Pass 2 for columns is None, i.e. skipped.
FILTERS = {
    "row": {1: make_gaussian_kernel(30, 6), 2: make_gaussian_kernel(20, 4)},
    "col": {1: make_gaussian_kernel(70, 10), 2: None},
}
def get_lines_either(table_array: Array, horizontal=True) -> Array:
    """Infer candidate table-line positions along one axis of an image.

    Averages pixel intensities across rows (``horizontal=True``) or
    columns, smooths the resulting profile, and returns the indices of
    local maxima as line positions.

    Args:
        table_array: 2-D greyscale image array.  NOTE(review): the 0.3
            threshold below implies intensities in [0, 1] — confirm against
            the image-loading path.
        horizontal: True for row lines, False for column lines.

    Returns:
        1-D array of indices of local maxima in the smoothed profile.
    """
    key = "row" if horizontal else "col"
    THRESHOLD = 0.3
    sums = np.mean(table_array, axis=int(horizontal))
    # Where sums < THRESHOLD the boolean is 1, so sub-threshold entries are
    # replaced with 1 (clamped up).
    sums = np.maximum(sums, (sums < THRESHOLD))
    filtered_sums = filter_array(sums, FILTERS[key][1])
    # FIX: the second pass previously re-filtered the raw ``sums``,
    # silently discarding the first pass; chain the two passes instead.
    filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
    lines = argrelextrema(filtered_sums, np.greater)[0]
    return lines
def img_bytes_to_array(img_bytes: bytes) -> Array:
    """Decode encoded image bytes (e.g. PNG/JPEG) into a greyscale array."""
    buffer = np.frombuffer(img_bytes, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
    """Detect horizontal and vertical table lines in a greyscale image.

    Each detected row position becomes a full-width segment and each
    detected column position a full-height segment.

    Returns:
        {"tableLines": [...line segments...],
         "imageInfo": {"height": ..., "width": ...}}
    """
    height, width = img.shape
    horizontal = [
        {"x1": 0, "y1": y, "x2": width, "y2": y}
        for y in get_lines_either(img, horizontal=True)
    ]
    vertical = [
        {"x1": x, "y1": 0, "x2": x, "y2": height}
        for x in get_lines_either(img, horizontal=False)
    ]
    return {
        "tableLines": horizontal + vertical,
        "imageInfo": {"height": height, "width": width},
    }

View File

@ -0,0 +1,47 @@
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple
from typing import Tuple
import numpy as np
from numpy import ndarray as Array
from scipy.signal import argrelextrema
from scipy.stats import norm
import fitz
from pdf2img.conversion import convert_pages_to_images
def transform_image_coordinates_to_pdf_coordinates(
    bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: int = None
) -> Tuple:
    """Map an image-space bounding box into PDF coordinate space.

    When ``dpi`` is given, the bbox is first converted from pixels to PDF
    points (72 points per inch); the rotation matrix and then the
    transformation matrix are applied to the resulting rectangle.

    Returns:
        ``(x0, y0, x1, y1)`` of the transformed rectangle.
    """
    if dpi:
        # pixels -> points: divide by dpi to get inches, then scale by 72.
        x1, y1, x2, y2 = ((value / dpi) * 72 for value in bbox)
    else:
        x1, y1, x2, y2 = bbox
    rect = fitz.Rect(x1, y1, x2, y2)
    # Apply both matrices; the chained calls mirror the original order.
    rect = rect.transform(rotation_matrix).transform(transformation_matrix)
    return rect.x0, rect.y0, rect.x1, rect.y1
def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200) -> tuple[list[Array], dict]:
    """Render every table region reported by VLP as a greyscale image.

    Args:
        pdf_bytes: raw PDF file content.
        vlp_output: layout-analysis output; ``vlp_output["data"]`` is a list
            of per-page dicts with "page_idx" and "image_boxes" entries.
            Box coordinates are assumed normalised to [0, 1] (they are
            scaled by the page size below) — confirm against the producer.
        dpi: render resolution for the extracted regions.

    Returns:
        ``(images, info)``: greyscale 2-D arrays and a parallel list of
        ``{"pageNum", "bbox"}`` metadata dicts.
    """
    with fitz.open(stream=pdf_bytes) as fh:
        images = []
        info = []
        for page_dict in vlp_output["data"]:
            page_num = int(page_dict["page_idx"])
            boxes = page_dict["image_boxes"]
            # Only table regions are of interest here.
            boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
            page = fh[page_num]
            # FIX: fitz.Page has no ``shape`` attribute; take the page size
            # (in PDF points) from its rect instead.
            w, h = page.rect.width, page.rect.height
            for bbox in boxes:
                x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"]))
                y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"]))
                rect = fitz.Rect((x1, y1), (x2, y2))
                pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                    pixmap.h, pixmap.w, pixmap.n
                )
                # FIX: downstream line inference unpacks ``h, w = img.shape``,
                # so drop the trailing channel axis of the grey pixmap (n == 1).
                if pixmap.n == 1:
                    image = image[:, :, 0]
                images.append(image)
                info.append({"pageNum": page_num, "bbox": bbox})
        return images, info

View File

@ -1,12 +1,11 @@
from numpy import array, ndarray
import pdf2image
from numpy import array, ndarray
from PIL import Image
from cv_analysis.utils.preprocessing import preprocess_page_array
def open_pdf(pdf, first_page=0, last_page=None):
first_page += 1
last_page = None if last_page is None else last_page + 1
@ -14,11 +13,17 @@ def open_pdf(pdf, first_page=0, last_page=None):
if pdf.lower().endswith((".png", ".jpg", ".jpeg")):
pages = [Image.open(pdf)]
elif pdf.lower().endswith(".pdf"):
pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_path(
pdf, first_page=first_page, last_page=last_page
)
else:
raise IOError("Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf")
raise IOError(
"Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
)
elif type(pdf) == bytes:
pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_bytes(
pdf, first_page=first_page, last_page=last_page
)
elif type(pdf) in {list, ndarray}:
return pdf