feat: adapt pipeline for new table inference + pyinfra

This commit is contained in:
iriley 2024-04-22 10:08:24 +02:00
parent ddd680bb4c
commit e264c948cf
9 changed files with 1122 additions and 803 deletions

View File

@ -5,3 +5,9 @@
port = 22
['remote "azure_remote"']
url = azure://cv-sa-dvc/
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=KOuTAQ6Mp00ePTT5ObYmgaHlxwS1qukY4QU4Kuk7gy/vldneA+ZiKjaOpEFtqKA6Mtym2gQz8THy+ASts/Y1Bw==;EndpointSuffix=core.windows.net"
['remote "local"']
url = ../dvc_local_remote

61
flake.lock generated Normal file
View File

@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

33
flake.nix Normal file
View File

@ -0,0 +1,33 @@
{
  # Dev-shell flake: wraps the poetry project in an FHS user environment so
  # native wheels can find their system libraries when poetry2nix fails.
  description = "An flake to use a Python poetry project in an FHS environment when poetry2nix is uncooperative";
  inputs = {
    flake-utils.url = "github:numtide/flake-utils";
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
  };
  outputs = {
    self,
    nixpkgs,
    flake-utils,
  }:
    flake-utils.lib.eachDefaultSystem (system: let
      pkgs = nixpkgs.legacyPackages.${system};
      # FHS environment; entering it runs `poetry install` and activates the
      # resulting virtualenv (see `profile` below).
      fhsEnv =
        (pkgs.buildFHSUserEnv rec {
          name = "cv-analysis-service";
          targetPkgs = pkgs: (with pkgs; [
            poppler_utils
            zlib
            poetry
            libuuid
            # Add any system packages needed by the Python dependencies here.
            libz # needed for 'numpy'
          ]);
          profile = ''
            export LD_LIBRARY_PATH="/lib:$LD_LIBRARY_PATH:${pkgs.lib.makeLibraryPath [pkgs.libuuid]}"
            poetry install # add --no-root here if this is just a metapackage
            source "$(poetry env info --path)"/bin/activate
          '';
        })
        .env;
    in {devShells.default = fhsEnv;});
}

1574
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -29,6 +29,7 @@ pyinfra = { version = "^2.1.0", source = "gitlab-research" }
kn-utils = { version = "0.2.7", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2"
pymupdf = "^1.24.1"
[tool.poetry.group.test.dependencies]
pytest = "^7.0.1"

View File

@ -1,6 +1,7 @@
import sys
from dataclasses import asdict
from operator import truth
from typing import Generator
from funcy import flatten, lmap
from pdf2img.conversion import convert_pages_to_images
@ -8,6 +9,7 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
from pdf2img.default_objects.rectangle import RectanglePlus
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_inference import extract_images_from_pdf, infer_lines
from cv_analysis.table_parsing import parse_lines, parse_tables
from cv_analysis.utils.structures import Rectangle
@ -31,8 +33,27 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
return make_analysis_pipeline(
detect_figures, figure_detection_formatter, dpi=200
)
else:
raise
if (
operation == "table_image_inference"
): # TODO: fix pyinfra input
return make_image_analysis_pipeline(
infer_lines,
)
# else:
# raise
def make_image_analysis_pipeline(
    analysis_fn,
) -> Generator[dict, bytes, None]:
    """Build a pipeline applying ``analysis_fn`` to each extracted table image.

    Args:
        analysis_fn: callable taking a greyscale image array and returning
            the per-image analysis result (e.g. ``infer_lines``).

    Returns:
        A generator function ``analyse_pipeline(pdf_bytes, vlp_output)``
        that yields, for each table image, the image's metadata dict merged
        with a "tableLines" entry.  NOTE(review): the ``-> Generator``
        annotation describes the inner generator, not this factory —
        confirm and adjust the annotation with the callers.
    """

    def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict):
        images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
        # FIX: map() returns a lazy iterator which is not subscriptable —
        # the previous ``img_results[i]`` raised TypeError.  Materialize the
        # results and pair them with their metadata via zip instead.
        img_results = list(map(analysis_fn, images))
        for meta, result in zip(info, img_results):
            yield meta | {"tableLines": result}

    return analyse_pipeline
def make_analysis_pipeline(
@ -43,7 +64,7 @@ def make_analysis_pipeline(
image = page.asarray()
rects = analysis_fn(image)
if not rects:
return
return None
infos = formatter(rects, page, dpi)
return infos

View File

@ -0,0 +1,161 @@
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple
from typing import Tuple
import cv2
import matplotlib.pyplot as plt
import numpy as np
from numpy import ndarray as Array
from scipy.signal import argrelextrema
from scipy.stats import norm
import fitz
from pdf2img.conversion import convert_pages_to_images
def show_multiple(arrs: Tuple[Array], title: str = ""):
    """Display several 1-D arrays as overlaid line plots in one figure."""
    # Wipe matplotlib's implicit global figure/axes state before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    for series in arrs:
        plt.plot(series)
    plt.title(title)
    plt.show()
def show(arr: Array, title: str = ""):
    """Display a single 1-D array as a line plot."""
    # Wipe matplotlib's implicit global figure/axes state before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    plt.show()
def save_plot(arr: Array, name: str, title: str = "") -> None:
    """Plot *arr* and write the figure to ``<name>.png``."""
    # Wipe matplotlib's implicit global figure/axes state before drawing.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    plt.savefig(Path(f"{name}.png"))
def make_gaussian_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalised 1-D Gaussian smoothing kernel.

    An even ``kernel_size`` is bumped to the next odd number so the
    kernel always has a well-defined centre tap.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    taps = norm.pdf(np.arange(-half, half + 1), scale=sd)
    # Normalise so the kernel sums to 1 (preserves overall signal level).
    return taps / np.sum(taps)
def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
    """Build a normalised 1-D Gaussian kernel.

    NOTE(review): despite the name, this currently performs exactly the
    same computation as ``make_gaussian_kernel`` — no non-positive shift
    is applied.  Confirm the intended behaviour before relying on the name.
    """
    size = kernel_size + (0 if kernel_size % 2 else 1)  # force odd length
    half = (size - 1) // 2
    weights = norm.pdf(np.arange(-half, half + 1), scale=sd)
    return weights / np.sum(weights)
def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
    """Build a normalised inverted-parabola (quadratic) kernel.

    An even ``kernel_size`` is bumped to the next odd number.  The parabola
    is shifted upward by ``span / (1 - ratio)`` — larger ``ratio`` gives a
    flatter kernel relative to its edges — then normalised to sum to 1.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    xs = np.arange(-half, half + 1, dtype=float)
    kernel = -(xs**2)
    # Lift the curve so all taps are positive before normalising.
    span = kernel.max() - kernel.min()
    kernel = kernel + span / (1 - ratio)
    return kernel / np.sum(kernel)
def min_avg_for_interval(filtered: Array, interval: int) -> Tuple[float, int]:
    """Find the stride phase with the lowest mean over ``filtered``.

    For each start offset in ``[0, interval)``, averages the values of
    ``filtered`` sampled every ``interval`` elements.

    Args:
        filtered: 1-D signal to sample.
        interval: stride length (number of candidate phases).

    Returns:
        ``(best_average, best_start_offset)`` — the lowest strided mean and
        the first offset achieving it.
    """
    n = len(filtered)
    # One candidate mean per possible phase offset of the stride.
    avgs = [np.mean(filtered[range(start, n, interval)]) for start in range(interval)]
    best = min(avgs)
    # FIX: the annotation previously claimed ``-> float``, but a
    # (value, index) pair has always been returned.
    return best, avgs.index(best)
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Scan candidate stride lengths for the lowest strided mean.

    Tries every interval in ``[min_interval, max_interval]`` and returns
    the ``(interval, start_offset)`` pair whose strided samples of
    ``filtered`` have the lowest average (first minimum wins on ties).
    """
    best_interval = best_avg = best_start = None
    for interval in range(min_interval, max_interval + 1):
        avg, start = min_avg_for_interval(filtered, interval)
        # Strict '<' keeps the earliest minimum, matching min() semantics.
        if best_avg is None or avg < best_avg:
            best_interval, best_avg, best_start = interval, avg, start
    return best_interval, best_start
def filter_array(
    array: Array,
    sum_filter: Optional[Array] = None,
    padding: Optional[Array] = None,
    pad_value_function: Callable[[Array], float] = np.mean,
) -> Array:
    """Convolve ``array`` with ``sum_filter``, padding to keep the length.

    Args:
        array: 1-D signal to filter.
        sum_filter: odd-length convolution kernel, or ``None`` to skip
            filtering and return ``array`` unchanged.
        padding: explicit pad values prepended/appended before convolving;
            when ``None``, a pad of ``pad_value_function(array)`` values is
            generated so the output length equals the input length.
        pad_value_function: reducer used to compute the default pad value.

    Returns:
        The convolved signal ('valid' mode over the padded input).
    """
    # FIX: ``if not sum_filter`` raises "truth value of an array is
    # ambiguous" for any kernel longer than one element (every kernel in
    # FILTERS); test for None explicitly instead.
    if sum_filter is None:
        return array
    fsize = len(sum_filter)
    assert fsize % 2  # an even kernel cannot be centred on a sample
    if padding is None:  # ensures that output size matches the input size
        pad = (fsize - 1) // 2
        padding = np.full(pad, pad_value_function(array))
    return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid")
# Two-pass smoothing kernels for the row/column intensity profiles used by
# get_lines_either.  Pass 2 for columns is None, i.e. skipped.
FILTERS = {
    "row": {1: make_gaussian_kernel(30, 6), 2: make_gaussian_kernel(20, 4)},
    "col": {1: make_gaussian_kernel(70, 10), 2: None},
}
def get_lines_either(table_array: Array, horizontal=True) -> Array:
    """Infer candidate table-line positions along one axis of an image.

    Averages pixel intensities across rows (``horizontal=True``) or
    columns, smooths the resulting profile, and returns the indices of
    local maxima as line positions.

    Args:
        table_array: 2-D greyscale image array.  NOTE(review): the 0.3
            threshold below implies intensities in [0, 1] — confirm against
            the image-loading path.
        horizontal: True for row lines, False for column lines.

    Returns:
        1-D array of indices of local maxima in the smoothed profile.
    """
    key = "row" if horizontal else "col"
    THRESHOLD = 0.3
    sums = np.mean(table_array, axis=int(horizontal))
    # Where sums < THRESHOLD the boolean is 1, so sub-threshold entries are
    # replaced with 1 (clamped up).
    sums = np.maximum(sums, (sums < THRESHOLD))
    filtered_sums = filter_array(sums, FILTERS[key][1])
    # FIX: the second pass previously re-filtered the raw ``sums``,
    # silently discarding the first pass; chain the two passes instead.
    filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
    lines = argrelextrema(filtered_sums, np.greater)[0]
    return lines
def img_bytes_to_array(img_bytes: bytes) -> Array:
    """Decode encoded image bytes (e.g. PNG/JPEG) into a greyscale array."""
    buffer = np.frombuffer(img_bytes, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
    """Detect horizontal and vertical table lines in a greyscale image.

    Each detected row position becomes a full-width segment and each
    detected column position a full-height segment.

    Returns:
        {"tableLines": [...line segments...],
         "imageInfo": {"height": ..., "width": ...}}
    """
    height, width = img.shape
    horizontal = [
        {"x1": 0, "y1": y, "x2": width, "y2": y}
        for y in get_lines_either(img, horizontal=True)
    ]
    vertical = [
        {"x1": x, "y1": 0, "x2": x, "y2": height}
        for x in get_lines_either(img, horizontal=False)
    ]
    return {
        "tableLines": horizontal + vertical,
        "imageInfo": {"height": height, "width": width},
    }

View File

@ -0,0 +1,47 @@
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple
from typing import Tuple
import numpy as np
from numpy import ndarray as Array
from scipy.signal import argrelextrema
from scipy.stats import norm
import fitz
from pdf2img.conversion import convert_pages_to_images
def transform_image_coordinates_to_pdf_coordinates(
    bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: int = None
) -> Tuple:
    """Map an image-space bounding box into PDF coordinate space.

    When ``dpi`` is given, the bbox is first converted from pixels to PDF
    points (72 points per inch); the rotation matrix and then the
    transformation matrix are applied to the resulting rectangle.

    Returns:
        ``(x0, y0, x1, y1)`` of the transformed rectangle.
    """
    if dpi:
        # pixels -> points: divide by dpi to get inches, then scale by 72.
        x1, y1, x2, y2 = ((value / dpi) * 72 for value in bbox)
    else:
        x1, y1, x2, y2 = bbox
    rect = fitz.Rect(x1, y1, x2, y2)
    # Apply both matrices; the chained calls mirror the original order.
    rect = rect.transform(rotation_matrix).transform(transformation_matrix)
    return rect.x0, rect.y0, rect.x1, rect.y1
def extract_images_from_pdf(pdf_bytes: bytes, vlp_output: dict, dpi: int = 200) -> tuple[list[Array], dict]:
    """Render every table region reported by VLP as a greyscale image.

    Args:
        pdf_bytes: raw PDF file content.
        vlp_output: layout-analysis output; ``vlp_output["data"]`` is a list
            of per-page dicts with "page_idx" and "image_boxes" entries.
            Box coordinates are assumed normalised to [0, 1] (they are
            scaled by the page size below) — confirm against the producer.
        dpi: render resolution for the extracted regions.

    Returns:
        ``(images, info)``: greyscale 2-D arrays and a parallel list of
        ``{"pageNum", "bbox"}`` metadata dicts.
    """
    with fitz.open(stream=pdf_bytes) as fh:
        images = []
        info = []
        for page_dict in vlp_output["data"]:
            page_num = int(page_dict["page_idx"])
            boxes = page_dict["image_boxes"]
            # Only table regions are of interest here.
            boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)
            page = fh[page_num]
            # FIX: fitz.Page has no ``shape`` attribute; take the page size
            # (in PDF points) from its rect instead.
            w, h = page.rect.width, page.rect.height
            for bbox in boxes:
                x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"]))
                y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"]))
                rect = fitz.Rect((x1, y1), (x2, y2))
                pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                    pixmap.h, pixmap.w, pixmap.n
                )
                # FIX: downstream line inference unpacks ``h, w = img.shape``,
                # so drop the trailing channel axis of the grey pixmap (n == 1).
                if pixmap.n == 1:
                    image = image[:, :, 0]
                images.append(image)
                info.append({"pageNum": page_num, "bbox": bbox})
        return images, info

View File

@ -1,12 +1,11 @@
from numpy import array, ndarray
import pdf2image
from numpy import array, ndarray
from PIL import Image
from cv_analysis.utils.preprocessing import preprocess_page_array
def open_pdf(pdf, first_page=0, last_page=None):
first_page += 1
last_page = None if last_page is None else last_page + 1
@ -14,11 +13,17 @@ def open_pdf(pdf, first_page=0, last_page=None):
if pdf.lower().endswith((".png", ".jpg", ".jpeg")):
pages = [Image.open(pdf)]
elif pdf.lower().endswith(".pdf"):
pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_path(
pdf, first_page=first_page, last_page=last_page
)
else:
raise IOError("Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf")
raise IOError(
"Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
)
elif type(pdf) == bytes:
pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
pages = pdf2image.convert_from_bytes(
pdf, first_page=first_page, last_page=last_page
)
elif type(pdf) in {list, ndarray}:
return pdf