feat: adapt pipeline for new table inference + pyinfra
This commit is contained in:
parent
ddd680bb4c
commit
e264c948cf
@ -5,3 +5,9 @@
|
||||
port = 22
|
||||
['remote "azure_remote"']
|
||||
url = azure://cv-sa-dvc/
|
||||
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=KOuTAQ6Mp00ePTT5ObYmgaHlxwS1qukY4QU4Kuk7gy/vldneA+ZiKjaOpEFtqKA6Mtym2gQz8THy+ASts/Y1Bw==;EndpointSuffix=core.windows.net"
|
||||
['remote "local"']
|
||||
url = ../dvc_local_remote
|
||||
|
||||
|
||||
|
||||
|
||||
61
flake.lock
generated
Normal file
61
flake.lock
generated
Normal file
@ -0,0 +1,61 @@
|
||||
{
|
||||
"nodes": {
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1710146030,
|
||||
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1711703276,
|
||||
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
"version": 7
|
||||
}
|
||||
33
flake.nix
Normal file
33
flake.nix
Normal file
@ -0,0 +1,33 @@
|
||||
{
  # Dev-shell flake: wraps a poetry-managed Python project in an FHS
  # environment so native wheels can find system libraries at the
  # conventional /lib paths (used when poetry2nix cannot build the project).
  description = "An flake to use a Python poetry project in an FHS environment when poetry2nix is uncooperative";
  inputs = {
    flake-utils.url = "github:numtide/flake-utils";
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
  };
  outputs = {
    self,
    nixpkgs,
    flake-utils,
  }:
    flake-utils.lib.eachDefaultSystem (system: let
      pkgs = nixpkgs.legacyPackages.${system};
      # FHS user environment exposing the system packages the Python
      # dependencies link against; `.env` extracts the enterable shell.
      fhsEnv =
        (pkgs.buildFHSUserEnv rec {
          name = "cv-analysis-service";
          targetPkgs = pkgs: (with pkgs; [
            poppler_utils
            zlib
            poetry
            libuuid
            # add the system package here that are needed for the Python package dependencies
            libz # needed for 'numpy'
          ]);
          # Runs on shell entry: put libuuid on the linker path, install the
          # project with poetry, then activate the resulting virtualenv.
          profile = ''
            export LD_LIBRARY_PATH="/lib:$LD_LIBRARY_PATH:${pkgs.lib.makeLibraryPath [pkgs.libuuid]}"
            poetry install # add --no-root here if this is just a metapackage
            source "$(poetry env info --path)"/bin/activate
          '';
        })
        .env;
    in {devShells.default = fhsEnv;});
}
|
||||
1574
poetry.lock
generated
1574
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -29,6 +29,7 @@ pyinfra = { version = "^2.1.0", source = "gitlab-research" }
|
||||
kn-utils = { version = "0.2.7", source = "gitlab-research" }
|
||||
pdf2img = { version = "0.7.0", source = "gitlab-red" }
|
||||
dvc-azure = "^2.21.2"
|
||||
pymupdf = "^1.24.1"
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
pytest = "^7.0.1"
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import sys
|
||||
from dataclasses import asdict
|
||||
from operator import truth
|
||||
from typing import Generator
|
||||
|
||||
from funcy import flatten, lmap
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
@ -8,6 +9,7 @@ from pdf2img.default_objects.image import ImageInfo, ImagePlus
|
||||
from pdf2img.default_objects.rectangle import RectanglePlus
|
||||
|
||||
from cv_analysis.figure_detection.figure_detection import detect_figures
|
||||
from cv_analysis.table_inference import extract_images_from_pdf, infer_lines
|
||||
from cv_analysis.table_parsing import parse_lines, parse_tables
|
||||
from cv_analysis.utils.structures import Rectangle
|
||||
|
||||
@ -31,8 +33,27 @@ def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=Tru
|
||||
return make_analysis_pipeline(
|
||||
detect_figures, figure_detection_formatter, dpi=200
|
||||
)
|
||||
else:
|
||||
raise
|
||||
if (
|
||||
operation == "table_image_inference"
|
||||
): # TODO: fix pyinfra input
|
||||
return make_image_analysis_pipeline(
|
||||
infer_lines,
|
||||
)
|
||||
# else:
|
||||
# raise
|
||||
|
||||
|
||||
def make_image_analysis_pipeline(
    analysis_fn,
):
    """Build a pipeline that runs `analysis_fn` over each table image in a PDF.

    The returned callable is a generator function taking the raw PDF bytes
    and the layout-analysis dict (`vlp_output`); it yields one dict per
    extracted table image, merging that image's metadata with the analysis
    result under the "tableLines" key.

    NOTE(review): the original annotated this factory itself as
    `-> Generator[dict, bytes, None]`, but it returns a generator
    *function*, so the misleading annotation was dropped.
    """

    def analyse_pipeline(pdf_bytes: bytes, vlp_output: dict):
        images, info = extract_images_from_pdf(pdf_bytes, vlp_output)
        # BUG FIX: `map(...)` returns a lazy iterator that does not support
        # indexing, so the original `img_results[i]` raised TypeError on the
        # first yielded item.  Pair each result with its metadata via zip
        # (info and images are index-aligned by extract_images_from_pdf).
        img_results = map(analysis_fn, images)
        for meta, result in zip(info, img_results):
            yield meta | {"tableLines": result}

    return analyse_pipeline
|
||||
|
||||
|
||||
def make_analysis_pipeline(
|
||||
@ -43,7 +64,7 @@ def make_analysis_pipeline(
|
||||
image = page.asarray()
|
||||
rects = analysis_fn(image)
|
||||
if not rects:
|
||||
return
|
||||
return None
|
||||
infos = formatter(rects, page, dpi)
|
||||
return infos
|
||||
|
||||
|
||||
161
src/cv_analysis/table_inference.py
Normal file
161
src/cv_analysis/table_inference.py
Normal file
@ -0,0 +1,161 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Optional, Tuple
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
import cv2
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from numpy import ndarray as Array
|
||||
from scipy.signal import argrelextrema
|
||||
from scipy.stats import norm
|
||||
import fitz
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
|
||||
|
||||
|
||||
|
||||
def show_multiple(arrs: Tuple[Array], title: str = ""):
    """Overlay every array in `arrs` on one fresh figure and display it."""
    # Start from a clean slate: clear the current figure and axes and close
    # the previous window so stale state never bleeds into this plot.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    for series in arrs:
        plt.plot(series)
    plt.title(title)
    plt.show()
|
||||
|
||||
|
||||
def show(arr: Array, title: str = ""):
    """Display a single array as a line plot on a fresh figure."""
    # Clear figure/axes and close the previous window before plotting.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    plt.show()
|
||||
|
||||
|
||||
def save_plot(arr: Array, name: str, title: str = "") -> None:
    """Render `arr` as a line plot and save it as '<name>.png'."""
    # Clear figure/axes and close the previous window before plotting.
    for reset in (plt.clf, plt.cla, plt.close):
        reset()
    plt.plot(arr)
    plt.title(title)
    plt.savefig(Path(str(name) + ".png"))
|
||||
|
||||
|
||||
def make_gaussian_kernel(kernel_size: int, sd: float) -> Array:
    """Build an odd-length, unit-sum Gaussian smoothing kernel.

    An even `kernel_size` is bumped up by one so the kernel has a single
    center sample.  Weights come from the normal pdf with standard
    deviation `sd`, normalized to sum to 1.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    support = np.arange(-half, half + 1)
    weights = norm.pdf(support, scale=sd)
    return weights / np.sum(weights)
|
||||
|
||||
|
||||
def make_gaussian_nonpositive_kernel(kernel_size: int, sd: float) -> Array:
    """Build an odd-length, unit-sum Gaussian kernel.

    NOTE(review): this is currently byte-for-byte identical to
    make_gaussian_kernel, and every output value is positive despite the
    "nonpositive" name — presumably a planned variant that was never
    finished.  Confirm intent before consolidating the two functions.
    """
    if not kernel_size % 2:
        kernel_size = kernel_size + 1
    wing = (kernel_size - 1) // 2
    xs = np.arange(-wing, wing + 1)
    values = norm.pdf(xs, scale=sd)
    return values / values.sum()
|
||||
|
||||
|
||||
def make_quadratic_kernel(kernel_size: int, ratio: float) -> Array:
    """Build an odd-length, unit-sum inverted-parabola kernel.

    Starts from -x**2 over the symmetric integer support, shifts the whole
    curve up by (max - min) / (1 - ratio) so every sample is non-negative
    for 0 <= ratio < 1, then normalizes to sum to 1.  A larger `ratio`
    means a larger uniform offset and therefore a flatter kernel.
    """
    if kernel_size % 2 == 0:
        kernel_size += 1
    half = (kernel_size - 1) // 2
    support = np.arange(-half, half + 1)
    curve = -(support.astype(float) ** 2)
    spread = curve.max() - curve.min()
    curve = curve + spread / (1 - ratio)
    return curve / curve.sum()
|
||||
|
||||
|
||||
def min_avg_for_interval(filtered: Array, interval: int) -> Tuple[float, int]:
    """Return (lowest mean, phase offset) over comb-samplings of `filtered`.

    For each start offset in [0, interval), average the values at indices
    start, start + interval, start + 2*interval, ... and return the
    smallest such average together with the offset that produced it.

    BUG FIX: the original annotated the return as `-> float`, but it has
    always returned a (mean, offset) pair; the annotation now matches.
    """
    n = len(filtered)
    avgs = [np.mean(filtered[range(start, n, interval)]) for start in range(interval)]
    best = min(avgs)
    return best, avgs.index(best)
|
||||
|
||||
|
||||
def search_intervals(filtered: Array, min_interval: int, max_interval: int):
    """Find the sampling interval whose best comb-average is lowest.

    Tries every interval in [min_interval, max_interval] and returns
    (best_interval, best_start_offset) for the interval whose minimal
    per-offset average (see min_avg_for_interval) is smallest.
    """
    scored = (
        (candidate, *min_avg_for_interval(filtered, candidate))
        for candidate in range(min_interval, max_interval + 1)
    )
    interval, _, offset = min(scored, key=lambda entry: entry[1])
    return interval, offset
|
||||
|
||||
|
||||
def filter_array(
    array: Array,
    sum_filter: Optional[Array],
    padding: Optional[Array] = None,
    pad_value_function: Callable[[Array], float] = np.mean,
) -> Array:
    """Convolve `array` with `sum_filter`, keeping the output length equal
    to the input length.

    Args:
        array: 1-D signal to smooth.
        sum_filter: odd-length convolution kernel, or None to skip
            filtering entirely (the input is returned unchanged).
        padding: explicit edge padding; when None, both ends are padded
            with `pad_value_function(array)` so "valid" convolution
            preserves the input length.
        pad_value_function: reduces `array` to the scalar pad value.

    Returns:
        The filtered signal (or `array` itself when no kernel is given).
    """
    # BUG FIX: the original `if not sum_filter:` raises "The truth value of
    # an array with more than one element is ambiguous" for every real
    # (multi-element) kernel.  Test for None / empty explicitly instead.
    if sum_filter is None or len(sum_filter) == 0:
        return array
    fsize = len(sum_filter)
    assert fsize % 2  # kernel must be odd-length so the padding is symmetric
    if padding is None:  # ensures that output size matches the input size
        pad = int((fsize - 1) / 2)
        padding = np.full(pad, pad_value_function(array))

    return np.convolve(np.concatenate((padding, array, padding)), sum_filter, "valid")
|
||||
|
||||
|
||||
# Smoothing kernels applied to the row/column intensity profiles before
# peak detection in get_lines_either.  Each axis maps pass number
# (1 = first smoothing pass, 2 = second) to a Gaussian kernel, or None
# when that pass is skipped (filter_array returns its input unchanged
# for a missing kernel).
FILTERS = {
    "row": {1: make_gaussian_kernel(30, 6), 2: make_gaussian_kernel(20, 4)},
    "col": {1: make_gaussian_kernel(70, 10), 2: None},
}
|
||||
|
||||
|
||||
def get_lines_either(table_array: Array, horizontal=True) -> Array:
    """Infer grid-line positions along one axis of a grayscale table image.

    Averages pixel intensity across rows (horizontal=True) or columns,
    smooths the resulting 1-D profile with the configured FILTERS kernels,
    and returns the indices of local maxima as detected line positions.
    """
    key = "row" if horizontal else "col"
    THRESHOLD = 0.3

    sums = np.mean(table_array, axis=int(horizontal))
    # Rows/columns whose mean falls below THRESHOLD (mostly dark —
    # presumably a ruling line) are raised to 1.0 (the boolean mask
    # promotes to 1) so they stand out as maxima.
    # NOTE(review): assumes the image is normalized to [0, 1] — confirm.
    sums = np.maximum(sums, (sums < THRESHOLD))
    # save_plot(rows, name=save_path / "rows", title="raw row averages")
    filtered_sums = filter_array(sums, FILTERS[key][1])
    # BUG FIX: the second pass previously re-filtered the raw `sums`,
    # silently discarding the first pass; chain the passes instead.
    # (Also removed the unused local `filters = FILTERS`.)
    filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
    lines = argrelextrema(filtered_sums, np.greater)[0]
    return lines
|
||||
|
||||
|
||||
def img_bytes_to_array(img_bytes: bytes) -> Array:
    """Decode encoded image bytes (e.g. PNG/JPEG) into a grayscale ndarray."""
    buffer = np.frombuffer(img_bytes, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_GRAYSCALE)
|
||||
|
||||
|
||||
def infer_lines(img: Array) -> dict[str, list[dict[str, int]]]:
    """Detect horizontal and vertical table lines in a 2-D grayscale image.

    Returns a dict with "tableLines" (full-width horizontal segments
    followed by full-height vertical segments, each as x1/y1/x2/y2) and
    "imageInfo" (the input's height and width).
    """
    height, width = img.shape

    horizontal = [
        {"x1": 0, "y1": y, "x2": width, "y2": y}
        for y in get_lines_either(img, horizontal=True)
    ]
    vertical = [
        {"x1": x, "y1": 0, "x2": x, "y2": height}
        for x in get_lines_either(img, horizontal=False)
    ]

    return {
        "tableLines": horizontal + vertical,
        "imageInfo": {"height": height, "width": width},
    }
|
||||
47
src/cv_analysis/utils/image_extraction.py
Normal file
47
src/cv_analysis/utils/image_extraction.py
Normal file
@ -0,0 +1,47 @@
|
||||
from pathlib import Path
|
||||
from typing import Callable, Iterable, Optional, Tuple
|
||||
from typing import Tuple
|
||||
|
||||
import numpy as np
|
||||
from numpy import ndarray as Array
|
||||
from scipy.signal import argrelextrema
|
||||
from scipy.stats import norm
|
||||
import fitz
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
|
||||
|
||||
def transform_image_coordinates_to_pdf_coordinates(
    bbox: Iterable[int | float], rotation_matrix: fitz.Matrix, transformation_matrix: fitz.Matrix, dpi: Optional[int] = None
) -> Tuple[float, float, float, float]:
    """Map a pixel-space bounding box into PDF (point) coordinate space.

    Args:
        bbox: (x1, y1, x2, y2) corners; interpreted as pixels when `dpi`
            is given, otherwise assumed to already be in points.
        rotation_matrix: page rotation matrix, applied first.
        transformation_matrix: second matrix applied to the rotated rect.
        dpi: render resolution of the pixel coordinates; when set, each
            coordinate is rescaled by 72 / dpi into points.

    Returns:
        The transformed (x0, y0, x1, y1) corner coordinates.
    """
    x1, y1, x2, y2 = map(lambda x: (x / dpi) * 72, bbox) if dpi else bbox  # Convert to points, can be done before
    rect = fitz.Rect(x1, y1, x2, y2)
    # NOTE(review): chaining assumes fitz.Rect.transform mutates in place
    # and returns the rect itself — confirm against the PyMuPDF version in use.
    rect = rect.transform(rotation_matrix).transform(transformation_matrix)

    return rect.x0, rect.y0, rect.x1, rect.y1
|
||||
|
||||
|
||||
def extract_images_from_pdf(
    pdf_bytes: bytes, vlp_output: dict, dpi: int = 200
) -> tuple[list[Array], list[dict]]:
    """Render every table region described in `vlp_output` to a grayscale array.

    Args:
        pdf_bytes: raw PDF document contents.
        vlp_output: layout-analysis result; `vlp_output["data"]` holds one
            dict per page with `page_idx` and `image_boxes`, each box having
            normalized (0-1) corners x1/y1/x2/y2 and a `label`.
        dpi: rendering resolution for the cropped regions.

    Returns:
        A pair (images, info): the rendered table crops and, index-aligned,
        metadata dicts with the source "pageNum" and "bbox".
        (Annotation fixed: the second element is a list of dicts, not a dict.)
    """
    with fitz.open(stream=pdf_bytes) as fh:
        images = []
        info = []

        for page_dict in vlp_output["data"]:
            page_num = int(page_dict["page_idx"])
            boxes = page_dict["image_boxes"]
            # Only layout boxes labelled as tables are of interest here.
            boxes = filter(lambda box_obj: box_obj["label"] == "table", boxes)

            page = fh[page_num]
            # BUG FIX: fitz.Page has no `.shape` attribute (the original
            # `h, w = page.shape` raised AttributeError).  Use the page
            # rectangle's dimensions in points, which is also the
            # coordinate space that the fitz.Rect clip expects.
            w, h = page.rect.width, page.rect.height

            for bbox in boxes:
                # Scale normalized box corners up to page coordinates.
                x1, x2 = map(lambda x: int(x * w), (bbox["x1"], bbox["x2"]))
                y1, y2 = map(lambda y: int(y * h), (bbox["y1"], bbox["y2"]))
                rect = fitz.Rect((x1, y1), (x2, y2))
                pixmap = page.get_pixmap(clip=rect, dpi=dpi, colorspace=fitz.csGRAY)
                image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                    pixmap.h, pixmap.w, pixmap.n
                )
                if pixmap.n == 1:
                    # Grayscale renders carry a trailing singleton channel;
                    # drop it so consumers can unpack `h, w = img.shape`
                    # (infer_lines expects a 2-D array).
                    image = image[:, :, 0]
                images.append(image)
                info.append({"pageNum": page_num, "bbox": bbox})

        return images, info
|
||||
@ -1,12 +1,11 @@
|
||||
from numpy import array, ndarray
|
||||
import pdf2image
|
||||
from numpy import array, ndarray
|
||||
from PIL import Image
|
||||
|
||||
from cv_analysis.utils.preprocessing import preprocess_page_array
|
||||
|
||||
|
||||
def open_pdf(pdf, first_page=0, last_page=None):
|
||||
|
||||
first_page += 1
|
||||
last_page = None if last_page is None else last_page + 1
|
||||
|
||||
@ -14,11 +13,17 @@ def open_pdf(pdf, first_page=0, last_page=None):
|
||||
if pdf.lower().endswith((".png", ".jpg", ".jpeg")):
|
||||
pages = [Image.open(pdf)]
|
||||
elif pdf.lower().endswith(".pdf"):
|
||||
pages = pdf2image.convert_from_path(pdf, first_page=first_page, last_page=last_page)
|
||||
pages = pdf2image.convert_from_path(
|
||||
pdf, first_page=first_page, last_page=last_page
|
||||
)
|
||||
else:
|
||||
raise IOError("Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf")
|
||||
raise IOError(
|
||||
"Invalid file extension. Accepted filetypes:\n\t.png\n\t.jpg\n\t.jpeg\n\t.pdf"
|
||||
)
|
||||
elif type(pdf) == bytes:
|
||||
pages = pdf2image.convert_from_bytes(pdf, first_page=first_page, last_page=last_page)
|
||||
pages = pdf2image.convert_from_bytes(
|
||||
pdf, first_page=first_page, last_page=last_page
|
||||
)
|
||||
elif type(pdf) in {list, ndarray}:
|
||||
return pdf
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user