Compare commits

..

2 Commits

Author SHA1 Message Date
iriley
99359596da chore: bump package versions 2024-07-19 22:47:05 +02:00
iriley
ef02253ad7 chore: bump package versions 2024-07-19 22:43:54 +02:00
12 changed files with 2297 additions and 2877 deletions

2
.gitignore vendored
View File

@ -50,3 +50,5 @@ __pycache__/
# unignore files # unignore files
!bom.* !bom.*
dotted/

View File

@ -12,6 +12,15 @@ variables:
NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1 IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
stages:
- data
- setup
- unit-tests
- versioning
- build
- integration-tests
- release
pages: pages:
only: only:
- master # KEEP THIS, necessary because `master` branch and not `main` branch - master # KEEP THIS, necessary because `master` branch and not `main` branch

View File

@ -5,10 +5,10 @@ default_language_version:
python: python3.10 python: python3.10
repos: repos:
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0 rev: v4.5.0
hooks: hooks:
- id: trailing-whitespace - id: trailing-whitespace
- id: end-of-file-fixer # - id: end-of-file-fixer
- id: check-yaml - id: check-yaml
args: [--unsafe] # needed for .gitlab-ci.yml args: [--unsafe] # needed for .gitlab-ci.yml
- id: check-toml - id: check-toml
@ -34,7 +34,7 @@ repos:
- --profile black - --profile black
- repo: https://github.com/psf/black - repo: https://github.com/psf/black
rev: 24.10.0 rev: 24.3.0
hooks: hooks:
- id: black - id: black
# exclude: ^(docs/|notebooks/|data/|src/secrets/) # exclude: ^(docs/|notebooks/|data/|src/secrets/)
@ -42,7 +42,7 @@ repos:
- --line-length=120 - --line-length=120
- repo: https://github.com/compilerla/conventional-pre-commit - repo: https://github.com/compilerla/conventional-pre-commit
rev: v4.0.0 rev: v3.2.0
hooks: hooks:
- id: conventional-pre-commit - id: conventional-pre-commit
pass_filenames: false pass_filenames: false

View File

@ -1,22 +1,11 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true
[metrics.prometheus] [metrics.prometheus]
enabled = true enabled = true
prefix = "redactmanager_cv_analysis_service" prefix = "redactmanager_cv_analysis_service"
[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"
[tracing.opentelemetry] [tracing.opentelemetry]
enabled = true
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces" endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_cv_analysis_service" service_name = "redactmanager_cv_analyisis_service"
exporter = "otlp" exporter = "otlp"
[webserver] [webserver]
@ -36,15 +25,6 @@ input_queue = "request_queue"
output_queue = "response_queue" output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue" dead_letter_queue = "dead_letter_queue"
tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "cv_analysis_request_queue"
service_request_exchange_name = "cv_analysis_request_exchange"
service_response_exchange_name = "cv_analysis_response_exchange"
service_dlq_name = "cv_analysis_dlq"
[storage] [storage]
backend = "s3" backend = "s3"
@ -61,7 +41,4 @@ connection_string = ""
[storage.tenant_server] [storage.tenant_server]
public_key = "" public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants" endpoint = "http://tenant-user-management:8081/internal-api/tenants"
[kubernetes]
pod_name = "test_pod"

View File

@ -15,6 +15,7 @@
(pkgs.buildFHSUserEnv rec { (pkgs.buildFHSUserEnv rec {
name = "cv-analysis-service"; name = "cv-analysis-service";
targetPkgs = pkgs: (with pkgs; [ targetPkgs = pkgs: (with pkgs; [
python310
poppler_utils poppler_utils
zlib zlib
poetry poetry

4986
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "cv-analysis-service" name = "cv-analysis-service"
version = "2.30.0" version = "2.19.0"
description = "" description = ""
authors = ["Isaac Riley <isaac.riley@knecon.com>"] authors = ["Isaac Riley <isaac.riley@knecon.com>"]
readme = "README.md" readme = "README.md"
@ -25,12 +25,13 @@ coverage = "^5.5"
dependency-check = "^0.6.0" dependency-check = "^0.6.0"
lorem-text = "^2.1" lorem-text = "^2.1"
PyMuPDF = "^1.19.6" PyMuPDF = "^1.19.6"
pyinfra = { version = "3.4.2", source = "gitlab-research" } pyinfra = { version = "^2.2.0", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" } kn-utils = { version = "0.2.7", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" } pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2" dvc-azure = "^2.21.2"
pymupdf = "^1.24.1" pymupdf = "^1.24.1"
types-pillow = "^10.2.0.20240423" types-pillow = "^10.2.0.20240423"
#matplotlib-backend-wezterm = "^2.1.2"
[tool.poetry.group.test.dependencies] [tool.poetry.group.test.dependencies]
pytest = "^7.0.1" pytest = "^7.0.1"
@ -76,7 +77,7 @@ priority = "explicit"
[tool.pylint] [tool.pylint]
max-line-length = 120 max-line-length = 120
docstring-min-length = 4 docstring-min-length=4
extension-pkg-whitelist = ["cv2"] extension-pkg-whitelist = ["cv2"]
extension-pkg-allow-list = ["cv2"] extension-pkg-allow-list = ["cv2"]

6
renovate.json Normal file
View File

@ -0,0 +1,6 @@
{
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [
"config:base"
]
}

View File

@ -1,41 +0,0 @@
#!/bin/bash
# Bootstrap a local development environment for this project.
#
# Usage: <script> <python_version> <gitlab_user> <gitlab_personal_access_token>
#
# Installs and selects the requested Python version through pyenv, configures
# poetry (in-project virtualenv, private GitLab package repositories and their
# credentials), installs the dependencies and sets up the pre-commit hooks.
#
# NOTE: the access token is passed on the command line and is therefore visible
# in the shell history and process list.

if [ "$#" -lt 3 ]; then
    echo "usage: $0 <python_version> <gitlab_user> <gitlab_personal_access_token>" >&2
    # `return` succeeds when the script is sourced; fall back to `exit` when it is executed.
    return 1 2>/dev/null || exit 1
fi

python_version="$1"
gitlab_user="$2"
gitlab_personal_access_token="$3"

# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
# cd $latest_dir

# All expansions are quoted so that values containing spaces or glob characters
# are passed through as a single, unmodified argument.
pyenv install "$python_version"
pyenv local "$python_version"
pyenv shell "$python_version"

# install poetry globally (PREFERRED), only need to install it once
# curl -sSL https://install.python-poetry.org | python3 -
# remember to update poetry once in a while
poetry self update

# install poetry in current python environment, can lead to multiple instances of poetry being installed on one system (DISPREFERRED)
# pip install --upgrade pip
# pip install poetry

poetry config virtualenvs.in-project true
poetry config installer.max-workers 10
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research "${gitlab_user}" "${gitlab_personal_access_token}"
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red "${gitlab_user}" "${gitlab_personal_access_token}"
poetry config repositories.gitlab-fforesight https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
poetry config http-basic.gitlab-fforesight "${gitlab_user}" "${gitlab_personal_access_token}"

poetry env use "$(pyenv which python)"
poetry install --with=dev
poetry update

source .venv/bin/activate
pre-commit install
pre-commit autoupdate

8
scripts/grid_search.py Normal file
View File

@ -0,0 +1,8 @@
from cv_analysis.table_inference import infer_lines
def grid_search() -> None:
    """Placeholder entry point of this script; the body is not implemented yet.

    NOTE(review): presumably meant to run a parameter grid search over the
    table line inference (the script imports ``infer_lines``) -- confirm
    before relying on this description.
    """
    ...
if __name__ == "__main__":
grid_search()

View File

@ -1,6 +1,6 @@
from operator import itemgetter from operator import itemgetter
from pathlib import Path from pathlib import Path
from typing import Callable, Optional, Tuple from typing import Callable, Iterable, Optional, Tuple
import cv2 import cv2
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
@ -9,6 +9,8 @@ from kn_utils.logging import logger # type: ignore
from numpy import ndarray as Array from numpy import ndarray as Array
from scipy.stats import norm # type: ignore from scipy.stats import norm # type: ignore
from .utils.dotted_lines import detect_dotted_from_extrema
def show_multiple(arrs: Tuple[Array], title: str = ""): def show_multiple(arrs: Tuple[Array], title: str = ""):
plt.clf() plt.clf()
@ -150,16 +152,65 @@ def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
return line_list return line_list
def sharpen_sums(sums: Array, shift: int = 3) -> Array:
    """Return, per interior position, the summed absolute difference to both ``shift``-distant neighbours.

    For every index ``i`` with ``shift <= i < len(sums) - shift`` the result holds
    ``|sums[i] - sums[i + shift]| + |sums[i] - sums[i - shift]|``.

    Args:
        sums: One-dimensional array of values.
        shift: Distance to the neighbours that each value is compared with. Must be at least 1.

    Returns:
        An ``int64`` array with ``max(len(sums) - 2 * shift, 0)`` entries.

    Raises:
        ValueError: If ``shift`` is smaller than 1.
    """
    if shift < 1:
        raise ValueError(f"shift must be a positive integer, got {shift}")
    # Widen to int64 before subtracting so narrow or unsigned inputs cannot wrap around.
    sums = sums.astype("int64")
    centre = sums[shift:-shift]
    return np.abs(centre - sums[2 * shift :]) + np.abs(centre - sums[: -2 * shift])
def detect_dotted_lines(
    image: Array,
    sums: Array,
    horizontal: bool = True,
    threshold: float = 1.0,
    min_distance: int = 2,
    max_distance: int = 20,
) -> Array:
    """Mark the rows (or columns) of ``image`` whose dark spots are evenly spaced.

    Candidate lines are the local minima of the filtered ``sums`` whose raw value is
    below 250. For each candidate a three-pixel band around it is averaged across the
    band's thickness, the local minima of that filtered profile are located, and the
    distances between consecutive minima are measured. A candidate is flagged when the
    mean distance lies within ``[min_distance, max_distance]`` and the standard
    deviation of the distances is below ``threshold``.

    Args:
        image: Two-dimensional image array.
        sums: Per-row (``horizontal=True``) or per-column (``horizontal=False``) values
            of ``image``; must support slicing and element-wise comparison.
        horizontal: Look for horizontal lines if true, for vertical lines otherwise.
        threshold: Exclusive upper bound for the standard deviation of the distances.
        min_distance: Smallest accepted mean distance between minima.
        max_distance: Largest accepted mean distance between minima.

    Returns:
        Boolean array with one entry per row (horizontal) or per column (vertical) of
        ``image``; ``True`` marks a detected dotted line. A boolean dtype is used so the
        result can be combined with other masks via ``|`` and ``&``.
    """
    key = "row" if horizontal else "col"
    naive = filter_array(sums, FILTERS[key][1])
    is_local_min = (naive[1:-1] < naive[:-2]) & (naive[1:-1] < naive[2:])
    # +1 maps indices of the trimmed comparison back to indices of ``sums``.
    naive_lines = np.where(is_local_min & (sums[1:-1] < 250))[0] + 1

    dotted = np.zeros(image.shape[1 - int(horizontal)], dtype=bool)
    for idx in naive_lines:
        # Same three-pixel band (idx - 1 .. idx + 1) for both orientations.
        band = image[idx - 1 : idx + 2, :] if horizontal else image[:, idx - 1 : idx + 2]
        band_sums = filter_array(np.mean(band, axis=1 - int(horizontal)), FILTERS[key][1])
        band_minima = (band_sums[1:-1] < band_sums[:-2]) & (band_sums[1:-1] < band_sums[2:])
        extrema = np.where(band_minima)[0] + 1
        distances = extrema[1:] - extrema[:-1]
        if distances.size == 0:
            # Fewer than two minima: there is no spacing to measure, so this is not a dotted line.
            continue
        mean = float(np.mean(distances))
        std = float(np.std(distances))
        ratio = mean / (std + 0.01)
        logger.debug(f"dotted line candidate idx={idx} mean={mean:.2f} std={std:.2f} ratio={ratio:.2f}")
        score = std  # maybe make more advanced score function later
        if min_distance <= mean <= max_distance and score < threshold:
            dotted[idx] = True
    return dotted
def get_lines_either(table_array: Array, horizontal=True) -> list[int]: def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
key = "row" if horizontal else "col" key = "row" if horizontal else "col"
h, w = map(int, table_array.shape)
table_array = (
table_array[:, int(0.1 * w) : int(0.9 * w)] if horizontal else table_array[int(0.1 * h) : int(0.9 * h)]
)
sums = np.mean(table_array, axis=int(horizontal)) sums = np.mean(table_array, axis=int(horizontal))
dotted = detect_dotted_lines(table_array, sums, horizontal=horizontal)
threshold = 0.3 * 255 # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums) threshold = 0.3 * 255 # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
predicate = 1000.0 * (sums < threshold) predicate = 1000.0 * ((sums < threshold) | dotted)
sums = np.maximum( sums = np.maximum(
np.maximum(sums[1:-1], predicate[1:-1]), np.maximum(sums[1:-1], predicate[1:-1]),
np.maximum(predicate[:-2], predicate[2:]), np.maximum(predicate[:-2], predicate[2:]),
) )
filtered_sums = filter_array(sums, FILTERS[key][1]) filtered_sums = filter_array(sums, FILTERS[key][1])
filtered_sums = filter_array(filtered_sums, FILTERS[key][2]) filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
filtered_sums = filter_array(filtered_sums, FILTERS[key][3]) filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
@ -179,9 +230,7 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]: def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]:
cv2.imwrite("/tmp/table.png", img)
_, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY) _, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
cv2.imwrite("/tmp/table_bin.png", img)
h, w = map(int, img.shape) h, w = map(int, img.shape)
row_vals = map(int, get_lines_either(img, horizontal=True)) row_vals = map(int, get_lines_either(img, horizontal=True))
col_vals = map(int, get_lines_either(img, horizontal=False)) col_vals = map(int, get_lines_either(img, horizontal=False))

View File

@ -0,0 +1,18 @@
"""
General approach:
Get the extrema of the horizontal and vertical pixel sums. Then take a band of k pixels around each minimum (the minima correspond to the darkest rows/columns), e.g. k=3.
Recalculate minima for each band.
Compute a list of distances between minima.
Compute the mean and standard deviation of those distances.
If rho := std / (eta * mean) < phi for some threshold phi (eta being a scaling factor), the band contains a dotted line. Rationale: normalising by the mean allows a larger std for a larger mean, i.e. for more widely spaced dotted lines.
Pros:
Intuitive and efficient.
Cons:
May not work for irregular/mixed dotted lines, such as (possibly) --*--*--*--*--*--*--*--*--*--*--
"""
from typing import Iterable
import numpy as np