Compare commits

..

32 Commits

Author SHA1 Message Date
Jonathan Kössler
799fe331c3 Merge branch 'bugfix/RED-10722' into 'master'
RED-10722: fix dead letter queue

Closes RED-10722

See merge request redactmanager/cv-analysis-service!32
2025-01-16 09:28:41 +01:00
Jonathan Kössler
dfbfc50556 chore: update version 2025-01-15 13:40:01 +01:00
Jonathan Kössler
63fbd387a3 chore: update pyinfra to v3.4.2 2025-01-15 13:32:57 +01:00
Jonathan Kössler
41dbfc69d9 chore: update pyinfra to v3.4.2 2025-01-14 16:52:13 +01:00
Jonathan Kössler
b73e9b2ed9 Merge branch 'feature/RED-10441' into 'master'
RED-10441: fix abandoned queues

Closes RED-10441

See merge request redactmanager/cv-analysis-service!31
2024-11-13 17:27:22 +01:00
Jonathan Kössler
92692281ce chore: update pyinfra to v3.3.5 2024-11-13 17:22:24 +01:00
Jonathan Kössler
cb0c58d699 chore: update pyinfra to v3.3.4 2024-11-13 16:41:04 +01:00
Jonathan Kössler
eb96403fe2 chore: update pyinfra to v3.3.3 2024-11-13 14:53:11 +01:00
Jonathan Kössler
c8daf888c6 chore: update pyinfra to v3.3.2 2024-11-13 09:45:43 +01:00
Jonathan Kössler
eb921c365d Merge branch 'chore/update_pyinfra' into 'master'
RES-858: fix graceful shutdown

See merge request redactmanager/cv-analysis-service!30
2024-09-30 11:01:07 +02:00
Jonathan Kössler
7762f81a4a chore: update pyinfra to v3.2.11 2024-09-30 10:07:29 +02:00
Jonathan Kössler
e991cfe1bf Merge branch 'chore/update_pyinfra' into 'master'
RES-844 && RES-856: fix tracing & proto format

See merge request redactmanager/cv-analysis-service!29
2024-09-27 08:21:46 +02:00
Jonathan Kössler
35c5ee5831 fix: opentelemetry service name 2024-09-26 13:46:05 +02:00
Jonathan Kössler
e97f34391a chore: update pyinfra to v3.2.10 2024-09-26 13:44:48 +02:00
Francisco Schulz
1fa10721aa Merge branch 'RED-10017-investigate-crashing-py-services-when-upload-large-number-of-files' into 'master'
RED-10017 "Investigate crashing py services when upload large number of files"

See merge request redactmanager/cv-analysis-service!28
2024-09-23 18:55:08 +02:00
Francisco Schulz
7f0d0a48db RED-10017 "Investigate crashing py services when upload large number of files" 2024-09-23 18:55:08 +02:00
Francisco Schulz
333cd498b9 Merge branch 'RES-842-pyinfra-fix-rabbit-mq-handler-shuts-down-when-queues-not-available-yet' into 'master'
chore: update pyinfra version, increase pkg version

Closes RES-842

See merge request redactmanager/cv-analysis-service!27
2024-08-30 14:57:14 +02:00
francisco.schulz
9df8c8f936 chore: update service version 2024-08-30 08:25:00 -04:00
francisco.schulz
60adf0c381 chore: update pyinfra version 2024-08-30 08:15:34 -04:00
francisco.schulz
537f605a85 chore: remove renovate bot config 2024-08-29 11:48:15 -04:00
francisco.schulz
66987ab8e9 chore: update pyinfra version, increase pkg version 2024-08-29 11:29:07 -04:00
Jonathan Kössler
43570142c3 Merge branch 'feature/RES-840-add-client-connector-error' into 'master'
fix: add exception handling for ClientConnectorError

Closes RES-840

See merge request redactmanager/cv-analysis-service!26
2024-08-28 15:47:01 +02:00
Jonathan Kössler
d457f49001 chore: update pyinfra version 2024-08-28 14:47:29 +02:00
Jonathan Kössler
536928c032 Merge branch 'feature/RES-826-pyinfra-update' into 'master'
chore: bump pyinfra version

Closes RES-826

See merge request redactmanager/cv-analysis-service!25
2024-08-26 16:15:17 +02:00
Jonathan Kössler
dc6183490f chore: bump pyinfra version 2024-08-26 15:13:59 +02:00
Jonathan Kössler
bbc2d0c8bf chore: bump pyinfra version 2024-08-22 09:33:26 +02:00
Jonathan Kössler
3462faf8c7 Merge branch 'feature/RES-731-add-queues-per-tenant' into 'master'
RES-731: add queues per tenant

Closes RES-731

See merge request redactmanager/cv-analysis-service!24
2024-08-19 15:03:38 +02:00
Jonathan Kössler
b136cc9ff3 RES-731: add queues per tenant 2024-08-19 15:03:37 +02:00
Julius Unverfehrt
cf431df1cb Merge branch 'table_lines' into 'master'
Table lines

See merge request redactmanager/cv-analysis-service!23
2024-05-15 16:53:27 +02:00
Julius Unverfehrt
e86214f6b7 Merge branch 'table_lines' into 'master'
fix: mapping of image coordinates to pdf coordinates (table inference)

See merge request redactmanager/cv-analysis-service!22
2024-05-15 13:02:24 +02:00
Isaac Riley
3c9ddfcf0f Merge branch 'table_lines' into 'master'
fix: check nonzero list length in filter_fp_col_lines

See merge request redactmanager/cv-analysis-service!21
2024-05-13 09:39:20 +02:00
Isaac Riley
0f45a25bc8 Merge branch 'table_lines' into 'master'
fix: make envvar conditional unfailable

See merge request redactmanager/cv-analysis-service!20
2024-05-08 15:33:45 +02:00
12 changed files with 2879 additions and 2299 deletions

2
.gitignore vendored
View File

@ -50,5 +50,3 @@ __pycache__/
# unignore files
!bom.*
dotted/

View File

@ -12,15 +12,6 @@ variables:
NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
stages:
- data
- setup
- unit-tests
- versioning
- build
- integration-tests
- release
pages:
only:
- master # KEEP THIS, necessary because `master` branch and not `main` branch

View File

@ -5,10 +5,10 @@ default_language_version:
python: python3.10
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
# - id: end-of-file-fixer
- id: end-of-file-fixer
- id: check-yaml
args: [--unsafe] # needed for .gitlab-ci.yml
- id: check-toml
@ -34,7 +34,7 @@ repos:
- --profile black
- repo: https://github.com/psf/black
rev: 24.3.0
rev: 24.10.0
hooks:
- id: black
# exclude: ^(docs/|notebooks/|data/|src/secrets/)
@ -42,7 +42,7 @@ repos:
- --line-length=120
- repo: https://github.com/compilerla/conventional-pre-commit
rev: v3.2.0
rev: v4.0.0
hooks:
- id: conventional-pre-commit
pass_filenames: false

View File

@ -1,11 +1,22 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true
[metrics.prometheus]
enabled = true
prefix = "redactmanager_cv_analysis_service"
[tracing.opentelemetry]
[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"
[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_cv_analyisis_service"
service_name = "redactmanager_cv_analysis_service"
exporter = "otlp"
[webserver]
@ -25,6 +36,15 @@ input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "cv_analysis_request_queue"
service_request_exchange_name = "cv_analysis_request_exchange"
service_response_exchange_name = "cv_analysis_response_exchange"
service_dlq_name = "cv_analysis_dlq"
[storage]
backend = "s3"
@ -41,4 +61,7 @@ connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
[kubernetes]
pod_name = "test_pod"

View File

@ -15,7 +15,6 @@
(pkgs.buildFHSUserEnv rec {
name = "cv-analysis-service";
targetPkgs = pkgs: (with pkgs; [
python310
poppler_utils
zlib
poetry

4990
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "cv-analysis-service"
version = "2.19.0"
version = "2.30.0"
description = ""
authors = ["Isaac Riley <isaac.riley@knecon.com>"]
readme = "README.md"
@ -25,13 +25,12 @@ coverage = "^5.5"
dependency-check = "^0.6.0"
lorem-text = "^2.1"
PyMuPDF = "^1.19.6"
pyinfra = { version = "^2.2.0", source = "gitlab-research" }
kn-utils = { version = "0.2.7", source = "gitlab-research" }
pyinfra = { version = "3.4.2", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2"
pymupdf = "^1.24.1"
types-pillow = "^10.2.0.20240423"
#matplotlib-backend-wezterm = "^2.1.2"
[tool.poetry.group.test.dependencies]
pytest = "^7.0.1"
@ -77,7 +76,7 @@ priority = "explicit"
[tool.pylint]
max-line-length = 120
docstring-min-length=4
docstring-min-length = 4
extension-pkg-whitelist = ["cv2"]
extension-pkg-allow-list = ["cv2"]

View File

@ -1,6 +0,0 @@
{
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [
"config:base"
]
}

41
scripts/devenvsetup.sh Normal file
View File

@ -0,0 +1,41 @@
#!/bin/bash
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3
# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
# cd $latest_dir
pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version
# install poetry globally (PREFERRED), only need to install it once
# curl -sSL https://install.python-poetry.org | python3 -
# remember to update poetry once in a while
poetry self update
# install poetry in current python environment, can lead to multiple instances of poetry being installed on one system (DISPREFERRED)
# pip install --upgrade pip
# pip install poetry
poetry config virtualenvs.in-project true
poetry config installer.max-workers 10
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}
poetry config repositories.gitlab-fforesight https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
poetry config http-basic.gitlab-fforesight ${gitlab_user} ${gitlab_personal_access_token}
poetry env use $(pyenv which python)
poetry install --with=dev
poetry update
source .venv/bin/activate
pre-commit install
pre-commit autoupdate

View File

@ -1,8 +0,0 @@
from cv_analysis.table_inference import infer_lines
def grid_search() -> None: ...
if __name__ == "__main__":
grid_search()

View File

@ -1,6 +1,6 @@
from operator import itemgetter
from pathlib import Path
from typing import Callable, Iterable, Optional, Tuple
from typing import Callable, Optional, Tuple
import cv2
import matplotlib.pyplot as plt
@ -9,8 +9,6 @@ from kn_utils.logging import logger # type: ignore
from numpy import ndarray as Array
from scipy.stats import norm # type: ignore
from .utils.dotted_lines import detect_dotted_from_extrema
def show_multiple(arrs: Tuple[Array], title: str = ""):
plt.clf()
@ -152,65 +150,16 @@ def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
return line_list
def sharpen_sums(sums: Array) -> Array:
sums = sums.astype("int64")
shift = 3
diffs = abs(sums[shift:-shift] - sums[2 * shift :]) + abs(sums[shift:-shift] - sums[: -2 * shift])
f2 = filter_array(sums, FILTERS["col"][2])
return diffs
def detect_dotted_lines(
image: Array,
sums: Iterable,
horizontal: bool = True,
threshold: float = 1.0,
min_distance: int = 2,
max_distance: int = 20,
) -> bool:
key = "row" if horizontal else "col"
naive = filter_array(sums, FILTERS[key][1])
naive_lines = np.where((naive[1:-1] < naive[:-2]) * (naive[1:-1] < naive[2:]) * (sums[1:-1] < 250))[0] + 1
bool_array = np.zeros(image.shape[1 - int(horizontal)])
for idx in naive_lines:
band = image[idx - 1 : idx + 2, :] if horizontal else image[:, idx - 1 : idx + 1]
band_sums = np.mean(band, axis=1 - int(horizontal))
band_sums = filter_array(band_sums, FILTERS[key][1])
extrema = np.where((band_sums[1:-1] < band_sums[:-2]) * (band_sums[1:-1] < band_sums[2:]))[0] + 1
distances = extrema[1:] - extrema[:-1]
mean = np.mean(distances)
std = np.std(distances)
check = "" if (ratio := (mean / (std + 0.01))) > 1.5 and mean < 40 else ""
print(f"{idx:4} {mean:6.2f} {std:6.2f} {ratio:6.2f} {check}")
score = std # maybe make more advanced score function later
if (min_distance <= mean <= max_distance) and (score < threshold):
print(idx)
bool_array[idx] = 1
return bool_array
def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
key = "row" if horizontal else "col"
h, w = map(int, table_array.shape)
table_array = (
table_array[:, int(0.1 * w) : int(0.9 * w)] if horizontal else table_array[int(0.1 * h) : int(0.9 * h)]
)
sums = np.mean(table_array, axis=int(horizontal))
dotted = detect_dotted_lines(table_array, sums, horizontal=horizontal)
threshold = 0.3 * 255 # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
predicate = 1000.0 * ((sums < threshold) | dotted)
predicate = 1000.0 * (sums < threshold)
sums = np.maximum(
np.maximum(sums[1:-1], predicate[1:-1]),
np.maximum(predicate[:-2], predicate[2:]),
)
filtered_sums = filter_array(sums, FILTERS[key][1])
filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
@ -230,7 +179,9 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]:
cv2.imwrite("/tmp/table.png", img)
_, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
cv2.imwrite("/tmp/table_bin.png", img)
h, w = map(int, img.shape)
row_vals = map(int, get_lines_either(img, horizontal=True))
col_vals = map(int, get_lines_either(img, horizontal=False))

View File

@ -1,18 +0,0 @@
"""
General approach:
Get horizontal and vertical pixel sum extrema. Then take a band of k around each minimum (corresponding to darkest), e.g. k=3.
Recalculate minima for each band.
Compute a list of distances between minima.
Compute the mean and standard deviation between minima.
If rho:=std/(eta*mean) < phi for some threshold phi, the band contains a dotted line. -> logic: std can be larger for larger mean, i.e. more spaced-out dotted lines
Pros:
Intuitive and efficient.
Cons:
May not work for irregular/mixed dotted lines, such as (possibly) --*--*--*--*--*--*--*--*--*--*--
"""
from typing import Iterable
import numpy as np