Compare commits


32 Commits

Author SHA1 Message Date
Jonathan Kössler
799fe331c3 Merge branch 'bugfix/RED-10722' into 'master'
RED-10722: fix dead letter queue

Closes RED-10722

See merge request redactmanager/cv-analysis-service!32
2025-01-16 09:28:41 +01:00
Jonathan Kössler
dfbfc50556 chore: update version 2025-01-15 13:40:01 +01:00
Jonathan Kössler
63fbd387a3 chore: update pyinfra to v3.4.2 2025-01-15 13:32:57 +01:00
Jonathan Kössler
41dbfc69d9 chore: update pyinfra to v3.4.2 2025-01-14 16:52:13 +01:00
Jonathan Kössler
b73e9b2ed9 Merge branch 'feature/RED-10441' into 'master'
RED-10441: fix abandoned queues

Closes RED-10441

See merge request redactmanager/cv-analysis-service!31
2024-11-13 17:27:22 +01:00
Jonathan Kössler
92692281ce chore: update pyinfra to v3.3.5 2024-11-13 17:22:24 +01:00
Jonathan Kössler
cb0c58d699 chore: update pyinfra to v3.3.4 2024-11-13 16:41:04 +01:00
Jonathan Kössler
eb96403fe2 chore: update pyinfra to v3.3.3 2024-11-13 14:53:11 +01:00
Jonathan Kössler
c8daf888c6 chore: update pyinfra to v3.3.2 2024-11-13 09:45:43 +01:00
Jonathan Kössler
eb921c365d Merge branch 'chore/update_pyinfra' into 'master'
RES-858: fix graceful shutdown

See merge request redactmanager/cv-analysis-service!30
2024-09-30 11:01:07 +02:00
Jonathan Kössler
7762f81a4a chore: update pyinfra to v3.2.11 2024-09-30 10:07:29 +02:00
Jonathan Kössler
e991cfe1bf Merge branch 'chore/update_pyinfra' into 'master'
RES-844 && RES-856: fix tracing & proto format

See merge request redactmanager/cv-analysis-service!29
2024-09-27 08:21:46 +02:00
Jonathan Kössler
35c5ee5831 fix: opentelemtry service name 2024-09-26 13:46:05 +02:00
Jonathan Kössler
e97f34391a chore: update pyinfra to v3.2.10 2024-09-26 13:44:48 +02:00
Francisco Schulz
1fa10721aa Merge branch 'RED-10017-investigate-crashing-py-services-when-upload-large-number-of-files' into 'master'
RED-10017 "Investigate crashing py services when upload large number of files"

See merge request redactmanager/cv-analysis-service!28
2024-09-23 18:55:08 +02:00
Francisco Schulz
7f0d0a48db RED-10017 "Investigate crashing py services when upload large number of files" 2024-09-23 18:55:08 +02:00
Francisco Schulz
333cd498b9 Merge branch 'RES-842-pyinfra-fix-rabbit-mq-handler-shuts-down-when-queues-not-available-yet' into 'master'
chore: update pyinfra version, increase pkg version

Closes RES-842

See merge request redactmanager/cv-analysis-service!27
2024-08-30 14:57:14 +02:00
francisco.schulz
9df8c8f936 chore: update service version 2024-08-30 08:25:00 -04:00
francisco.schulz
60adf0c381 chore: update pyinfra version 2024-08-30 08:15:34 -04:00
francisco.schulz
537f605a85 chore: remove renovate bot config 2024-08-29 11:48:15 -04:00
francisco.schulz
66987ab8e9 chore: update pyinfra version, increase pkg version 2024-08-29 11:29:07 -04:00
Jonathan Kössler
43570142c3 Merge branch 'feature/RES-840-add-client-connector-error' into 'master'
fix: add exception handling for ClientConnectorError

Closes RES-840

See merge request redactmanager/cv-analysis-service!26
2024-08-28 15:47:01 +02:00
Jonathan Kössler
d457f49001 chore: update pyinfra version 2024-08-28 14:47:29 +02:00
Jonathan Kössler
536928c032 Merge branch 'feature/RES-826-pyinfra-update' into 'master'
chore: bump pyinfra version

Closes RES-826

See merge request redactmanager/cv-analysis-service!25
2024-08-26 16:15:17 +02:00
Jonathan Kössler
dc6183490f chore: bump pyinfra version 2024-08-26 15:13:59 +02:00
Jonathan Kössler
bbc2d0c8bf chore: bump pyinfra version 2024-08-22 09:33:26 +02:00
Jonathan Kössler
3462faf8c7 Merge branch 'feature/RES-731-add-queues-per-tenant' into 'master'
RES-731: add queues per tenant

Closes RES-731

See merge request redactmanager/cv-analysis-service!24
2024-08-19 15:03:38 +02:00
Jonathan Kössler
b136cc9ff3 RES-731: add queues per tenant 2024-08-19 15:03:37 +02:00
Julius Unverfehrt
cf431df1cb Merge branch 'table_lines' into 'master'
Table lines

See merge request redactmanager/cv-analysis-service!23
2024-05-15 16:53:27 +02:00
Julius Unverfehrt
e86214f6b7 Merge branch 'table_lines' into 'master'
fix: maping of image coordinates to pdf coordinates (table inference)

See merge request redactmanager/cv-analysis-service!22
2024-05-15 13:02:24 +02:00
Isaac Riley
3c9ddfcf0f Merge branch 'table_lines' into 'master'
fix: check nonzero list length in filter_fp_col_lines

See merge request redactmanager/cv-analysis-service!21
2024-05-13 09:39:20 +02:00
Isaac Riley
0f45a25bc8 Merge branch 'table_lines' into 'master'
fix: make envvar conditional unfailable

See merge request redactmanager/cv-analysis-service!20
2024-05-08 15:33:45 +02:00
12 changed files with 2879 additions and 2299 deletions

.gitignore (vendored): 2 lines changed

@@ -50,5 +50,3 @@ __pycache__/
 # unignore files
 !bom.*
-dotted/
-

.gitlab-ci.yml

@@ -12,15 +12,6 @@ variables:
   NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
   IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
-
-stages:
-  - data
-  - setup
-  - unit-tests
-  - versioning
-  - build
-  - integration-tests
-  - release
 pages:
   only:
     - master # KEEP THIS, necessary because `master` branch and not `main` branch

.pre-commit-config.yaml

@@ -5,10 +5,10 @@ default_language_version:
   python: python3.10
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
-      # - id: end-of-file-fixer
+      - id: end-of-file-fixer
       - id: check-yaml
         args: [--unsafe] # needed for .gitlab-ci.yml
       - id: check-toml
@@ -34,7 +34,7 @@ repos:
       - --profile black
   - repo: https://github.com/psf/black
-    rev: 24.3.0
+    rev: 24.10.0
     hooks:
       - id: black
         # exclude: ^(docs/|notebooks/|data/|src/secrets/)
@@ -42,7 +42,7 @@ repos:
       - --line-length=120
   - repo: https://github.com/compilerla/conventional-pre-commit
-    rev: v3.2.0
+    rev: v4.0.0
     hooks:
       - id: conventional-pre-commit
         pass_filenames: false

Service configuration (TOML)

@@ -1,11 +1,22 @@
+[asyncio]
+max_concurrent_tasks = 10
+
+[dynamic_tenant_queues]
+enabled = true
+
 [metrics.prometheus]
 enabled = true
 prefix = "redactmanager_cv_analysis_service"
-[tracing.opentelemetry]
+[tracing]
 enabled = true
+# possible values "opentelemetry" | "azure_monitor" (Excpects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
+type = "azure_monitor"
+
+[tracing.opentelemetry]
 endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
-service_name = "redactmanager_cv_analyisis_service"
+service_name = "redactmanager_cv_analysis_service"
 exporter = "otlp"

 [webserver]
@@ -25,6 +36,15 @@ input_queue = "request_queue"
 output_queue = "response_queue"
 dead_letter_queue = "dead_letter_queue"
+tenant_event_queue_suffix = "_tenant_event_queue"
+tenant_event_dlq_suffix = "_tenant_events_dlq"
+tenant_exchange_name = "tenants-exchange"
+queue_expiration_time = 300000 # 5 minutes in milliseconds
+service_request_queue_prefix = "cv_analysis_request_queue"
+service_request_exchange_name = "cv_analysis_request_exchange"
+service_response_exchange_name = "cv_analysis_response_exchange"
+service_dlq_name = "cv_analysis_dlq"

 [storage]
 backend = "s3"
@@ -41,4 +61,7 @@ connection_string = ""
 [storage.tenant_server]
 public_key = ""
 endpoint = "http://tenant-user-management:8081/internal-api/tenants"
+
+[kubernetes]
+pod_name = "test_pod"
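The dynamic tenant-queue settings above compose per-tenant queue names from a tenant identifier plus the configured prefix/suffix strings. A minimal sketch of that naming scheme (the helper functions and the tenant id are hypothetical, not part of the service's actual API):

```python
# Hypothetical sketch: derive per-tenant queue names from the config above.
# The function names are illustrative; the service's real code may differ.

TENANT_QUEUE_CONFIG = {
    "tenant_event_queue_suffix": "_tenant_event_queue",
    "tenant_event_dlq_suffix": "_tenant_events_dlq",
    "service_request_queue_prefix": "cv_analysis_request_queue",
}


def tenant_event_queue(tenant_id: str, cfg: dict = TENANT_QUEUE_CONFIG) -> str:
    """Per-tenant event queue name, built from the configured suffix."""
    return f"{tenant_id}{cfg['tenant_event_queue_suffix']}"


def tenant_request_queue(tenant_id: str, cfg: dict = TENANT_QUEUE_CONFIG) -> str:
    """Per-tenant request queue name, built from the configured prefix."""
    return f"{cfg['service_request_queue_prefix']}_{tenant_id}"
```

Under this assumed scheme, a tenant "acme" would get queues like `acme_tenant_event_queue` and `cv_analysis_request_queue_acme`, with `queue_expiration_time` (5 minutes) controlling how long an idle per-tenant queue survives.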

Nix shell definition

@@ -15,7 +15,6 @@
   (pkgs.buildFHSUserEnv rec {
     name = "cv-analysis-service";
     targetPkgs = pkgs: (with pkgs; [
-      python310
       poppler_utils
       zlib
       poetry

poetry.lock (generated): 4,990 lines changed
File diff suppressed because it is too large.

pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cv-analysis-service"
-version = "2.19.0"
+version = "2.30.0"
 description = ""
 authors = ["Isaac Riley <isaac.riley@knecon.com>"]
 readme = "README.md"
@@ -25,13 +25,12 @@ coverage = "^5.5"
 dependency-check = "^0.6.0"
 lorem-text = "^2.1"
 PyMuPDF = "^1.19.6"
-pyinfra = { version = "^2.2.0", source = "gitlab-research" }
-kn-utils = { version = "0.2.7", source = "gitlab-research" }
+pyinfra = { version = "3.4.2", source = "gitlab-research" }
+kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
 pdf2img = { version = "0.7.0", source = "gitlab-red" }
 dvc-azure = "^2.21.2"
 pymupdf = "^1.24.1"
 types-pillow = "^10.2.0.20240423"
-#matplotlib-backend-wezterm = "^2.1.2"

 [tool.poetry.group.test.dependencies]
 pytest = "^7.0.1"
@@ -77,7 +76,7 @@ priority = "explicit"
 [tool.pylint]
 max-line-length = 120
-docstring-min-length=4
+docstring-min-length = 4
 extension-pkg-whitelist = ["cv2"]
 extension-pkg-allow-list = ["cv2"]

renovate.json (deleted)

@@ -1,6 +0,0 @@
-{
-  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
-  "extends": [
-    "config:base"
-  ]
-}

scripts/devenvsetup.sh (new file): 41 lines

@@ -0,0 +1,41 @@
+#!/bin/bash
+
+python_version=$1
+gitlab_user=$2
+gitlab_personal_access_token=$3
+
+# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
+# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
+# cd $latest_dir
+
+pyenv install $python_version
+pyenv local $python_version
+pyenv shell $python_version
+
+# install poetry globally (PREFERRED), only need to install it once
+# curl -sSL https://install.python-poetry.org | python3 -
+# remember to update poetry once in a while
+poetry self update
+
+# install poetry in current python environment, can lead to multiple instances
+# of poetry being installed on one system (DISPREFERRED)
+# pip install --upgrade pip
+# pip install poetry
+
+poetry config virtualenvs.in-project true
+poetry config installer.max-workers 10
+poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
+poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
+poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
+poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}
+poetry config repositories.gitlab-fforesight https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
+poetry config http-basic.gitlab-fforesight ${gitlab_user} ${gitlab_personal_access_token}
+
+poetry env use $(pyenv which python)
+poetry install --with=dev
+poetry update
+
+source .venv/bin/activate
+pre-commit install
+pre-commit autoupdate

Deleted file (grid-search stub)

@@ -1,8 +0,0 @@
-from cv_analysis.table_inference import infer_lines
-
-
-def grid_search() -> None: ...
-
-
-if __name__ == "__main__":
-    grid_search()

cv_analysis/table_inference.py

@@ -1,6 +1,6 @@
 from operator import itemgetter
 from pathlib import Path
-from typing import Callable, Iterable, Optional, Tuple
+from typing import Callable, Optional, Tuple

 import cv2
 import matplotlib.pyplot as plt
@@ -9,8 +9,6 @@ from kn_utils.logging import logger  # type: ignore
 from numpy import ndarray as Array
 from scipy.stats import norm  # type: ignore
-from .utils.dotted_lines import detect_dotted_from_extrema
-

 def show_multiple(arrs: Tuple[Array], title: str = ""):
     plt.clf()
@@ -152,65 +150,16 @@ def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
     return line_list
-
-
-def sharpen_sums(sums: Array) -> Array:
-    sums = sums.astype("int64")
-    shift = 3
-    diffs = abs(sums[shift:-shift] - sums[2 * shift :]) + abs(sums[shift:-shift] - sums[: -2 * shift])
-    f2 = filter_array(sums, FILTERS["col"][2])
-    return diffs
-
-
-def detect_dotted_lines(
-    image: Array,
-    sums: Iterable,
-    horizontal: bool = True,
-    threshold: float = 1.0,
-    min_distance: int = 2,
-    max_distance: int = 20,
-) -> bool:
-    key = "row" if horizontal else "col"
-    naive = filter_array(sums, FILTERS[key][1])
-    naive_lines = np.where((naive[1:-1] < naive[:-2]) * (naive[1:-1] < naive[2:]) * (sums[1:-1] < 250))[0] + 1
-    bool_array = np.zeros(image.shape[1 - int(horizontal)])
-    for idx in naive_lines:
-        band = image[idx - 1 : idx + 2, :] if horizontal else image[:, idx - 1 : idx + 1]
-        band_sums = np.mean(band, axis=1 - int(horizontal))
-        band_sums = filter_array(band_sums, FILTERS[key][1])
-        extrema = np.where((band_sums[1:-1] < band_sums[:-2]) * (band_sums[1:-1] < band_sums[2:]))[0] + 1
-        distances = extrema[1:] - extrema[:-1]
-        mean = np.mean(distances)
-        std = np.std(distances)
-        check = "" if (ratio := (mean / (std + 0.01))) > 1.5 and mean < 40 else ""
-        print(f"{idx:4} {mean:6.2f} {std:6.2f} {ratio:6.2f} {check}")
-        score = std  # maybe make more advanced score function later
-        if (min_distance <= mean <= max_distance) and (score < threshold):
-            print(idx)
-            bool_array[idx] = 1
-    return bool_array
-
-
 def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
     key = "row" if horizontal else "col"
-    h, w = map(int, table_array.shape)
-    table_array = (
-        table_array[:, int(0.1 * w) : int(0.9 * w)] if horizontal else table_array[int(0.1 * h) : int(0.9 * h)]
-    )
     sums = np.mean(table_array, axis=int(horizontal))
-    dotted = detect_dotted_lines(table_array, sums, horizontal=horizontal)
     threshold = 0.3 * 255  # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
-    predicate = 1000.0 * ((sums < threshold) | dotted)
+    predicate = 1000.0 * (sums < threshold)
     sums = np.maximum(
         np.maximum(sums[1:-1], predicate[1:-1]),
         np.maximum(predicate[:-2], predicate[2:]),
     )
     filtered_sums = filter_array(sums, FILTERS[key][1])
     filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
     filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
@@ -230,7 +179,9 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
 def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]:
+    cv2.imwrite("/tmp/table.png", img)
     _, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
+    cv2.imwrite("/tmp/table_bin.png", img)
     h, w = map(int, img.shape)
     row_vals = map(int, get_lines_either(img, horizontal=True))
     col_vals = map(int, get_lines_either(img, horizontal=False))
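The core idea retained in get_lines_either is that table ruling lines show up as rows or columns whose mean pixel intensity is well below the page background. A self-contained simplification of that thresholding step (no smoothing filters; the function name is illustrative, not from the repository):

```python
import numpy as np


def detect_dark_lines(img: np.ndarray, horizontal: bool = True,
                      threshold: float = 0.3 * 255) -> list[int]:
    # Average pixel intensity along each row (horizontal=True) or each
    # column (horizontal=False); dark ruling lines produce low-mean bands.
    sums = np.mean(img, axis=int(horizontal))
    # Indices whose mean intensity falls below the darkness threshold.
    return [int(i) for i in np.where(sums < threshold)[0]]
```

For a mostly-white image with one black row and one black column, the horizontal pass returns the dark row index and the vertical pass the dark column index; the real function additionally smooths the profile with the FILTERS kernels and suppresses false positives before reporting line positions.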

cv_analysis/utils/dotted_lines.py (deleted)

@@ -1,18 +0,0 @@
-"""
-General approach:
-    Get horizontal and vertical pixel sum extrema. Then take a band of k around each minimum (corresponding to darkest), e.g. k=3.
-    Recalculate minima for each band.
-    Compute a list of distances between minima.
-    Compute the mean and standard deviation between minima.
-    If rho := std/(eta*mean) < phi for some threshold phi, the band contains a dotted line. -> logic: std can be larger for larger mean, i.e. more spaced-out dotted lines
-Pros:
-    Intuitive and efficient.
-Cons:
-    May not work for irregular/mixed dotted lines, such as (possibly) --*--*--*--*--*--*--*--*--*--*--
-"""
-
-from typing import Iterable
-
-import numpy as np
-
-
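The deleted module's docstring describes a regularity heuristic: within a dark band, compute the distances between successive intensity minima; if rho = std/(eta*mean) falls below a threshold phi, the minima are evenly spaced and the band is likely a dotted line. A minimal sketch of that statistic (illustrative only, not the removed implementation; eta and phi defaults are assumed):

```python
import numpy as np


def dotted_line_score(minima_positions, eta: float = 1.0) -> float:
    """rho = std / (eta * mean) over distances between successive minima.

    Evenly spaced minima give a small rho, suggesting a dotted line."""
    distances = np.diff(np.asarray(minima_positions, dtype=float))
    mean = float(distances.mean())
    # Small epsilon avoids division by zero for degenerate inputs.
    return float(distances.std() / (eta * mean + 1e-9))


def is_dotted(minima_positions, phi: float = 0.2) -> bool:
    # The band is classified as dotted when rho falls below threshold phi.
    return dotted_line_score(minima_positions) < phi
```

Normalizing std by the mean is what makes the test spacing-invariant: widely spaced dots have larger absolute jitter, so raw std alone would reject them, which matches the docstring's note that "std can be larger for larger mean".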