Compare commits

..

2 Commits

Author SHA1 Message Date
iriley
99359596da chore: bump package versions 2024-07-19 22:47:05 +02:00
iriley
ef02253ad7 chore: bump package versions 2024-07-19 22:43:54 +02:00
12 changed files with 2297 additions and 2877 deletions

2
.gitignore vendored
View File

@@ -50,3 +50,5 @@ __pycache__/
# unignore files
!bom.*
dotted/

View File

@@ -12,6 +12,15 @@ variables:
NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
stages:
- data
- setup
- unit-tests
- versioning
- build
- integration-tests
- release
pages:
only:
- master # KEEP THIS, necessary because `master` branch and not `main` branch

View File

@@ -5,10 +5,10 @@ default_language_version:
python: python3.10
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
# - id: end-of-file-fixer
- id: check-yaml
args: [--unsafe] # needed for .gitlab-ci.yml
- id: check-toml
@@ -34,7 +34,7 @@ repos:
- --profile black
- repo: https://github.com/psf/black
rev: 24.10.0
rev: 24.3.0
hooks:
- id: black
# exclude: ^(docs/|notebooks/|data/|src/secrets/)
@@ -42,7 +42,7 @@ repos:
- --line-length=120
- repo: https://github.com/compilerla/conventional-pre-commit
rev: v4.0.0
rev: v3.2.0
hooks:
- id: conventional-pre-commit
pass_filenames: false

View File

@@ -1,22 +1,11 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true
[metrics.prometheus]
enabled = true
prefix = "redactmanager_cv_analysis_service"
[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"
[tracing.opentelemetry]
enabled = true
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_cv_analysis_service"
service_name = "redactmanager_cv_analyisis_service"
exporter = "otlp"
[webserver]
@@ -36,15 +25,6 @@ input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "cv_analysis_request_queue"
service_request_exchange_name = "cv_analysis_request_exchange"
service_response_exchange_name = "cv_analysis_response_exchange"
service_dlq_name = "cv_analysis_dlq"
[storage]
backend = "s3"
@@ -61,7 +41,4 @@ connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
[kubernetes]
pod_name = "test_pod"
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

View File

@@ -15,6 +15,7 @@
(pkgs.buildFHSUserEnv rec {
name = "cv-analysis-service";
targetPkgs = pkgs: (with pkgs; [
python310
poppler_utils
zlib
poetry

4986
poetry.lock generated

File diff suppressed because it is too large. Load Diff

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "cv-analysis-service"
version = "2.30.0"
version = "2.19.0"
description = ""
authors = ["Isaac Riley <isaac.riley@knecon.com>"]
readme = "README.md"
@@ -25,12 +25,13 @@ coverage = "^5.5"
dependency-check = "^0.6.0"
lorem-text = "^2.1"
PyMuPDF = "^1.19.6"
pyinfra = { version = "3.4.2", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
pyinfra = { version = "^2.2.0", source = "gitlab-research" }
kn-utils = { version = "0.2.7", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2"
pymupdf = "^1.24.1"
types-pillow = "^10.2.0.20240423"
#matplotlib-backend-wezterm = "^2.1.2"
[tool.poetry.group.test.dependencies]
pytest = "^7.0.1"
@@ -76,7 +77,7 @@ priority = "explicit"
[tool.pylint]
max-line-length = 120
docstring-min-length = 4
docstring-min-length=4
extension-pkg-whitelist = ["cv2"]
extension-pkg-allow-list = ["cv2"]

6
renovate.json Normal file
View File

@@ -0,0 +1,6 @@
{
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [
"config:base"
]
}

View File

@@ -1,41 +0,0 @@
#!/bin/bash
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3
# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
# cd $latest_dir
pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version
# install poetry globally (PREFERRED), only need to install it once
# curl -sSL https://install.python-poetry.org | python3 -
# remember to update poetry once in a while
poetry self update
# install poetry in current python environment, can lead to multiple instances of poetry being installed on one system (DISPREFERRED)
# pip install --upgrade pip
# pip install poetry
poetry config virtualenvs.in-project true
poetry config installer.max-workers 10
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}
poetry config repositories.gitlab-fforesight https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
poetry config http-basic.gitlab-fforesight ${gitlab_user} ${gitlab_personal_access_token}
poetry env use $(pyenv which python)
poetry install --with=dev
poetry update
source .venv/bin/activate
pre-commit install
pre-commit autoupdate

8
scripts/grid_search.py Normal file
View File

@@ -0,0 +1,8 @@
from cv_analysis.table_inference import infer_lines
def grid_search() -> None: ...
if __name__ == "__main__":
grid_search()

View File

@@ -1,6 +1,6 @@
from operator import itemgetter
from pathlib import Path
from typing import Callable, Optional, Tuple
from typing import Callable, Iterable, Optional, Tuple
import cv2
import matplotlib.pyplot as plt
@@ -9,6 +9,8 @@ from kn_utils.logging import logger # type: ignore
from numpy import ndarray as Array
from scipy.stats import norm # type: ignore
from .utils.dotted_lines import detect_dotted_from_extrema
def show_multiple(arrs: Tuple[Array], title: str = ""):
plt.clf()
@@ -150,16 +152,65 @@ def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
return line_list
def sharpen_sums(sums: Array) -> Array:
sums = sums.astype("int64")
shift = 3
diffs = abs(sums[shift:-shift] - sums[2 * shift :]) + abs(sums[shift:-shift] - sums[: -2 * shift])
f2 = filter_array(sums, FILTERS["col"][2])
return diffs
def detect_dotted_lines(
image: Array,
sums: Iterable,
horizontal: bool = True,
threshold: float = 1.0,
min_distance: int = 2,
max_distance: int = 20,
) -> bool:
key = "row" if horizontal else "col"
naive = filter_array(sums, FILTERS[key][1])
naive_lines = np.where((naive[1:-1] < naive[:-2]) * (naive[1:-1] < naive[2:]) * (sums[1:-1] < 250))[0] + 1
bool_array = np.zeros(image.shape[1 - int(horizontal)])
for idx in naive_lines:
band = image[idx - 1 : idx + 2, :] if horizontal else image[:, idx - 1 : idx + 1]
band_sums = np.mean(band, axis=1 - int(horizontal))
band_sums = filter_array(band_sums, FILTERS[key][1])
extrema = np.where((band_sums[1:-1] < band_sums[:-2]) * (band_sums[1:-1] < band_sums[2:]))[0] + 1
distances = extrema[1:] - extrema[:-1]
mean = np.mean(distances)
std = np.std(distances)
check = "" if (ratio := (mean / (std + 0.01))) > 1.5 and mean < 40 else ""
print(f"{idx:4} {mean:6.2f} {std:6.2f} {ratio:6.2f} {check}")
score = std # maybe make more advanced score function later
if (min_distance <= mean <= max_distance) and (score < threshold):
print(idx)
bool_array[idx] = 1
return bool_array
def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
key = "row" if horizontal else "col"
h, w = map(int, table_array.shape)
table_array = (
table_array[:, int(0.1 * w) : int(0.9 * w)] if horizontal else table_array[int(0.1 * h) : int(0.9 * h)]
)
sums = np.mean(table_array, axis=int(horizontal))
dotted = detect_dotted_lines(table_array, sums, horizontal=horizontal)
threshold = 0.3 * 255 # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
predicate = 1000.0 * (sums < threshold)
predicate = 1000.0 * ((sums < threshold) | dotted)
sums = np.maximum(
np.maximum(sums[1:-1], predicate[1:-1]),
np.maximum(predicate[:-2], predicate[2:]),
)
filtered_sums = filter_array(sums, FILTERS[key][1])
filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
@@ -179,9 +230,7 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]:
cv2.imwrite("/tmp/table.png", img)
_, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
cv2.imwrite("/tmp/table_bin.png", img)
h, w = map(int, img.shape)
row_vals = map(int, get_lines_either(img, horizontal=True))
col_vals = map(int, get_lines_either(img, horizontal=False))

View File

@@ -0,0 +1,18 @@
"""
General approach:
Get horizontal and vertical pixel sum extrema. Then take a band of k around each minimum (corresponding to darkest), e.g. k=3.
Recalculate minima for each band.
Compute a list of distances between minima.
Compute the mean and standard deviation between minima.
If rho:=std/(eta*mean) < phi for some threshold phi, the band contains a dotted line. -> logic: std can be larger for larger mean, i.e. more spaced-out dotted lines
Pros:
Intuitive and efficient.
Cons:
May not work for irregular/mixed dotted lines, such as (possibly) --*--*--*--*--*--*--*--*--*--*--
"""
from typing import Iterable
import numpy as np