chore: bump package versions

2024-07-19 22:47:05 +02:00 · 2024-07-19 22:43:54 +02:00
12 changed files with 2297 additions and 2877 deletions
--- a/.gitignore
+++ b/.gitignore
@ -50,3 +50,5 @@ __pycache__/
 # unignore files
 !bom.*
 dotted/
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -12,6 +12,15 @@ variables:
  NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
  IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
 stages:
  - data
  - setup
  - unit-tests
  - versioning
  - build
  - integration-tests
  - release
 pages:
  only:
    - master # KEEP THIS, necessary because `master` branch and not `main` branch
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -5,10 +5,10 @@ default_language_version:
  python: python3.10
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v5.0.0
+    rev: v4.5.0
    hooks:
      - id: trailing-whitespace
-      - id: end-of-file-fixer
+      # - id: end-of-file-fixer
      - id: check-yaml
        args: [--unsafe] # needed for .gitlab-ci.yml
      - id: check-toml
@ -34,7 +34,7 @@ repos:
          - --profile black
  - repo: https://github.com/psf/black
-    rev: 24.10.0
+    rev: 24.3.0
    hooks:
      - id: black
        # exclude: ^(docs/|notebooks/|data/|src/secrets/)
@ -42,7 +42,7 @@ repos:
          - --line-length=120
  - repo: https://github.com/compilerla/conventional-pre-commit
-    rev: v4.0.0
+    rev: v3.2.0
    hooks:
      - id: conventional-pre-commit
        pass_filenames: false
--- a/config/pyinfra.toml
+++ b/config/pyinfra.toml
@ -1,22 +1,11 @@
 [asyncio]
 max_concurrent_tasks = 10
 [dynamic_tenant_queues]
 enabled = true
 [metrics.prometheus]
 enabled = true
 prefix = "redactmanager_cv_analysis_service"
 [tracing]
 enabled = true
 # possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
 type = "azure_monitor"
 [tracing.opentelemetry]
 enabled = true
 endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
-service_name = "redactmanager_cv_analysis_service"
+service_name = "redactmanager_cv_analyisis_service"
 exporter = "otlp"
 [webserver]
@ -36,15 +25,6 @@ input_queue = "request_queue"
 output_queue = "response_queue"
 dead_letter_queue = "dead_letter_queue"
 tenant_event_queue_suffix = "_tenant_event_queue"
 tenant_event_dlq_suffix = "_tenant_events_dlq"
 tenant_exchange_name = "tenants-exchange"
 queue_expiration_time = 300000                                   # 5 minutes in milliseconds
 service_request_queue_prefix = "cv_analysis_request_queue"
 service_request_exchange_name = "cv_analysis_request_exchange"
 service_response_exchange_name = "cv_analysis_response_exchange"
 service_dlq_name = "cv_analysis_dlq"
 [storage]
 backend = "s3"
@ -61,7 +41,4 @@ connection_string = ""
 [storage.tenant_server]
 public_key = ""
-endpoint = "http://tenant-user-management:8081/internal-api/tenants"
+endpoint =  "http://tenant-user-management:8081/internal-api/tenants"
 [kubernetes]
 pod_name = "test_pod"
--- a/flake.nix
+++ b/flake.nix
@ -15,6 +15,7 @@
        (pkgs.buildFHSUserEnv rec {
          name = "cv-analysis-service";
          targetPkgs = pkgs: (with pkgs; [
            python310
            poppler_utils
            zlib
            poetry
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "cv-analysis-service"
-version = "2.30.0"
+version = "2.19.0"
 description = ""
 authors = ["Isaac Riley <isaac.riley@knecon.com>"]
 readme = "README.md"
@ -25,12 +25,13 @@ coverage = "^5.5"
 dependency-check = "^0.6.0"
 lorem-text = "^2.1"
 PyMuPDF = "^1.19.6"
-pyinfra = { version = "3.4.2", source = "gitlab-research" }
+pyinfra = { version = "^2.2.0", source = "gitlab-research" }
-kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
+kn-utils = { version = "0.2.7", source = "gitlab-research" }
 pdf2img = { version = "0.7.0", source = "gitlab-red" }
 dvc-azure = "^2.21.2"
 pymupdf = "^1.24.1"
 types-pillow = "^10.2.0.20240423"
 #matplotlib-backend-wezterm = "^2.1.2"
 [tool.poetry.group.test.dependencies]
 pytest = "^7.0.1"
@ -76,7 +77,7 @@ priority = "explicit"
 [tool.pylint]
 max-line-length = 120
-docstring-min-length = 4
+docstring-min-length=4
 extension-pkg-whitelist = ["cv2"]
 extension-pkg-allow-list = ["cv2"]
--- a/renovate.json
+++ b/renovate.json
@ -0,0 +1,6 @@
 {
  "$schema": "https://docs.renovatebot.com/renovate-schema.json",
  "extends": [
    "config:base"
  ]
 }
--- a/scripts/devenvsetup.sh
+++ b/scripts/devenvsetup.sh
@ -1,41 +0,0 @@
 #!/bin/bash
 # Set up a local development environment: pyenv Python version, poetry
 # configuration (including credentials for the private GitLab package
 # registries), project dependencies and pre-commit hooks.
 #
 # Usage: devenvsetup.sh <python_version> <gitlab_user> <gitlab_personal_access_token>
 if [ "$#" -lt 3 ]; then
     # Without all three arguments the poetry http-basic entries below would be
     # written with empty credentials, so stop early instead.
     echo "usage: $0 <python_version> <gitlab_user> <gitlab_personal_access_token>" >&2
     exit 1
 fi
 python_version=$1
 gitlab_user=$2
 gitlab_personal_access_token=$3
 # cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
 # latest_dir=$(ls -td -- */ | head -n 1)  # should be the dir cookiecutter just created
 # cd $latest_dir
 # All expansions are quoted so values containing spaces or glob characters
 # are passed through as a single argument.
 pyenv install "$python_version"
 pyenv local "$python_version"
 pyenv shell "$python_version"
 # install poetry globally (PREFERRED), only need to install it once
 # curl -sSL https://install.python-poetry.org | python3 -
 # remember to update poetry once in a while
 poetry self update
 # install poetry in current python environment, can lead to multiple instances of poetry being installed on one system (DISPREFERRED)
 # pip install --upgrade pip
 # pip install poetry
 poetry config virtualenvs.in-project true
 poetry config installer.max-workers 10
 # Register each private package registry together with its credentials.
 poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
 poetry config http-basic.gitlab-research "${gitlab_user}" "${gitlab_personal_access_token}"
 poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
 poetry config http-basic.gitlab-red "${gitlab_user}" "${gitlab_personal_access_token}"
 poetry config repositories.gitlab-fforesight https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
 poetry config http-basic.gitlab-fforesight "${gitlab_user}" "${gitlab_personal_access_token}"
 # Point poetry at the interpreter pyenv resolved for this directory.
 poetry env use "$(pyenv which python)"
 poetry install --with=dev
 poetry update
 source .venv/bin/activate
 pre-commit install
 pre-commit autoupdate
--- a/scripts/grid_search.py
+++ b/scripts/grid_search.py
@ -0,0 +1,8 @@
 from cv_analysis.table_inference import infer_lines
 # Placeholder entry point for this script: the body is a bare `...`, so calling it currently does nothing.
 def grid_search() -> None: ...
 if __name__ == "__main__":
    grid_search()
--- a/src/cv_analysis/table_inference.py
+++ b/src/cv_analysis/table_inference.py
@ -1,6 +1,6 @@
 from operator import itemgetter
 from pathlib import Path
-from typing import Callable, Optional, Tuple
+from typing import Callable, Iterable, Optional, Tuple
 import cv2
 import matplotlib.pyplot as plt
@ -9,6 +9,8 @@ from kn_utils.logging import logger  # type: ignore
 from numpy import ndarray as Array
 from scipy.stats import norm  # type: ignore
 from .utils.dotted_lines import detect_dotted_from_extrema
 def show_multiple(arrs: Tuple[Array], title: str = ""):
    plt.clf()
@ -150,16 +152,65 @@ def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
    return line_list
 def sharpen_sums(sums: Array) -> Array:
    """Return, for each interior position, how much it differs from both neighbours 3 steps away.

    For every index ``i`` with ``3 <= i < len(sums) - 3`` the result holds
    ``|sums[i] - sums[i + 3]| + |sums[i] - sums[i - 3]|``.

    Args:
        sums: One-dimensional numeric profile.

    Returns:
        An ``int64`` array with ``len(sums) - 6`` entries (empty when ``sums``
        has 6 or fewer elements).
    """
    # Cast first so that differences of unsigned inputs (e.g. uint8) cannot wrap around.
    sums = sums.astype("int64")
    shift = 3
    centre = sums[shift:-shift]
    ahead = sums[2 * shift :]
    behind = sums[: -2 * shift]
    return np.abs(centre - ahead) + np.abs(centre - behind)
 def detect_dotted_lines(
    image: Array,
    sums: Iterable,
    horizontal: bool = True,
    threshold: float = 1.0,
    min_distance: int = 2,
    max_distance: int = 20,
 ) -> Array:
    """Mark the rows (or columns) of ``image`` whose dark pixels are regularly spaced.

    Candidate lines are the strict local minima of the filtered ``sums`` profile
    whose raw value is below 250. For each candidate a 3-pixel-wide band around
    it is averaged across its width, filtered, and the spacing between the local
    minima along the band is measured. A candidate is accepted when the mean
    spacing lies in ``[min_distance, max_distance]`` and the standard deviation
    of the spacing is below ``threshold``.

    Args:
        image: Two-dimensional image array.
        sums: Per-row (``horizontal=True``) or per-column profile of ``image``;
            it is indexed positionally, so it is converted with ``np.asarray``.
        horizontal: Analyse rows when true, columns otherwise.
        threshold: Upper bound (exclusive) on the standard deviation of the spacing.
        min_distance: Smallest accepted mean spacing between minima.
        max_distance: Largest accepted mean spacing between minima.

    Returns:
        A boolean array of length ``image.shape[0]`` (rows) or ``image.shape[1]``
        (columns) that is ``True`` at every accepted index. It is boolean so it
        can be combined with other masks using ``|`` / ``&``.
    """
    key = "row" if horizontal else "col"
    sums = np.asarray(sums)
    naive = filter_array(sums, FILTERS[key][1])
    is_local_min = (naive[1:-1] < naive[:-2]) & (naive[1:-1] < naive[2:])
    # +1 maps positions in the [1:-1] slice back to indices of the full profile.
    naive_lines = np.where(is_local_min & (sums[1:-1] < 250))[0] + 1

    across = 1 - int(horizontal)  # axis that runs across the band's width
    bool_array = np.zeros(image.shape[across], dtype=bool)
    for idx in naive_lines:
        # Same 3-pixel band (idx - 1 .. idx + 1) in both orientations.
        band = image[idx - 1 : idx + 2, :] if horizontal else image[:, idx - 1 : idx + 2]
        band_sums = filter_array(np.mean(band, axis=across), FILTERS[key][1])
        extrema = np.where((band_sums[1:-1] < band_sums[:-2]) & (band_sums[1:-1] < band_sums[2:]))[0] + 1
        distances = np.diff(extrema)
        if distances.size == 0:
            # Fewer than two minima: no spacing to measure (the mean would be NaN).
            continue
        mean = np.mean(distances)
        score = np.std(distances)  # maybe make more advanced score function later
        if (min_distance <= mean <= max_distance) and (score < threshold):
            bool_array[idx] = True
    return bool_array
 def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
    key = "row" if horizontal else "col"
    h, w = map(int, table_array.shape)
    table_array = (
        table_array[:, int(0.1 * w) : int(0.9 * w)] if horizontal else table_array[int(0.1 * h) : int(0.9 * h)]
    )
    sums = np.mean(table_array, axis=int(horizontal))
    dotted = detect_dotted_lines(table_array, sums, horizontal=horizontal)
    threshold = 0.3 * 255  # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
-    predicate = 1000.0 * (sums < threshold)
+    predicate = 1000.0 * ((sums < threshold) | dotted)
    sums = np.maximum(
        np.maximum(sums[1:-1], predicate[1:-1]),
        np.maximum(predicate[:-2], predicate[2:]),
    )
    filtered_sums = filter_array(sums, FILTERS[key][1])
    filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
    filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
@ -179,9 +230,7 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
 def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]:
    cv2.imwrite("/tmp/table.png", img)
    _, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
    cv2.imwrite("/tmp/table_bin.png", img)
    h, w = map(int, img.shape)
    row_vals = map(int, get_lines_either(img, horizontal=True))
    col_vals = map(int, get_lines_either(img, horizontal=False))
--- a/src/cv_analysis/utils/dotted_lines.py
+++ b/src/cv_analysis/utils/dotted_lines.py
@ -0,0 +1,18 @@
 """
 General approach:
  Get horizontal and vertical pixel sum extrema. Then take a band of k around each minimum (corresponding to darkest), e.g. k=3.
  Recalculate minima for each band.
  Compute a list of distances between minima.
  Compute the mean and standard deviation of those distances.
  If rho:=std/(eta*mean) < phi for some threshold phi, the band contains a dotted line. -> logic: std can be larger for larger mean, i.e. more spaced-out dotted lines
 Pros:
  Intuitive and efficient.
 Cons:
  May not work for irregular/mixed dotted lines, such as (possibly) --*--*--*--*--*--*--*--*--*--*--
 """
 from typing import Iterable
 import numpy as np
Author	SHA1	Message	Date
iriley	99359596da	chore: bump package versions	2024-07-19 22:47:05 +02:00
iriley	ef02253ad7	chore: bump package versions	2024-07-19 22:43:54 +02:00