Compare commits
32 Commits
table_line
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
799fe331c3 | ||
|
|
dfbfc50556 | ||
|
|
63fbd387a3 | ||
|
|
41dbfc69d9 | ||
|
|
b73e9b2ed9 | ||
|
|
92692281ce | ||
|
|
cb0c58d699 | ||
|
|
eb96403fe2 | ||
|
|
c8daf888c6 | ||
|
|
eb921c365d | ||
|
|
7762f81a4a | ||
|
|
e991cfe1bf | ||
|
|
35c5ee5831 | ||
|
|
e97f34391a | ||
|
|
1fa10721aa | ||
|
|
7f0d0a48db | ||
|
|
333cd498b9 | ||
|
|
9df8c8f936 | ||
|
|
60adf0c381 | ||
|
|
537f605a85 | ||
|
|
66987ab8e9 | ||
|
|
43570142c3 | ||
|
|
d457f49001 | ||
|
|
536928c032 | ||
|
|
dc6183490f | ||
|
|
bbc2d0c8bf | ||
|
|
3462faf8c7 | ||
|
|
b136cc9ff3 | ||
|
|
cf431df1cb | ||
|
|
e86214f6b7 | ||
|
|
3c9ddfcf0f | ||
|
|
0f45a25bc8 |
2
.gitignore
vendored
2
.gitignore
vendored
@ -50,5 +50,3 @@ __pycache__/
|
|||||||
|
|
||||||
# unignore files
|
# unignore files
|
||||||
!bom.*
|
!bom.*
|
||||||
|
|
||||||
dotted/
|
|
||||||
|
|||||||
@ -12,15 +12,6 @@ variables:
|
|||||||
NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
|
NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
|
||||||
IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
|
IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
|
||||||
|
|
||||||
stages:
|
|
||||||
- data
|
|
||||||
- setup
|
|
||||||
- unit-tests
|
|
||||||
- versioning
|
|
||||||
- build
|
|
||||||
- integration-tests
|
|
||||||
- release
|
|
||||||
|
|
||||||
pages:
|
pages:
|
||||||
only:
|
only:
|
||||||
- master # KEEP THIS, necessary because `master` branch and not `main` branch
|
- master # KEEP THIS, necessary because `master` branch and not `main` branch
|
||||||
|
|||||||
@ -5,10 +5,10 @@ default_language_version:
|
|||||||
python: python3.10
|
python: python3.10
|
||||||
repos:
|
repos:
|
||||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
rev: v4.5.0
|
rev: v5.0.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: trailing-whitespace
|
- id: trailing-whitespace
|
||||||
# - id: end-of-file-fixer
|
- id: end-of-file-fixer
|
||||||
- id: check-yaml
|
- id: check-yaml
|
||||||
args: [--unsafe] # needed for .gitlab-ci.yml
|
args: [--unsafe] # needed for .gitlab-ci.yml
|
||||||
- id: check-toml
|
- id: check-toml
|
||||||
@ -34,7 +34,7 @@ repos:
|
|||||||
- --profile black
|
- --profile black
|
||||||
|
|
||||||
- repo: https://github.com/psf/black
|
- repo: https://github.com/psf/black
|
||||||
rev: 24.3.0
|
rev: 24.10.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: black
|
- id: black
|
||||||
# exclude: ^(docs/|notebooks/|data/|src/secrets/)
|
# exclude: ^(docs/|notebooks/|data/|src/secrets/)
|
||||||
@ -42,7 +42,7 @@ repos:
|
|||||||
- --line-length=120
|
- --line-length=120
|
||||||
|
|
||||||
- repo: https://github.com/compilerla/conventional-pre-commit
|
- repo: https://github.com/compilerla/conventional-pre-commit
|
||||||
rev: v3.2.0
|
rev: v4.0.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: conventional-pre-commit
|
- id: conventional-pre-commit
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
|
|||||||
@ -1,11 +1,22 @@
|
|||||||
|
|
||||||
|
[asyncio]
|
||||||
|
max_concurrent_tasks = 10
|
||||||
|
|
||||||
|
[dynamic_tenant_queues]
|
||||||
|
enabled = true
|
||||||
|
|
||||||
[metrics.prometheus]
|
[metrics.prometheus]
|
||||||
enabled = true
|
enabled = true
|
||||||
prefix = "redactmanager_cv_analysis_service"
|
prefix = "redactmanager_cv_analysis_service"
|
||||||
|
|
||||||
[tracing.opentelemetry]
|
[tracing]
|
||||||
enabled = true
|
enabled = true
|
||||||
|
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
|
||||||
|
type = "azure_monitor"
|
||||||
|
|
||||||
|
[tracing.opentelemetry]
|
||||||
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
|
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
|
||||||
service_name = "redactmanager_cv_analyisis_service"
|
service_name = "redactmanager_cv_analysis_service"
|
||||||
exporter = "otlp"
|
exporter = "otlp"
|
||||||
|
|
||||||
[webserver]
|
[webserver]
|
||||||
@ -25,6 +36,15 @@ input_queue = "request_queue"
|
|||||||
output_queue = "response_queue"
|
output_queue = "response_queue"
|
||||||
dead_letter_queue = "dead_letter_queue"
|
dead_letter_queue = "dead_letter_queue"
|
||||||
|
|
||||||
|
tenant_event_queue_suffix = "_tenant_event_queue"
|
||||||
|
tenant_event_dlq_suffix = "_tenant_events_dlq"
|
||||||
|
tenant_exchange_name = "tenants-exchange"
|
||||||
|
queue_expiration_time = 300000 # 5 minutes in milliseconds
|
||||||
|
service_request_queue_prefix = "cv_analysis_request_queue"
|
||||||
|
service_request_exchange_name = "cv_analysis_request_exchange"
|
||||||
|
service_response_exchange_name = "cv_analysis_response_exchange"
|
||||||
|
service_dlq_name = "cv_analysis_dlq"
|
||||||
|
|
||||||
[storage]
|
[storage]
|
||||||
backend = "s3"
|
backend = "s3"
|
||||||
|
|
||||||
@ -41,4 +61,7 @@ connection_string = ""
|
|||||||
|
|
||||||
[storage.tenant_server]
|
[storage.tenant_server]
|
||||||
public_key = ""
|
public_key = ""
|
||||||
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
|
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
|
||||||
|
|
||||||
|
[kubernetes]
|
||||||
|
pod_name = "test_pod"
|
||||||
|
|||||||
@ -15,7 +15,6 @@
|
|||||||
(pkgs.buildFHSUserEnv rec {
|
(pkgs.buildFHSUserEnv rec {
|
||||||
name = "cv-analysis-service";
|
name = "cv-analysis-service";
|
||||||
targetPkgs = pkgs: (with pkgs; [
|
targetPkgs = pkgs: (with pkgs; [
|
||||||
python310
|
|
||||||
poppler_utils
|
poppler_utils
|
||||||
zlib
|
zlib
|
||||||
poetry
|
poetry
|
||||||
|
|||||||
4990
poetry.lock
generated
4990
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "cv-analysis-service"
|
name = "cv-analysis-service"
|
||||||
version = "2.19.0"
|
version = "2.30.0"
|
||||||
description = ""
|
description = ""
|
||||||
authors = ["Isaac Riley <isaac.riley@knecon.com>"]
|
authors = ["Isaac Riley <isaac.riley@knecon.com>"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
@ -25,13 +25,12 @@ coverage = "^5.5"
|
|||||||
dependency-check = "^0.6.0"
|
dependency-check = "^0.6.0"
|
||||||
lorem-text = "^2.1"
|
lorem-text = "^2.1"
|
||||||
PyMuPDF = "^1.19.6"
|
PyMuPDF = "^1.19.6"
|
||||||
pyinfra = { version = "^2.2.0", source = "gitlab-research" }
|
pyinfra = { version = "3.4.2", source = "gitlab-research" }
|
||||||
kn-utils = { version = "0.2.7", source = "gitlab-research" }
|
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
|
||||||
pdf2img = { version = "0.7.0", source = "gitlab-red" }
|
pdf2img = { version = "0.7.0", source = "gitlab-red" }
|
||||||
dvc-azure = "^2.21.2"
|
dvc-azure = "^2.21.2"
|
||||||
pymupdf = "^1.24.1"
|
pymupdf = "^1.24.1"
|
||||||
types-pillow = "^10.2.0.20240423"
|
types-pillow = "^10.2.0.20240423"
|
||||||
#matplotlib-backend-wezterm = "^2.1.2"
|
|
||||||
|
|
||||||
[tool.poetry.group.test.dependencies]
|
[tool.poetry.group.test.dependencies]
|
||||||
pytest = "^7.0.1"
|
pytest = "^7.0.1"
|
||||||
@ -77,7 +76,7 @@ priority = "explicit"
|
|||||||
|
|
||||||
[tool.pylint]
|
[tool.pylint]
|
||||||
max-line-length = 120
|
max-line-length = 120
|
||||||
docstring-min-length=4
|
docstring-min-length = 4
|
||||||
extension-pkg-whitelist = ["cv2"]
|
extension-pkg-whitelist = ["cv2"]
|
||||||
extension-pkg-allow-list = ["cv2"]
|
extension-pkg-allow-list = ["cv2"]
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +0,0 @@
|
|||||||
{
|
|
||||||
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
|
|
||||||
"extends": [
|
|
||||||
"config:base"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
41
scripts/devenvsetup.sh
Normal file
41
scripts/devenvsetup.sh
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
python_version=$1
|
||||||
|
gitlab_user=$2
|
||||||
|
gitlab_personal_access_token=$3
|
||||||
|
|
||||||
|
# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
|
||||||
|
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
|
||||||
|
|
||||||
|
# cd $latest_dir
|
||||||
|
|
||||||
|
pyenv install $python_version
|
||||||
|
pyenv local $python_version
|
||||||
|
pyenv shell $python_version
|
||||||
|
|
||||||
|
# install poetry globally (PREFERRED), only need to install it once
|
||||||
|
# curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
|
||||||
|
# remember to update poetry once in a while
|
||||||
|
poetry self update
|
||||||
|
|
||||||
|
# install poetry in current python environment, can lead to multiple instances of poetry being installed on one system (DISPREFERRED)
|
||||||
|
# pip install --upgrade pip
|
||||||
|
# pip install poetry
|
||||||
|
|
||||||
|
poetry config virtualenvs.in-project true
|
||||||
|
poetry config installer.max-workers 10
|
||||||
|
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
|
||||||
|
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
|
||||||
|
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
|
||||||
|
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}
|
||||||
|
poetry config repositories.gitlab-fforesight https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
|
||||||
|
poetry config http-basic.gitlab-fforesight ${gitlab_user} ${gitlab_personal_access_token}
|
||||||
|
|
||||||
|
poetry env use $(pyenv which python)
|
||||||
|
poetry install --with=dev
|
||||||
|
poetry update
|
||||||
|
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
pre-commit install
|
||||||
|
pre-commit autoupdate
|
||||||
@ -1,8 +0,0 @@
|
|||||||
from cv_analysis.table_inference import infer_lines
|
|
||||||
|
|
||||||
|
|
||||||
def grid_search() -> None: ...
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
grid_search()
|
|
||||||
@ -1,6 +1,6 @@
|
|||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable, Iterable, Optional, Tuple
|
from typing import Callable, Optional, Tuple
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
@ -9,8 +9,6 @@ from kn_utils.logging import logger # type: ignore
|
|||||||
from numpy import ndarray as Array
|
from numpy import ndarray as Array
|
||||||
from scipy.stats import norm # type: ignore
|
from scipy.stats import norm # type: ignore
|
||||||
|
|
||||||
from .utils.dotted_lines import detect_dotted_from_extrema
|
|
||||||
|
|
||||||
|
|
||||||
def show_multiple(arrs: Tuple[Array], title: str = ""):
|
def show_multiple(arrs: Tuple[Array], title: str = ""):
|
||||||
plt.clf()
|
plt.clf()
|
||||||
@ -152,65 +150,16 @@ def filter_fp_col_lines(line_list: list[int], filt_sums: Array) -> list[int]:
|
|||||||
return line_list
|
return line_list
|
||||||
|
|
||||||
|
|
||||||
def sharpen_sums(sums: Array) -> Array:
|
|
||||||
sums = sums.astype("int64")
|
|
||||||
shift = 3
|
|
||||||
diffs = abs(sums[shift:-shift] - sums[2 * shift :]) + abs(sums[shift:-shift] - sums[: -2 * shift])
|
|
||||||
f2 = filter_array(sums, FILTERS["col"][2])
|
|
||||||
return diffs
|
|
||||||
|
|
||||||
|
|
||||||
def detect_dotted_lines(
|
|
||||||
image: Array,
|
|
||||||
sums: Iterable,
|
|
||||||
horizontal: bool = True,
|
|
||||||
threshold: float = 1.0,
|
|
||||||
min_distance: int = 2,
|
|
||||||
max_distance: int = 20,
|
|
||||||
) -> bool:
|
|
||||||
key = "row" if horizontal else "col"
|
|
||||||
naive = filter_array(sums, FILTERS[key][1])
|
|
||||||
naive_lines = np.where((naive[1:-1] < naive[:-2]) * (naive[1:-1] < naive[2:]) * (sums[1:-1] < 250))[0] + 1
|
|
||||||
|
|
||||||
bool_array = np.zeros(image.shape[1 - int(horizontal)])
|
|
||||||
for idx in naive_lines:
|
|
||||||
band = image[idx - 1 : idx + 2, :] if horizontal else image[:, idx - 1 : idx + 1]
|
|
||||||
band_sums = np.mean(band, axis=1 - int(horizontal))
|
|
||||||
band_sums = filter_array(band_sums, FILTERS[key][1])
|
|
||||||
extrema = np.where((band_sums[1:-1] < band_sums[:-2]) * (band_sums[1:-1] < band_sums[2:]))[0] + 1
|
|
||||||
|
|
||||||
distances = extrema[1:] - extrema[:-1]
|
|
||||||
mean = np.mean(distances)
|
|
||||||
std = np.std(distances)
|
|
||||||
|
|
||||||
check = "✔" if (ratio := (mean / (std + 0.01))) > 1.5 and mean < 40 else ""
|
|
||||||
print(f"{idx:4} {mean:6.2f} {std:6.2f} {ratio:6.2f} {check}")
|
|
||||||
|
|
||||||
score = std # maybe make more advanced score function later
|
|
||||||
if (min_distance <= mean <= max_distance) and (score < threshold):
|
|
||||||
print(idx)
|
|
||||||
bool_array[idx] = 1
|
|
||||||
return bool_array
|
|
||||||
|
|
||||||
|
|
||||||
def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
|
def get_lines_either(table_array: Array, horizontal=True) -> list[int]:
|
||||||
key = "row" if horizontal else "col"
|
key = "row" if horizontal else "col"
|
||||||
h, w = map(int, table_array.shape)
|
|
||||||
|
|
||||||
table_array = (
|
|
||||||
table_array[:, int(0.1 * w) : int(0.9 * w)] if horizontal else table_array[int(0.1 * h) : int(0.9 * h)]
|
|
||||||
)
|
|
||||||
|
|
||||||
sums = np.mean(table_array, axis=int(horizontal))
|
sums = np.mean(table_array, axis=int(horizontal))
|
||||||
dotted = detect_dotted_lines(table_array, sums, horizontal=horizontal)
|
|
||||||
|
|
||||||
threshold = 0.3 * 255 # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
|
threshold = 0.3 * 255 # np.mean(sums) - (1 + 2 * horizontal) * np.std(sums)
|
||||||
predicate = 1000.0 * ((sums < threshold) | dotted)
|
predicate = 1000.0 * (sums < threshold)
|
||||||
sums = np.maximum(
|
sums = np.maximum(
|
||||||
np.maximum(sums[1:-1], predicate[1:-1]),
|
np.maximum(sums[1:-1], predicate[1:-1]),
|
||||||
np.maximum(predicate[:-2], predicate[2:]),
|
np.maximum(predicate[:-2], predicate[2:]),
|
||||||
)
|
)
|
||||||
|
|
||||||
filtered_sums = filter_array(sums, FILTERS[key][1])
|
filtered_sums = filter_array(sums, FILTERS[key][1])
|
||||||
filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
|
filtered_sums = filter_array(filtered_sums, FILTERS[key][2])
|
||||||
filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
|
filtered_sums = filter_array(filtered_sums, FILTERS[key][3])
|
||||||
@ -230,7 +179,9 @@ def img_bytes_to_array(img_bytes: bytes) -> Array:
|
|||||||
|
|
||||||
|
|
||||||
def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]:
|
def infer_lines(img: Array) -> dict[str, dict[str, int] | list[dict[str, int]]]:
|
||||||
|
cv2.imwrite("/tmp/table.png", img)
|
||||||
_, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
|
_, img = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY)
|
||||||
|
cv2.imwrite("/tmp/table_bin.png", img)
|
||||||
h, w = map(int, img.shape)
|
h, w = map(int, img.shape)
|
||||||
row_vals = map(int, get_lines_either(img, horizontal=True))
|
row_vals = map(int, get_lines_either(img, horizontal=True))
|
||||||
col_vals = map(int, get_lines_either(img, horizontal=False))
|
col_vals = map(int, get_lines_either(img, horizontal=False))
|
||||||
|
|||||||
@ -1,18 +0,0 @@
|
|||||||
"""
|
|
||||||
General approach:
|
|
||||||
Get horizontal and vertical pixel sum extrema. Then take a band of k around each minimum (corresponding to darkest), e.g. k=3.
|
|
||||||
Recalculate minima for each band.
|
|
||||||
Compute a list of distances between minima.
|
|
||||||
Compute the mean and standard deviation of the distances between minima.
|
|
||||||
If rho:=std/(eta*mean) < phi for some threshold phi, the band contains a dotted line. -> logic: std can be larger for larger mean, i.e. more spaced-out dotted lines
|
|
||||||
|
|
||||||
Pros:
|
|
||||||
Intuitive and efficient.
|
|
||||||
|
|
||||||
Cons:
|
|
||||||
May not work for irregular/mixed dotted lines, such as (possibly) --*--*--*--*--*--*--*--*--*--*--
|
|
||||||
"""
|
|
||||||
|
|
||||||
from typing import Iterable
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
Loading…
x
Reference in New Issue
Block a user