Compare commits
59 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0027421628 | ||
|
|
00740c91b8 | ||
|
|
a3d79eb9af | ||
|
|
373f9f2d01 | ||
|
|
2429d90dd5 | ||
|
|
2b85999258 | ||
|
|
4b15d2c2ca | ||
|
|
bf1ca8d6f9 | ||
|
|
9a4b8cad2b | ||
|
|
28adb50330 | ||
|
|
7a3fdf8fa4 | ||
|
|
3fbcd65e9b | ||
|
|
90a60b4b7c | ||
|
|
526de8984c | ||
|
|
99cbf3c9bf | ||
|
|
986137e729 | ||
|
|
f950b96cfb | ||
|
|
2385d19bc2 | ||
|
|
16f2f0d557 | ||
|
|
afa6fc34cb | ||
|
|
a192e05be2 | ||
|
|
d23034e38a | ||
|
|
4bc53cf88b | ||
|
|
e737f64ed2 | ||
|
|
4b099f0106 | ||
|
|
b3a58d6777 | ||
|
|
c888453cc6 | ||
|
|
bf9ab4b1a2 | ||
|
|
9ff88a1e5d | ||
|
|
c852434b75 | ||
|
|
8655e25ec0 | ||
|
|
103c19d4cd | ||
|
|
530001a0af | ||
|
|
a6c11a9db5 | ||
|
|
1796c1bcbb | ||
|
|
f4b9ff54aa | ||
|
|
278b42e368 | ||
|
|
9600e4ca23 | ||
|
|
8485345dd1 | ||
|
|
d1a523c7d6 | ||
|
|
278f54eaa7 | ||
|
|
443c2614f9 | ||
|
|
4102a564a3 | ||
|
|
7f49642ba0 | ||
|
|
ba8d1dfdfe | ||
|
|
150d0d64e5 | ||
|
|
a024ddfcf7 | ||
|
|
13cbfa4ddf | ||
|
|
75af55dbda | ||
|
|
499c501acf | ||
|
|
6163e29d6b | ||
|
|
dadc0a4163 | ||
|
|
729ce17de0 | ||
|
|
88fbe077e6 | ||
|
|
f8ecef1054 | ||
|
|
5f44cc6560 | ||
|
|
b60f4d0383 | ||
|
|
87873cc3a3 | ||
|
|
523ca1db7d |
@ -5,4 +5,4 @@
|
|||||||
url = ssh://vector.iqser.com/research/image-prediction/
|
url = ssh://vector.iqser.com/research/image-prediction/
|
||||||
port = 22
|
port = 22
|
||||||
['remote "azure_remote"']
|
['remote "azure_remote"']
|
||||||
url = azure://ic-sa-dvc/
|
url = azure://image-classification-dvc/
|
||||||
@ -1,8 +1,51 @@
|
|||||||
include:
|
include:
|
||||||
- project: "Gitlab/gitlab"
|
- project: "Gitlab/gitlab"
|
||||||
ref: 0.2.6
|
ref: main
|
||||||
file: "/ci-templates/research/dvc-versioning-build-release.gitlab-ci.yml"
|
file: "/ci-templates/research/dvc.gitlab-ci.yml"
|
||||||
|
- project: "Gitlab/gitlab"
|
||||||
|
ref: main
|
||||||
|
file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
|
||||||
|
|
||||||
variables:
|
variables:
|
||||||
NEXUS_PROJECT_DIR: red
|
NEXUS_PROJECT_DIR: red
|
||||||
IMAGENAME: "${CI_PROJECT_NAME}"
|
IMAGENAME: "${CI_PROJECT_NAME}"
|
||||||
|
INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
|
||||||
|
FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
|
||||||
|
ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
|
||||||
|
CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
|
||||||
|
# TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.
|
||||||
|
|
||||||
|
stages:
|
||||||
|
- data
|
||||||
|
- setup
|
||||||
|
- tests
|
||||||
|
- sonarqube
|
||||||
|
- versioning
|
||||||
|
- build
|
||||||
|
- integration-tests
|
||||||
|
- release
|
||||||
|
|
||||||
|
docker-build:
|
||||||
|
extends: .docker-build
|
||||||
|
needs:
|
||||||
|
- job: dvc-pull
|
||||||
|
artifacts: true
|
||||||
|
- !reference [.needs-versioning, needs] # leave this line as is
|
||||||
|
|
||||||
|
###################
|
||||||
|
# INTEGRATION TESTS
|
||||||
|
trigger-integration-tests:
|
||||||
|
extends: .integration-tests
|
||||||
|
# ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
|
||||||
|
# needs:
|
||||||
|
# - job: docker-build::model_name
|
||||||
|
# artifacts: true
|
||||||
|
rules:
|
||||||
|
- when: never
|
||||||
|
|
||||||
|
#########
|
||||||
|
# RELEASE
|
||||||
|
release:
|
||||||
|
extends: .release
|
||||||
|
needs:
|
||||||
|
- !reference [.needs-versioning, needs] # leave this line as is
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
3.8.13
|
3.10
|
||||||
|
|||||||
59
Dockerfile
59
Dockerfile
@ -1,11 +1,17 @@
|
|||||||
FROM python:3.8
|
FROM python:3.10-slim AS builder
|
||||||
|
|
||||||
|
ARG GITLAB_USER
|
||||||
|
ARG GITLAB_ACCESS_TOKEN
|
||||||
|
|
||||||
ARG USERNAME
|
|
||||||
ARG TOKEN
|
|
||||||
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
|
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
|
||||||
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
|
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
|
||||||
|
|
||||||
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
|
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
|
||||||
ARG POETRY_SOURCE_REF_RED=gitlab-red
|
ARG POETRY_SOURCE_REF_RED=gitlab-red
|
||||||
|
|
||||||
|
ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
|
||||||
|
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
|
||||||
|
|
||||||
ARG VERSION=dev
|
ARG VERSION=dev
|
||||||
|
|
||||||
LABEL maintainer="Research <research@knecon.com>"
|
LABEL maintainer="Research <research@knecon.com>"
|
||||||
@ -13,26 +19,55 @@ LABEL version="${VERSION}"
|
|||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
###########
|
||||||
|
# ENV SETUP
|
||||||
|
ENV PYTHONDONTWRITEBYTECODE=true
|
||||||
ENV PYTHONUNBUFFERED=true
|
ENV PYTHONUNBUFFERED=true
|
||||||
ENV POETRY_HOME=/opt/poetry
|
ENV POETRY_HOME=/opt/poetry
|
||||||
ENV PATH="$POETRY_HOME/bin:$PATH"
|
ENV PATH="$POETRY_HOME/bin:$PATH"
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN curl -sSL https://install.python-poetry.org | python3 -
|
RUN curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
RUN poetry --version
|
||||||
|
|
||||||
COPY ./data ./data
|
COPY pyproject.toml poetry.lock ./
|
||||||
COPY ./scripts ./scripts
|
|
||||||
COPY ./image_prediction ./image_prediction
|
|
||||||
COPY pyproject.toml poetry.lock banner.txt config.yaml ./src ./
|
|
||||||
|
|
||||||
RUN poetry config virtualenvs.create false && \
|
RUN poetry config virtualenvs.create true && \
|
||||||
|
poetry config virtualenvs.in-project true && \
|
||||||
poetry config installer.max-workers 10 && \
|
poetry config installer.max-workers 10 && \
|
||||||
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
|
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
|
||||||
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
|
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||||
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
|
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
|
||||||
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
|
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||||
poetry install --without=test -vv --no-interaction --no-root
|
poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
|
||||||
|
poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||||
|
poetry install --without=dev -vv --no-interaction --no-root
|
||||||
|
|
||||||
|
###############
|
||||||
|
# WORKING IMAGE
|
||||||
|
FROM python:3.10-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# COPY SOURCE CODE FROM BUILDER IMAGE
|
||||||
|
COPY --from=builder /app /app
|
||||||
|
# COPY BILL OF MATERIALS (BOM)
|
||||||
|
COPY bom.json /bom.json
|
||||||
|
|
||||||
|
ENV PATH="/app/.venv/bin:$PATH"
|
||||||
|
|
||||||
|
###################
|
||||||
|
# COPY SOURCE CODE
|
||||||
|
COPY ./src ./src
|
||||||
|
COPY ./config ./config
|
||||||
|
COPY ./data ./data
|
||||||
|
COPY banner.txt ./
|
||||||
|
|
||||||
EXPOSE 5000
|
EXPOSE 5000
|
||||||
EXPOSE 8080
|
EXPOSE 8080
|
||||||
|
|
||||||
CMD [ "python", "serve.py"]
|
CMD [ "python", "src/serve.py"]
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
FROM python:3.8
|
FROM python:3.10
|
||||||
|
|
||||||
ARG USERNAME
|
ARG USERNAME
|
||||||
ARG TOKEN
|
ARG TOKEN
|
||||||
@ -20,9 +20,10 @@ ENV PATH="$POETRY_HOME/bin:$PATH"
|
|||||||
RUN curl -sSL https://install.python-poetry.org | python3 -
|
RUN curl -sSL https://install.python-poetry.org | python3 -
|
||||||
|
|
||||||
COPY ./data ./data
|
COPY ./data ./data
|
||||||
COPY ./image_prediction ./image_prediction
|
|
||||||
COPY ./test ./test
|
COPY ./test ./test
|
||||||
COPY pyproject.toml poetry.lock banner.txt config.yaml ./src ./
|
COPY ./config ./config
|
||||||
|
COPY ./src ./src
|
||||||
|
COPY pyproject.toml poetry.lock banner.txt config.yaml./
|
||||||
|
|
||||||
RUN poetry config virtualenvs.create false && \
|
RUN poetry config virtualenvs.create false && \
|
||||||
poetry config installer.max-workers 10 && \
|
poetry config installer.max-workers 10 && \
|
||||||
@ -30,7 +31,7 @@ RUN poetry config virtualenvs.create false && \
|
|||||||
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
|
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
|
||||||
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
|
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
|
||||||
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
|
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
|
||||||
poetry install --without=test -vv --no-interaction --no-root
|
poetry install --without=dev -vv --no-interaction --no-root
|
||||||
|
|
||||||
EXPOSE 5000
|
EXPOSE 5000
|
||||||
EXPOSE 8080
|
EXPOSE 8080
|
||||||
|
|||||||
24
config.yaml
24
config.yaml
@ -1,24 +0,0 @@
|
|||||||
webserver:
|
|
||||||
host: $SERVER_HOST|"127.0.0.1" # webserver address
|
|
||||||
port: $SERVER_PORT|5000 # webserver port
|
|
||||||
|
|
||||||
service:
|
|
||||||
logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
|
|
||||||
verbose: $VERBOSE|False # Service DOES NOT prints document processing progress to stdout
|
|
||||||
batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
|
|
||||||
mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from
|
|
||||||
|
|
||||||
# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
|
|
||||||
# The filter result values are reported in the service responses. For convenience the response to a request contains a
|
|
||||||
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
|
|
||||||
# specified required value.
|
|
||||||
filters:
|
|
||||||
image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
|
|
||||||
min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
|
|
||||||
max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
|
|
||||||
|
|
||||||
image_width_to_height_quotient: # Image width to height ratio
|
|
||||||
min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
|
|
||||||
max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
|
|
||||||
|
|
||||||
min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
|
|
||||||
68
config/pyinfra.toml
Normal file
68
config/pyinfra.toml
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
|
||||||
|
[asyncio]
|
||||||
|
max_concurrent_tasks = 10
|
||||||
|
|
||||||
|
[dynamic_tenant_queues]
|
||||||
|
enabled = true
|
||||||
|
|
||||||
|
[metrics.prometheus]
|
||||||
|
enabled = true
|
||||||
|
prefix = "redactmanager_image_service"
|
||||||
|
|
||||||
|
[tracing]
|
||||||
|
enabled = true
|
||||||
|
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
|
||||||
|
type = "azure_monitor"
|
||||||
|
|
||||||
|
[tracing.opentelemetry]
|
||||||
|
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
|
||||||
|
service_name = "redactmanager_image_service"
|
||||||
|
exporter = "otlp"
|
||||||
|
|
||||||
|
[webserver]
|
||||||
|
host = "0.0.0.0"
|
||||||
|
port = 8080
|
||||||
|
|
||||||
|
[rabbitmq]
|
||||||
|
host = "localhost"
|
||||||
|
port = 5672
|
||||||
|
username = ""
|
||||||
|
password = ""
|
||||||
|
heartbeat = 60
|
||||||
|
# Has to be a divisor of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
|
||||||
|
# This is also the minimum time the service needs to process a message
|
||||||
|
connection_sleep = 5
|
||||||
|
input_queue = "request_queue"
|
||||||
|
output_queue = "response_queue"
|
||||||
|
dead_letter_queue = "dead_letter_queue"
|
||||||
|
|
||||||
|
tenant_event_queue_suffix = "_tenant_event_queue"
|
||||||
|
tenant_event_dlq_suffix = "_tenant_events_dlq"
|
||||||
|
tenant_exchange_name = "tenants-exchange"
|
||||||
|
queue_expiration_time = 300000 # 5 minutes in milliseconds
|
||||||
|
|
||||||
|
service_request_queue_prefix = "image_request_queue"
|
||||||
|
service_request_exchange_name = "image_request_exchange"
|
||||||
|
service_response_exchange_name = "image_response_exchange"
|
||||||
|
service_dlq_name = "image_dlq"
|
||||||
|
|
||||||
|
[storage]
|
||||||
|
backend = "s3"
|
||||||
|
|
||||||
|
[storage.s3]
|
||||||
|
bucket = "redaction"
|
||||||
|
endpoint = "http://127.0.0.1:9000"
|
||||||
|
key = ""
|
||||||
|
secret = ""
|
||||||
|
region = "eu-central-1"
|
||||||
|
|
||||||
|
[storage.azure]
|
||||||
|
container = "redaction"
|
||||||
|
connection_string = ""
|
||||||
|
|
||||||
|
[storage.tenant_server]
|
||||||
|
public_key = ""
|
||||||
|
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
|
||||||
|
|
||||||
|
[kubernetes]
|
||||||
|
pod_name = "test_pod"
|
||||||
42
config/settings.toml
Normal file
42
config/settings.toml
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
[logging]
|
||||||
|
level = "INFO"
|
||||||
|
|
||||||
|
[service]
|
||||||
|
# Print document processing progress to stdout
|
||||||
|
verbose = false
|
||||||
|
batch_size = 6
|
||||||
|
image_stiching_tolerance = 1 # in pixels
|
||||||
|
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
|
||||||
|
|
||||||
|
# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
|
||||||
|
# The filter result values are reported in the service responses. For convenience the response to a request contains a
|
||||||
|
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
|
||||||
|
# specified required value.
|
||||||
|
[filters.confidence]
|
||||||
|
# Minimum permissible prediction confidence
|
||||||
|
min = 0.5
|
||||||
|
|
||||||
|
# Image size to page size ratio (ratio of geometric means of areas)
|
||||||
|
[filters.image_to_page_quotient]
|
||||||
|
min = 0.05
|
||||||
|
max = 0.75
|
||||||
|
|
||||||
|
[filters.is_scanned_page]
|
||||||
|
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
|
||||||
|
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
|
||||||
|
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
|
||||||
|
tolerance = 0
|
||||||
|
|
||||||
|
# Image width to height ratio
|
||||||
|
[filters.image_width_to_height_quotient]
|
||||||
|
min = 0.1
|
||||||
|
max = 10
|
||||||
|
|
||||||
|
# put class specific filters here ['signature', 'formula', 'logo']
|
||||||
|
[filters.overrides.signature.image_to_page_quotient]
|
||||||
|
max = 0.4
|
||||||
|
|
||||||
|
[filters.overrides.logo.image_to_page_quotient]
|
||||||
|
min = 0.06
|
||||||
|
|
||||||
|
|
||||||
@ -1,46 +0,0 @@
|
|||||||
"""Implements a config object with dot-indexing syntax."""
|
|
||||||
|
|
||||||
|
|
||||||
from envyaml import EnvYAML
|
|
||||||
|
|
||||||
from image_prediction.locations import CONFIG_FILE
|
|
||||||
|
|
||||||
|
|
||||||
def _get_item_and_maybe_make_dotindexable(container, item):
|
|
||||||
ret = container[item]
|
|
||||||
return DotIndexable(ret) if isinstance(ret, dict) else ret
|
|
||||||
|
|
||||||
|
|
||||||
class DotIndexable:
|
|
||||||
def __init__(self, x):
|
|
||||||
self.x = x
|
|
||||||
|
|
||||||
def get(self, item, default=None):
|
|
||||||
try:
|
|
||||||
return _get_item_and_maybe_make_dotindexable(self.x, item)
|
|
||||||
except KeyError:
|
|
||||||
return default
|
|
||||||
|
|
||||||
def __getattr__(self, item):
|
|
||||||
return _get_item_and_maybe_make_dotindexable(self.x, item)
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return self.x.__repr__()
|
|
||||||
|
|
||||||
def __getitem__(self, item):
|
|
||||||
return self.__getattr__(item)
|
|
||||||
|
|
||||||
|
|
||||||
class Config:
|
|
||||||
def __init__(self, config_path):
|
|
||||||
self.__config = EnvYAML(config_path)
|
|
||||||
|
|
||||||
def __getattr__(self, item):
|
|
||||||
if item in self.__config:
|
|
||||||
return _get_item_and_maybe_make_dotindexable(self.__config, item)
|
|
||||||
|
|
||||||
def __getitem__(self, item):
|
|
||||||
return self.__getattr__(item)
|
|
||||||
|
|
||||||
|
|
||||||
CONFIG = Config(CONFIG_FILE)
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
"""Defines constant paths relative to the module root path."""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
MODULE_DIR = Path(__file__).resolve().parents[0]
|
|
||||||
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
|
|
||||||
|
|
||||||
CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"
|
|
||||||
BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"
|
|
||||||
|
|
||||||
DATA_DIR = PACKAGE_ROOT_DIR / "data"
|
|
||||||
MLRUNS_DIR = str(DATA_DIR / "mlruns")
|
|
||||||
|
|
||||||
TEST_DIR = PACKAGE_ROOT_DIR / "test"
|
|
||||||
TEST_DATA_DIR = TEST_DIR / "data"
|
|
||||||
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
import logging
|
|
||||||
|
|
||||||
from image_prediction.config import CONFIG
|
|
||||||
|
|
||||||
|
|
||||||
def make_logger_getter():
|
|
||||||
logger = logging.getLogger("imclf")
|
|
||||||
logger.propagate = False
|
|
||||||
|
|
||||||
handler = logging.StreamHandler()
|
|
||||||
handler.setLevel(CONFIG.service.logging_level)
|
|
||||||
|
|
||||||
log_format = "%(asctime)s %(levelname)-8s %(message)s"
|
|
||||||
formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
|
|
||||||
|
|
||||||
handler.setFormatter(formatter)
|
|
||||||
logger.addHandler(handler)
|
|
||||||
|
|
||||||
logger.setLevel(CONFIG.service.logging_level)
|
|
||||||
|
|
||||||
def get_logger():
|
|
||||||
return logger
|
|
||||||
|
|
||||||
return get_logger
|
|
||||||
|
|
||||||
|
|
||||||
get_logger = make_logger_getter()
|
|
||||||
6662
poetry.lock
generated
6662
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,18 +1,20 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "image-classification-service"
|
name = "image-classification-service"
|
||||||
version = "1.33.0"
|
version = "2.17.0"
|
||||||
description = ""
|
description = ""
|
||||||
authors = ["Team Research <research@knecon.com>"]
|
authors = ["Team Research <research@knecon.com>"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
packages = [{ include = "image_prediction" }]
|
packages = [{ include = "image_prediction", from = "src" }]
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "~3.8"
|
python = ">=3.10,<3.11"
|
||||||
|
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
|
||||||
|
# see RED-9948.
|
||||||
|
pyinfra = { version = "3.4.2", source = "gitlab-research" }
|
||||||
|
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
|
||||||
dvc = "^2.34.0"
|
dvc = "^2.34.0"
|
||||||
dvc-ssh = "^2.20.0"
|
dvc-ssh = "^2.20.0"
|
||||||
dvc-azure = "^2.21.2"
|
dvc-azure = "^2.21.2"
|
||||||
pyinfra = { version = "1.6.0", source = "gitlab-research" }
|
|
||||||
kn-utils = { version = "0.1.4", source = "gitlab-research" }
|
|
||||||
Flask = "^2.1.1"
|
Flask = "^2.1.1"
|
||||||
requests = "^2.27.1"
|
requests = "^2.27.1"
|
||||||
iteration-utilities = "^0.11.0"
|
iteration-utilities = "^0.11.0"
|
||||||
@ -23,22 +25,25 @@ mlflow = "^1.24.0"
|
|||||||
numpy = "^1.22.3"
|
numpy = "^1.22.3"
|
||||||
tqdm = "^4.64.0"
|
tqdm = "^4.64.0"
|
||||||
pandas = "^1.4.2"
|
pandas = "^1.4.2"
|
||||||
tensorflow = "^2.8.0"
|
# FIXME: Our current model significantly changes the prediction behaviour when using newer tensorflow (/ protobuf)
|
||||||
|
# versions which is introduced by pyinfra updates using newer protobuf versions, see RED-9948.
|
||||||
|
tensorflow = "2.9.0"
|
||||||
|
protobuf = "^3.20"
|
||||||
pytest = "^7.1.0"
|
pytest = "^7.1.0"
|
||||||
funcy = "^1.17"
|
funcy = "^2"
|
||||||
PyMuPDF = "^1.19.6"
|
PyMuPDF = "^1.19.6"
|
||||||
fpdf = "^1.7.2"
|
fpdf = "^1.7.2"
|
||||||
coverage = "^6.3.2"
|
coverage = "^6.3.2"
|
||||||
Pillow = "^9.1.0"
|
Pillow = "^9.1.0"
|
||||||
pdf2image = "^1.16.0"
|
pdf2image = "^1.16.0"
|
||||||
frozendict = "^2.3.0"
|
frozendict = "^2.3.0"
|
||||||
protobuf = "^3.20.0"
|
|
||||||
fsspec = "^2022.11.0"
|
fsspec = "^2022.11.0"
|
||||||
PyMonad = "^2.4.0"
|
PyMonad = "^2.4.0"
|
||||||
pdfnetpython3 = "9.4.2"
|
pdfnetpython3 = "9.4.2"
|
||||||
loguru = "^0.6.0"
|
loguru = "^0.7.0"
|
||||||
|
cyclonedx-bom = "^4.5.0"
|
||||||
|
|
||||||
[tool.poetry.group.test.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
pytest = "^7.0.1"
|
pytest = "^7.0.1"
|
||||||
pymonad = "^2.4.0"
|
pymonad = "^2.4.0"
|
||||||
pylint = "^2.17.4"
|
pylint = "^2.17.4"
|
||||||
|
|||||||
46
scripts/debug/debug.py
Normal file
46
scripts/debug/debug.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
"""Script to debug RED-9948. The predictions unexpectedly changed for some images, and we need to understand why."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import tensorflow as tf
|
||||||
|
from kn_utils.logging import logger
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
|
from image_prediction.pipeline import load_pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def process_pdf(pipeline, pdf_path, page_range=None):
|
||||||
|
with open(pdf_path, "rb") as f:
|
||||||
|
logger.info(f"Processing {pdf_path}")
|
||||||
|
predictions = list(pipeline(f.read(), page_range=page_range))
|
||||||
|
|
||||||
|
return predictions
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_seeds():
|
||||||
|
seed = 42
|
||||||
|
np.random.seed(seed)
|
||||||
|
random.seed(seed)
|
||||||
|
tf.random.set_seed(seed)
|
||||||
|
|
||||||
|
|
||||||
|
def debug_info():
|
||||||
|
devices = tf.config.list_physical_devices()
|
||||||
|
print("Available devices:", devices)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# For in container debugging, copy the file and adjust the path.
|
||||||
|
debug_file_path = Path(__file__).parents[2] / "test" / "data" / "RED-9948" / "SYNGENTA_EFSA_sanitisation_GFL_v2"
|
||||||
|
ensure_seeds()
|
||||||
|
debug_info()
|
||||||
|
|
||||||
|
pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
|
||||||
|
predictions = process_pdf(pipeline, debug_file_path)
|
||||||
|
# This is the image that has the wrong prediction mentioned in RED-9948. The predictions should be inconclusive, and
|
||||||
|
# the flag all passed should be false.
|
||||||
|
predictions = [x for x in predictions if x["representation"] == "FA30F080F0C031CE17E8CF237"]
|
||||||
|
print(json.dumps(predictions, indent=2))
|
||||||
@ -1,6 +1,6 @@
|
|||||||
docker build -t image-clsasification-service:$(poetry version -s)-dev \
|
docker build -t --platform linux/amd64 image-clsasification-service:$(poetry version -s)-dev \
|
||||||
-f Dockerfile \
|
-f Dockerfile \
|
||||||
--build-arg USERNAME=$GITLAB_USER \
|
--build-arg GITLAB_USER=$GITLAB_USER \
|
||||||
--build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
|
--build-arg GITLAB_ACCESS_TOKEN=$GITLAB_ACCESS_TOKEN \
|
||||||
. && \
|
. && \
|
||||||
docker run -it --rm image-clsasification-service:$(poetry version -s)-dev
|
docker run -it --rm image-clsasification-service:$(poetry version -s)-dev
|
||||||
|
|||||||
@ -3,12 +3,15 @@ import json
|
|||||||
import os
|
import os
|
||||||
from glob import glob
|
from glob import glob
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
from image_prediction.pipeline import load_pipeline
|
from image_prediction.pipeline import load_pipeline
|
||||||
from image_prediction.utils import get_logger
|
from image_prediction.utils import get_logger
|
||||||
from image_prediction.utils.pdf_annotation import annotate_pdf
|
from image_prediction.utils.pdf_annotation import annotate_pdf
|
||||||
|
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
logger.setLevel("DEBUG")
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
@ -35,7 +38,7 @@ def process_pdf(pipeline, pdf_path, page_range=None):
|
|||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
pipeline = load_pipeline(verbose=True, tolerance=3)
|
pipeline = load_pipeline(verbose=CONFIG.service.verbose, batch_size=CONFIG.service.batch_size, tolerance=CONFIG.service.image_stiching_tolerance)
|
||||||
|
|
||||||
if os.path.isfile(args.input):
|
if os.path.isfile(args.input):
|
||||||
pdf_paths = [args.input]
|
pdf_paths = [args.input]
|
||||||
|
|||||||
7
src/image_prediction/config.py
Normal file
7
src/image_prediction/config.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pyinfra.config.loader import load_settings
|
||||||
|
|
||||||
|
from image_prediction.locations import PROJECT_ROOT_DIR
|
||||||
|
|
||||||
|
CONFIG = load_settings(root_path=PROJECT_ROOT_DIR, settings_path="config")
|
||||||
@ -13,7 +13,7 @@ class HashEncoder(Encoder):
|
|||||||
yield from self.encode(images)
|
yield from self.encode(images)
|
||||||
|
|
||||||
|
|
||||||
def hash_image(image: Image.Image):
|
def hash_image(image: Image.Image) -> str:
|
||||||
"""See: https://stackoverflow.com/a/49692185/3578468"""
|
"""See: https://stackoverflow.com/a/49692185/3578468"""
|
||||||
image = image.resize((10, 10), Image.ANTIALIAS)
|
image = image.resize((10, 10), Image.ANTIALIAS)
|
||||||
image = image.convert("L")
|
image = image.convert("L")
|
||||||
@ -21,4 +21,6 @@ def hash_image(image: Image.Image):
|
|||||||
avg_pixel = sum(pixel_data) / len(pixel_data)
|
avg_pixel = sum(pixel_data) / len(pixel_data)
|
||||||
bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
|
bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
|
||||||
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
|
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
|
||||||
return hex_representation
|
# Note: For each 4 leading zeros, the hex representation will be shorter by one character.
|
||||||
|
# To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
|
||||||
|
return hex_representation.zfill(25)
|
||||||
@ -3,7 +3,7 @@ import json
|
|||||||
import traceback
|
import traceback
|
||||||
from _operator import itemgetter
|
from _operator import itemgetter
|
||||||
from functools import partial, lru_cache
|
from functools import partial, lru_cache
|
||||||
from itertools import chain, starmap, filterfalse
|
from itertools import chain, starmap, filterfalse, tee
|
||||||
from operator import itemgetter, truth
|
from operator import itemgetter, truth
|
||||||
from typing import Iterable, Iterator, List, Union
|
from typing import Iterable, Iterator, List, Union
|
||||||
|
|
||||||
@ -11,9 +11,10 @@ import fitz
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from funcy import merge, pluck, compose, rcompose, remove, keep
|
from funcy import merge, pluck, compose, rcompose, remove, keep
|
||||||
|
from scipy.stats import gmean
|
||||||
|
|
||||||
from image_prediction.config import CONFIG
|
from image_prediction.config import CONFIG
|
||||||
from image_prediction.exceptions import InvalidBox, BadXref
|
from image_prediction.exceptions import InvalidBox
|
||||||
from image_prediction.formatter.formatters.enum import EnumFormatter
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
from image_prediction.info import Info
|
from image_prediction.info import Info
|
||||||
@ -34,7 +35,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
|
tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
|
||||||
together
|
together
|
||||||
"""
|
"""
|
||||||
self.doc: fitz.fitz.Document = None
|
self.doc: fitz.Document = None
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.tolerance = tolerance
|
self.tolerance = tolerance
|
||||||
|
|
||||||
@ -47,7 +48,7 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
|
|
||||||
yield from image_metadata_pairs
|
yield from image_metadata_pairs
|
||||||
|
|
||||||
def __process_images_on_page(self, page: fitz.fitz.Page):
|
def __process_images_on_page(self, page: fitz.Page):
|
||||||
metadata = extract_valid_metadata(self.doc, page)
|
metadata = extract_valid_metadata(self.doc, page)
|
||||||
images = get_images_on_page(self.doc, metadata)
|
images = get_images_on_page(self.doc, metadata)
|
||||||
|
|
||||||
@ -64,9 +65,13 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
|
def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
|
||||||
def validate(image: Image.Image, metadata: dict):
|
def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
|
||||||
|
"""See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
|
||||||
|
filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
|
||||||
|
corrupt and is dropped.
|
||||||
|
TODO: find cleaner solution
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
# TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
|
|
||||||
image.resize((100, 100)).convert("RGB")
|
image.resize((100, 100)).convert("RGB")
|
||||||
return ImageMetadataPair(image, metadata)
|
return ImageMetadataPair(image, metadata)
|
||||||
except (OSError, Exception) as err:
|
except (OSError, Exception) as err:
|
||||||
@ -74,7 +79,41 @@ class ParsablePDFImageExtractor(ImageExtractor):
|
|||||||
logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
|
logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return filter(truth, starmap(validate, image_metadata_pairs))
|
def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
|
||||||
|
"""See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
|
||||||
|
heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
|
||||||
|
|
||||||
|
The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
|
||||||
|
of the width and height of the page. If the ratio is below the threshold, the image is dropped.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
|
||||||
|
tolerance = CONFIG.filters.is_scanned_page.tolerance
|
||||||
|
width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
|
||||||
|
height_ratio = (
|
||||||
|
image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
|
||||||
|
)
|
||||||
|
return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
|
||||||
|
|
||||||
|
def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
|
||||||
|
min_ratio = CONFIG.filters.image_to_page_quotient.min
|
||||||
|
metadatum = image_metadata_pair.metadata
|
||||||
|
image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
|
||||||
|
page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
|
||||||
|
ratio = image_gmean / page_gmean
|
||||||
|
return ratio >= min_ratio
|
||||||
|
|
||||||
|
pairs, pairs_copy = tee(image_metadata_pairs)
|
||||||
|
|
||||||
|
if any(map(image_is_a_scanned_page, pairs_copy)):
|
||||||
|
logger.debug("Scanned page detected, filtering out small images ...")
|
||||||
|
return filter(image_fits_geometric_mean_ratio, pairs)
|
||||||
|
else:
|
||||||
|
return pairs
|
||||||
|
|
||||||
|
image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
|
||||||
|
|
||||||
|
return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))
|
||||||
|
|
||||||
|
|
||||||
def extract_pages(doc, page_range):
|
def extract_pages(doc, page_range):
|
||||||
@ -91,13 +130,12 @@ def get_images_on_page(doc, metadata):
|
|||||||
yield from images
|
yield from images
|
||||||
|
|
||||||
|
|
||||||
def extract_valid_metadata(doc: fitz.fitz.Document, page: fitz.fitz.Page):
|
def extract_valid_metadata(doc: fitz.Document, page: fitz.Page):
|
||||||
return compose(
|
metadata = get_metadata_for_images_on_page(page)
|
||||||
list,
|
metadata = filter_valid_metadata(metadata)
|
||||||
partial(add_alpha_channel_info, doc),
|
metadata = add_alpha_channel_info(doc, metadata)
|
||||||
filter_valid_metadata,
|
|
||||||
get_metadata_for_images_on_page,
|
return list(metadata)
|
||||||
)(page)
|
|
||||||
|
|
||||||
|
|
||||||
def get_metadata_for_images_on_page(page: fitz.Page):
|
def get_metadata_for_images_on_page(page: fitz.Page):
|
||||||
@ -153,7 +191,7 @@ def xref_to_image(doc, xref) -> Union[Image.Image, None]:
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
def convert_pixmap_to_array(pixmap: fitz.fitz.Pixmap):
|
def convert_pixmap_to_array(pixmap: fitz.Pixmap):
|
||||||
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||||
array = _normalize_channels(array)
|
array = _normalize_channels(array)
|
||||||
return array
|
return array
|
||||||
@ -172,7 +210,6 @@ def _normalize_channels(array: np.ndarray):
|
|||||||
|
|
||||||
|
|
||||||
def get_image_metadata(image_info):
|
def get_image_metadata(image_info):
|
||||||
|
|
||||||
xref, coords = itemgetter("xref", "bbox")(image_info)
|
xref, coords = itemgetter("xref", "bbox")(image_info)
|
||||||
x1, y1, x2, y2 = map(rounder, coords)
|
x1, y1, x2, y2 = map(rounder, coords)
|
||||||
|
|
||||||
@ -207,7 +244,11 @@ def add_alpha_channel_info(doc, metadata):
|
|||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def load_image_handle_from_xref(doc, xref):
|
def load_image_handle_from_xref(doc, xref):
|
||||||
return doc.extract_image(xref)
|
try:
|
||||||
|
return doc.extract_image(xref)
|
||||||
|
except ValueError:
|
||||||
|
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
rounder = rcompose(round, int)
|
rounder = rcompose(round, int)
|
||||||
@ -224,7 +265,6 @@ def get_page_metadata(page):
|
|||||||
|
|
||||||
|
|
||||||
def has_alpha_channel(doc, xref):
|
def has_alpha_channel(doc, xref):
|
||||||
|
|
||||||
maybe_image = load_image_handle_from_xref(doc, xref)
|
maybe_image = load_image_handle_from_xref(doc, xref)
|
||||||
maybe_smask = maybe_image["smask"] if maybe_image else None
|
maybe_smask = maybe_image["smask"] if maybe_image else None
|
||||||
|
|
||||||
18
src/image_prediction/locations.py
Normal file
18
src/image_prediction/locations.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
"""Defines constant paths relative to the module root path."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# FIXME: move these paths to config, only depending on 'ROOT_PATH' environment variable.
|
||||||
|
MODULE_DIR = Path(__file__).resolve().parents[0]
|
||||||
|
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
|
||||||
|
PROJECT_ROOT_DIR = PACKAGE_ROOT_DIR.parents[0]
|
||||||
|
|
||||||
|
CONFIG_FILE = PROJECT_ROOT_DIR / "config" / "settings.toml"
|
||||||
|
BANNER_FILE = PROJECT_ROOT_DIR / "banner.txt"
|
||||||
|
|
||||||
|
DATA_DIR = PROJECT_ROOT_DIR / "data"
|
||||||
|
MLRUNS_DIR = str(DATA_DIR / "mlruns")
|
||||||
|
|
||||||
|
TEST_DIR = PROJECT_ROOT_DIR / "test"
|
||||||
|
TEST_DATA_DIR = TEST_DIR / "data"
|
||||||
|
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
|
||||||
@ -1,8 +1,10 @@
|
|||||||
import os
|
import os
|
||||||
from functools import lru_cache, partial
|
from functools import lru_cache, partial
|
||||||
from itertools import chain, tee
|
from itertools import chain, tee
|
||||||
|
from typing import Iterable, Any
|
||||||
|
|
||||||
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
|
from funcy import rcompose, first, compose, second, chunks, identity, rpartial
|
||||||
|
from kn_utils.logging import logger
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from image_prediction.config import CONFIG
|
from image_prediction.config import CONFIG
|
||||||
@ -21,6 +23,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
|||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def load_pipeline(**kwargs):
|
def load_pipeline(**kwargs):
|
||||||
|
logger.info(f"Loading pipeline with kwargs: {kwargs}")
|
||||||
model_loader = get_mlflow_model_loader(MLRUNS_DIR)
|
model_loader = get_mlflow_model_loader(MLRUNS_DIR)
|
||||||
model_identifier = CONFIG.service.mlflow_run_id
|
model_identifier = CONFIG.service.mlflow_run_id
|
||||||
|
|
||||||
@ -52,7 +55,7 @@ class Pipeline:
|
|||||||
join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
|
join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
|
||||||
|
|
||||||
# />--classify--\
|
# />--classify--\
|
||||||
# --extract-->--split--+->--encode---->+--join-->reformat
|
# --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
|
||||||
# \>--identity--/
|
# \>--identity--/
|
||||||
|
|
||||||
self.pipe = rcompose(
|
self.pipe = rcompose(
|
||||||
@ -61,6 +64,7 @@ class Pipeline:
|
|||||||
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
|
pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
|
||||||
join, # ... the streams by zipping
|
join, # ... the streams by zipping
|
||||||
reformat, # ... the items
|
reformat, # ... the items
|
||||||
|
filter_duplicates, # ... filter out duplicate images
|
||||||
)
|
)
|
||||||
|
|
||||||
def __call__(self, pdf: bytes, page_range: range = None):
|
def __call__(self, pdf: bytes, page_range: range = None):
|
||||||
@ -70,3 +74,32 @@ class Pipeline:
|
|||||||
unit=" images",
|
unit=" images",
|
||||||
disable=not self.verbose,
|
disable=not self.verbose,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
|
||||||
|
"""Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
|
||||||
|
`allPassed` set to True.
|
||||||
|
See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
|
||||||
|
"""
|
||||||
|
keep = dict()
|
||||||
|
for image_meta in metadata:
|
||||||
|
key: tuple[int, int, int, int, int] = (
|
||||||
|
image_meta["position"]["x1"],
|
||||||
|
image_meta["position"]["x2"],
|
||||||
|
image_meta["position"]["y1"],
|
||||||
|
image_meta["position"]["y2"],
|
||||||
|
image_meta["position"]["pageNumber"],
|
||||||
|
)
|
||||||
|
if key in keep:
|
||||||
|
logger.warning(
|
||||||
|
f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
|
||||||
|
)
|
||||||
|
if image_meta["filters"]["allPassed"]:
|
||||||
|
logger.warning("Setting the image with allPassed flag set to True")
|
||||||
|
keep[key] = image_meta
|
||||||
|
else:
|
||||||
|
logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
|
||||||
|
else:
|
||||||
|
keep[key] = image_meta
|
||||||
|
|
||||||
|
yield from keep.values()
|
||||||
@ -1,13 +1,8 @@
|
|||||||
import json
|
|
||||||
import math
|
import math
|
||||||
import os
|
from dynaconf import Dynaconf
|
||||||
from functools import lru_cache
|
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
|
||||||
from funcy import first
|
|
||||||
|
|
||||||
from image_prediction.config import CONFIG
|
from image_prediction.config import CONFIG
|
||||||
from image_prediction.exceptions import ParsingError
|
|
||||||
from image_prediction.transformer.transformer import Transformer
|
from image_prediction.transformer.transformer import Transformer
|
||||||
from image_prediction.utils import get_logger
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
@ -32,21 +27,22 @@ def build_image_info(data: dict) -> dict:
|
|||||||
geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
|
geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
|
||||||
|
|
||||||
min_image_to_page_quotient_breached = bool(
|
min_image_to_page_quotient_breached = bool(
|
||||||
geometric_quotient < get_class_specific_min_image_to_page_quotient(label)
|
geometric_quotient < get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "min")
|
||||||
)
|
)
|
||||||
max_image_to_page_quotient_breached = bool(
|
max_image_to_page_quotient_breached = bool(
|
||||||
geometric_quotient > get_class_specific_max_image_to_page_quotient(label)
|
geometric_quotient > get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "max")
|
||||||
)
|
)
|
||||||
|
|
||||||
min_image_width_to_height_quotient_breached = bool(
|
min_image_width_to_height_quotient_breached = bool(
|
||||||
width / height < get_class_specific_min_image_width_to_height_quotient(label)
|
width / height < get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "min")
|
||||||
)
|
)
|
||||||
max_image_width_to_height_quotient_breached = bool(
|
max_image_width_to_height_quotient_breached = bool(
|
||||||
width / height > get_class_specific_max_image_width_to_height_quotient(label)
|
width / height > get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "max")
|
||||||
)
|
)
|
||||||
|
|
||||||
min_confidence_breached = bool(
|
min_confidence_breached = bool(
|
||||||
max(classification["probabilities"].values()) < get_class_specific_min_classification_confidence(label)
|
max(classification["probabilities"].values())
|
||||||
|
< get_class_specific_filter_value(label, CONFIG, "confidence", "min")
|
||||||
)
|
)
|
||||||
|
|
||||||
image_info = {
|
image_info = {
|
||||||
@ -90,65 +86,15 @@ def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1):
|
|||||||
return image_area_sqrt / page_area_sqrt
|
return image_area_sqrt / page_area_sqrt
|
||||||
|
|
||||||
|
|
||||||
def get_class_specific_min_image_to_page_quotient(label, table=None):
|
def get_class_specific_filter_value(label: str, settings: Dynaconf, filter_type: str, bound: str = None):
|
||||||
return get_class_specific_value(
|
|
||||||
"REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_class_specific_max_image_to_page_quotient(label, table=None):
|
|
||||||
return get_class_specific_value(
|
|
||||||
"REL_IMAGE_SIZE", label, "max", CONFIG.filters.image_to_page_quotient.max, table=table
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_class_specific_min_image_width_to_height_quotient(label, table=None):
|
|
||||||
return get_class_specific_value(
|
|
||||||
"IMAGE_FORMAT", label, "min", CONFIG.filters.image_width_to_height_quotient.min, table=table
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_class_specific_max_image_width_to_height_quotient(label, table=None):
|
|
||||||
return get_class_specific_value(
|
|
||||||
"IMAGE_FORMAT", label, "max", CONFIG.filters.image_width_to_height_quotient.max, table=table
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_class_specific_min_classification_confidence(label, table=None):
|
|
||||||
return get_class_specific_value("CONFIDENCE", label, "min", CONFIG.filters.min_confidence, table=table)
|
|
||||||
|
|
||||||
|
|
||||||
def get_class_specific_value(prefix, label, bound, fallback_value, table=None):
|
|
||||||
def fallback():
|
|
||||||
return fallback_value
|
|
||||||
|
|
||||||
def success():
|
|
||||||
threshold_map = parse_env_var(prefix, table=table) or {}
|
|
||||||
value = threshold_map.get(label, {}).get(bound)
|
|
||||||
if value:
|
|
||||||
logger.debug(f"Using class '{label}' specific {bound} {prefix.lower().replace('_', '-')} value.")
|
|
||||||
return value
|
|
||||||
|
|
||||||
assert bound in ["min", "max"]
|
|
||||||
|
|
||||||
return success() or fallback()
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
|
||||||
def parse_env_var(prefix, table=None):
|
|
||||||
table = table or os.environ
|
|
||||||
head = first(filter(lambda s: s == prefix, table))
|
|
||||||
if head:
|
|
||||||
try:
|
|
||||||
return parse_env_var_value(table[head])
|
|
||||||
except ParsingError as err:
|
|
||||||
logger.warning(err)
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def parse_env_var_value(env_var_value):
|
|
||||||
try:
|
try:
|
||||||
return json.loads(env_var_value)
|
value = (
|
||||||
except Exception as err:
|
settings.filters.overrides[label][filter_type][bound]
|
||||||
raise ParsingError(f"Failed to parse {env_var_value}") from err
|
if bound
|
||||||
|
else settings.filters.overrides[label][filter_type]
|
||||||
|
)
|
||||||
|
logger.warning(f"Using {label=} specific {bound=} {filter_type=} {value=}.")
|
||||||
|
except KeyError:
|
||||||
|
value = settings.filters[filter_type][bound]
|
||||||
|
|
||||||
|
return value
|
||||||
4
src/image_prediction/utils/logger.py
Normal file
4
src/image_prediction/utils/logger.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
import kn_utils
|
||||||
|
|
||||||
|
# TODO: remove this module and use the `get_logger` function from the `kn_utils` package.
|
||||||
|
get_logger = kn_utils.get_logger
|
||||||
@ -56,7 +56,8 @@ def annotate_image(doc, image_info):
|
|||||||
|
|
||||||
def init():
|
def init():
|
||||||
PDFNet.Initialize(
|
PDFNet.Initialize(
|
||||||
"Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
|
# "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
|
||||||
|
"Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
28
src/serve.py
28
src/serve.py
@ -1,17 +1,15 @@
|
|||||||
from image_prediction import logger
|
from sys import stdout
|
||||||
from image_prediction.config import Config
|
|
||||||
from image_prediction.locations import CONFIG_FILE
|
from kn_utils.logging import logger
|
||||||
|
from pyinfra.examples import start_standard_queue_consumer
|
||||||
|
from pyinfra.queue.callback import make_download_process_upload_callback
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
from image_prediction.pipeline import load_pipeline
|
from image_prediction.pipeline import load_pipeline
|
||||||
from image_prediction.utils.banner import load_banner
|
from image_prediction.utils.banner import load_banner
|
||||||
from image_prediction.utils.process_wrapping import wrap_in_process
|
from image_prediction.utils.process_wrapping import wrap_in_process
|
||||||
from pyinfra import config
|
|
||||||
from pyinfra.payload_processing.processor import make_payload_processor
|
|
||||||
from pyinfra.queue.queue_manager import QueueManager
|
|
||||||
|
|
||||||
PYINFRA_CONFIG = config.get_config()
|
logger.reconfigure(sink=stdout, level=CONFIG.logging.level)
|
||||||
IMAGE_CONFIG = Config(CONFIG_FILE)
|
|
||||||
|
|
||||||
logger.setLevel(PYINFRA_CONFIG.logging_level_root)
|
|
||||||
|
|
||||||
|
|
||||||
# A component of the processing pipeline (probably tensorflow) does not release allocated memory (see RED-4206).
|
# A component of the processing pipeline (probably tensorflow) does not release allocated memory (see RED-4206).
|
||||||
@ -19,18 +17,16 @@ logger.setLevel(PYINFRA_CONFIG.logging_level_root)
|
|||||||
# Workaround: Manage Memory with the operating system, by wrapping the processing in a sub-process.
|
# Workaround: Manage Memory with the operating system, by wrapping the processing in a sub-process.
|
||||||
# FIXME: Find more fine-grained solution or if the problem occurs persistently for python services,
|
# FIXME: Find more fine-grained solution or if the problem occurs persistently for python services,
|
||||||
@wrap_in_process
|
@wrap_in_process
|
||||||
def process_data(data: bytes) -> list:
|
def process_data(data: bytes, _message: dict) -> list:
|
||||||
pipeline = load_pipeline(verbose=IMAGE_CONFIG.service.verbose, batch_size=IMAGE_CONFIG.service.batch_size)
|
pipeline = load_pipeline(verbose=CONFIG.service.verbose, batch_size=CONFIG.service.batch_size, tolerance=CONFIG.service.image_stiching_tolerance)
|
||||||
return list(pipeline(data))
|
return list(pipeline(data))
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
logger.info(load_banner())
|
logger.info(load_banner())
|
||||||
|
|
||||||
process_payload = make_payload_processor(process_data, config=PYINFRA_CONFIG)
|
callback = make_download_process_upload_callback(process_data, CONFIG)
|
||||||
|
start_standard_queue_consumer(callback, CONFIG)
|
||||||
queue_manager = QueueManager(PYINFRA_CONFIG)
|
|
||||||
queue_manager.start_consuming(process_payload)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -1,10 +1,3 @@
|
|||||||
import logging
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from image_prediction.utils import get_logger
|
|
||||||
|
|
||||||
|
|
||||||
pytest_plugins = [
|
pytest_plugins = [
|
||||||
"test.fixtures.extractor",
|
"test.fixtures.extractor",
|
||||||
"test.fixtures.image",
|
"test.fixtures.image",
|
||||||
@ -17,14 +10,5 @@ pytest_plugins = [
|
|||||||
"test.fixtures.parameters",
|
"test.fixtures.parameters",
|
||||||
"test.fixtures.pdf",
|
"test.fixtures.pdf",
|
||||||
"test.fixtures.target",
|
"test.fixtures.target",
|
||||||
"test.unit_tests.image_stitching_test"
|
"test.unit_tests.image_stitching_test",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
|
||||||
def mute_logger():
|
|
||||||
logger = get_logger()
|
|
||||||
level = logger.level
|
|
||||||
logger.setLevel(logging.CRITICAL + 1)
|
|
||||||
yield
|
|
||||||
logger.setLevel(level)
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
outs:
|
outs:
|
||||||
- md5: 4b0fec291ce0661b3efbbd8b80f4f514.dir
|
- md5: 08bf8a63f04b3f19f859008556699708.dir
|
||||||
size: 107332
|
size: 7979836
|
||||||
nfiles: 4
|
nfiles: 7
|
||||||
path: data
|
path: data
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user