Compare commits
178 Commits
refactorin...master
SHA1:
0027421628, 00740c91b8, a3d79eb9af, 373f9f2d01, 2429d90dd5, 2b85999258, 4b15d2c2ca, bf1ca8d6f9, 9a4b8cad2b, 28adb50330,
7a3fdf8fa4, 3fbcd65e9b, 90a60b4b7c, 526de8984c, 99cbf3c9bf, 986137e729, f950b96cfb, 2385d19bc2, 16f2f0d557, afa6fc34cb,
a192e05be2, d23034e38a, 4bc53cf88b, e737f64ed2, 4b099f0106, b3a58d6777, c888453cc6, bf9ab4b1a2, 9ff88a1e5d, c852434b75,
8655e25ec0, 103c19d4cd, 530001a0af, a6c11a9db5, 1796c1bcbb, f4b9ff54aa, 278b42e368, 9600e4ca23, 8485345dd1, d1a523c7d6,
278f54eaa7, 443c2614f9, 4102a564a3, 7f49642ba0, ba8d1dfdfe, 150d0d64e5, a024ddfcf7, 13cbfa4ddf, 75af55dbda, 499c501acf,
6163e29d6b, dadc0a4163, 729ce17de0, 88fbe077e6, f8ecef1054, 5f44cc6560, b60f4d0383, 87873cc3a3, 523ca1db7d, c25f6902e0,
9e336ecc01, 0efa2127d7, 501fd48d69, 4a825cb264, 694a6ccb33, 1d043f97fc, 7cac73f07b, 133fde67ba, 946cfff630, f73264874e,
d3868efb4e, f0c2282197, 57e1ec1a14, 8b9771373b, cd3ce653e1, d8075aad38, 2b3043bc1e, 3ad0345f4e, 134156f59d, 1205f2e0ed,
8ee966c721, 892742ef17, 06b1af9f1a, 0194ce3f7e, 41d08f7b5b, b91d5a0ab2, 7b37f3c913, c32005b841, 6406ce6b25, 4ecafb2977,
967c2fad1b, b74e79f113, 50c791f6ca, adb363842d, 81520b1a53, ed25af33ad, 1967945ff7, faf4d7ed0f, 7c7b038491, cd3e215776,
bc1bd96e6c, 2001e9d7f3, 846f127d3b, d4657f1ab1, ee99d76aab, 00b40c0632, c1ae8e6a4b, 0bdf5a726a, d505ac4e50, 7dca05a53d,
c1449134ec, 29c76e7ebf, ecc9f69d9c, 4bcadcd266, 9065ec1d12, d239368d70, b5dc5aa777, 54b7ba24e8, 463f4da92b, 79455f0dd6,
2bc9c24f6a, ea301b4df2, 5cdf93b923, 4d43e385c5, bd0279ddd1, 2995d5ee48, eff1bb4124, c478333111, 978f48e8f9, 94652aafe4,
c4416636c0, c0b41e77b8, 73f7491c8f, 2385584dcb, b880e892ec, 8c7349c2d1, c55777e339, 0f440bdb09, 436a32ad2b, 9ec6cc19ba,
2d385b0a73, 5bd5e0cf2b, 876260f403, 368c54a8be, 1490d27308, 4eb7f3c40a, 98dc001123, 25fc7d84b9, d63f8c4eaf, 549b2aac5c,
c72ef26a6c, 561a7f527c, 48dd52131d, 053837722b, 98e639d83f, 13d4427c78, 9763d2ca65, 521222eb96, ebfdc14265, e54819e687,
d1190f7efe, d13b8436e2, 520eee26e3, c1b96290df, 3405a34893, f787b957f8, 5d611d5fae, c14d00cac8, fd0e4dc3cf, 9f18ef9cd1,
6030f4055a, eb050a588b, d55f77e1fa, 1e65d672d7, e7d229c0d7, ddd8d4685e, eb18ae8719, a9d60654f5
63  .coveragerc  (new file)
@@ -0,0 +1,63 @@
# .coveragerc to control coverage.py
[run]
branch = True
parallel = True
command_line = -m pytest
concurrency = multiprocessing
omit =
    */site-packages/*
    */distutils/*
    */test/*
    */__init__.py
    */setup.py
    */venv/*
    */env/*
    */build_venv/*
    */build_env/*
    */utils/banner.py
    */utils/logger.py
    */src/*
source =
    image_prediction
relative_files = True
data_file = .coverage

[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    # Have to re-enable the standard pragma
    pragma: no cover

    # Don't complain about missing debug-only code:
    def __repr__
    if self\.debug

    # Don't complain if tests don't hit defensive assertion code:
    raise AssertionError
    raise NotImplementedError

    # Don't complain if non-runnable code isn't run:
    if 0:
    if __name__ == .__main__.:
omit =
    */site-packages/*
    */distutils/*
    */test/*
    */__init__.py
    */setup.py
    */venv/*
    */env/*
    */build_venv/*
    */build_env/*
    */utils/banner.py
    */utils/logger.py
    */src/*
    */pdf_annotation.py

ignore_errors = True

[html]
directory = reports

[xml]
output = reports/coverage.xml
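The configuration above is normally exercised through the coverage CLI (see the test command in README.md and Dockerfile_tests). Purely as an illustration, the same settings can also be driven from coverage.py's Python API; this is a minimal sketch, not code from the repository, and it assumes it is run from the repository root where .coveragerc lives.

```python
# Minimal sketch: run the test suite under coverage.py using the .coveragerc above.
import coverage
import pytest

cov = coverage.Coverage(config_file=".coveragerc")  # picks up the [run]/[report]/[xml] sections
cov.start()
exit_code = pytest.main(["test", "--tb=native", "-q"])  # run the tests in-process so they are traced
cov.stop()
cov.save()

cov.combine()                  # merge the per-process data files written because parallel = True
cov.report(show_missing=True)  # console report, like `coverage report -m`
cov.xml_report()               # writes reports/coverage.xml as configured in [xml]
print("pytest exit code:", exit_code)
```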
@@ -1,5 +1,8 @@
 [core]
-    remote = vector
+    remote = azure_remote
+    autostage = true
 ['remote "vector"']
-    url = ssh://vector.iqser.com/research/image_service/
+    url = ssh://vector.iqser.com/research/image-prediction/
     port = 22
+['remote "azure_remote"']
+    url = azure://image-classification-dvc/
11  .gitignore  (vendored)
@@ -1,7 +1,8 @@
 .vscode/
 *.h5
-/venv/
+*venv
 .idea/
+src/data
 
 !.gitignore
 *.project
@@ -32,6 +33,9 @@
 **/classpath-data.json
 **/dependencies-and-licenses-overview.txt
+
+.coverage
+.coverage\.*\.*
 
 
 *__pycache__
 *.egg-info*
@@ -44,7 +48,6 @@
 *misc
 
 /coverage_html_report/
-.coverage
 
 # Created by https://www.toptal.com/developers/gitignore/api/linux,pycharm
 # Edit at https://www.toptal.com/developers/gitignore?templates=linux,pycharm
@@ -170,6 +173,4 @@ fabric.properties
 # https://plugins.jetbrains.com/plugin/12206-codestream
 .idea/codestream.xml
 
 # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
-/image_prediction/data/mlruns/
-/data/mlruns/
51  .gitlab-ci.yml  (new file)
@@ -0,0 +1,51 @@
include:
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/dvc.gitlab-ci.yml"
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"

variables:
  NEXUS_PROJECT_DIR: red
  IMAGENAME: "${CI_PROJECT_NAME}"
  INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.

stages:
  - data
  - setup
  - tests
  - sonarqube
  - versioning
  - build
  - integration-tests
  - release

docker-build:
  extends: .docker-build
  needs:
    - job: dvc-pull
      artifacts: true
    - !reference [.needs-versioning, needs] # leave this line as is

###################
# INTEGRATION TESTS
trigger-integration-tests:
  extends: .integration-tests
  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
  # needs:
  #   - job: docker-build::model_name
  #     artifacts: true
  rules:
    - when: never

#########
# RELEASE
release:
  extends: .release
  needs:
    - !reference [.needs-versioning, needs] # leave this line as is
3  .gitmodules  (vendored, deleted)
@@ -1,3 +0,0 @@
[submodule "incl/redai_image"]
    path = incl/redai_image
    url = ssh://git@git.iqser.com:2222/rr/redai_image.git
1  .python-version  (new file)
@@ -0,0 +1 @@
3.10
80  Dockerfile
@@ -1,25 +1,73 @@
-ARG BASE_ROOT="nexus.iqser.com:5001/red/"
-ARG VERSION_TAG="latest"
-
-FROM ${BASE_ROOT}image-prediction-base:${VERSION_TAG}
-
-WORKDIR /app/service
-
-COPY src src
-COPY data data
-COPY image_prediction image_prediction
-COPY incl/redai_image/redai incl/redai_image/redai
-COPY setup.py setup.py
-COPY requirements.txt requirements.txt
-COPY config.yaml config.yaml
-
-# Install dependencies differing from base image.
-RUN python3 -m pip install -r requirements.txt
-RUN python3 -m pip install -e .
-RUN python3 -m pip install -e incl/redai_image/redai
+FROM python:3.10-slim AS builder
+
+ARG GITLAB_USER
+ARG GITLAB_ACCESS_TOKEN
+
+ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
+ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
+
+ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
+ARG POETRY_SOURCE_REF_RED=gitlab-red
+
+ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
+ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
+
+ARG VERSION=dev
+
+LABEL maintainer="Research <research@knecon.com>"
+LABEL version="${VERSION}"
+
+WORKDIR /app
+
+###########
+# ENV SETUP
+ENV PYTHONDONTWRITEBYTECODE=true
+ENV PYTHONUNBUFFERED=true
+ENV POETRY_HOME=/opt/poetry
+ENV PATH="$POETRY_HOME/bin:$PATH"
+
+RUN apt-get update && \
+    apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -sSL https://install.python-poetry.org | python3 -
+RUN poetry --version
+
+COPY pyproject.toml poetry.lock ./
+
+RUN poetry config virtualenvs.create true && \
+    poetry config virtualenvs.in-project true && \
+    poetry config installer.max-workers 10 && \
+    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
+    poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
+    poetry install --without=dev -vv --no-interaction --no-root
+
+###############
+# WORKING IMAGE
+FROM python:3.10-slim
+
+WORKDIR /app
+
+# COPY SOURCE CODE FROM BUILDER IMAGE
+COPY --from=builder /app /app
+# COPY BILL OF MATERIALS (BOM)
+COPY bom.json /bom.json
+
+ENV PATH="/app/.venv/bin:$PATH"
+
+###################
+# COPY SOURCE CODE
+COPY ./src ./src
+COPY ./config ./config
+COPY ./data ./data
+COPY banner.txt ./
+
 EXPOSE 5000
 EXPOSE 8080
 
-CMD ["python3", "src/serve.py"]
+CMD [ "python", "src/serve.py"]
Deleted file:
@@ -1,25 +0,0 @@
FROM python:3.8 as builder1

# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"

# Upgrade pip.
RUN python -m pip install --upgrade pip

# Make a directory for the service files and copy the service repo into the container.
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt

# Install dependencies.
RUN python3 -m pip install -r requirements.txt

# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.8

WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"

WORKDIR /app/service
43  Dockerfile_tests  (new file)
@@ -0,0 +1,43 @@
FROM python:3.10

ARG USERNAME
ARG TOKEN
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
ARG VERSION=dev

LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"

WORKDIR /app

ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"

RUN curl -sSL https://install.python-poetry.org | python3 -

COPY ./data ./data
COPY ./test ./test
COPY ./config ./config
COPY ./src ./src
COPY pyproject.toml poetry.lock banner.txt config.yaml ./

RUN poetry config virtualenvs.create false && \
    poetry config installer.max-workers 10 && \
    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
    poetry install --without=dev -vv --no-interaction --no-root

EXPOSE 5000
EXPOSE 8080

RUN apt update --yes
RUN apt install vim --yes
RUN apt install poppler-utils --yes

CMD coverage run -m pytest test/ --tb=native -q -s -vvv -x && coverage combine && coverage report -m && coverage xml
136  README.md
@@ -1,25 +1,143 @@
-### Building
+### Setup
 
 Build base image
 ```bash
-setup/docker.sh
-```
-
-Build head image
-```bash
-docker build -f Dockerfile -t image-prediction . --build-arg BASE_ROOT=""
+docker build -t image-classification-image --progress=plain --no-cache \
+    -f Dockerfile \
+    --build-arg USERNAME=$GITLAB_USER \
+    --build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
+    .
 ```
 
 ### Usage
 
+#### Without Docker
+
+```bash
+py scripts/run_pipeline.py /path/to/a/pdf
+```
+
+#### With Docker
+
 Shell 1
 
 ```bash
-docker run --rm --net=host --rm image-prediction
+docker run --rm --net=host image-prediction
 ```
 
 Shell 2
 
 ```bash
-python scripts/pyinfra_mock.py --pdf_path /path/to/a/pdf
+python scripts/pyinfra_mock.py /path/to/a/pdf
 ```
+
+### Tests
+
+Run for example this command to execute all tests and get a coverage report:
+
+```bash
+coverage run -m pytest test --tb=native -q -s -vvv -x && coverage combine && coverage report -m
+```
+
+After having built the service container as specified above, you can also run tests in a container as follows:
+
+```bash
+./run_tests.sh
+```
+
+### Message Body Formats
+
+#### Request Format
+
+The request messages need to provide the fields `"dossierId"` and `"fileId"`. A request should look like this:
+
+```json
+{
+  "dossierId": "<string identifier>",
+  "fileId": "<string identifier>"
+}
+```
+
+Any additional keys are ignored.
+#### Response Format
+
+Response bodies contain information about the identified class of the image, the confidence of the classification, the
+position and size of the image as well as the results of additional convenience filters which can be configured through
+environment variables. A response body looks like this:
+
+```json
+{
+  "dossierId": "debug",
+  "fileId": "13ffa9851740c8d20c4c7d1706d72f2a",
+  "data": [...]
+}
+```
+
+An image metadata record (entry in `"data"` field of a response body) looks like this:
+
+```json
+{
+  "classification": {
+    "label": "logo",
+    "probabilities": {
+      "logo": 1.0,
+      "signature": 1.1599173226749333e-17,
+      "other": 2.994595513398207e-23,
+      "formula": 4.352109377281029e-31
+    }
+  },
+  "position": {
+    "x1": 475.95,
+    "x2": 533.4,
+    "y1": 796.47,
+    "y2": 827.62,
+    "pageNumber": 6
+  },
+  "geometry": {
+    "width": 57.44999999999999,
+    "height": 31.149999999999977
+  },
+  "alpha": false,
+  "filters": {
+    "geometry": {
+      "imageSize": {
+        "quotient": 0.05975350599135938,
+        "tooLarge": false,
+        "tooSmall": false
+      },
+      "imageFormat": {
+        "quotient": 1.8443017656500813,
+        "tooTall": false,
+        "tooWide": false
+      }
+    },
+    "probability": {
+      "unconfident": false
+    },
+    "allPassed": true
+  }
+}
+```
+
+## Configuration
+
+A configuration file is located under `config.yaml`. All relevant variables can be configured via
+exporting environment variables.
+
+| __Environment Variable__ | Default | Description |
+|---|---|---|
+| __LOGGING_LEVEL_ROOT__ | "INFO" | Logging level for log file messages |
+| __VERBOSE__ | *true* | Service prints document processing progress to stdout |
+| __BATCH_SIZE__ | 16 | Number of images in memory simultaneously per service instance |
+| __RUN_ID__ | "fabfb1f192c745369b88cab34471aba7" | The ID of the mlflow run to load the image classifier from |
+| __MIN_REL_IMAGE_SIZE__ | 0.05 | Minimally permissible image size to page size ratio |
+| __MAX_REL_IMAGE_SIZE__ | 0.75 | Maximally permissible image size to page size ratio |
+| __MIN_IMAGE_FORMAT__ | 0.1 | Minimally permissible image width to height ratio |
+| __MAX_IMAGE_FORMAT__ | 10 | Maximally permissible image width to height ratio |
+
+See also: https://git.iqser.com/projects/RED/repos/helm/browse/redaction/templates/image-service-v2
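As a rough illustration of the request and response formats documented above, the sketch below publishes a single request to the service's input queue and counts which returned records pass all filters. It is not part of the repository: it assumes a locally reachable RabbitMQ broker, the default request_queue/response_queue names from config/pyinfra.toml, and the third-party pika client, which is not among the project's dependencies.

```python
# Illustrative only: send one classification request and inspect a reply.
import json
import time

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()

request = {"dossierId": "debug", "fileId": "13ffa9851740c8d20c4c7d1706d72f2a"}
channel.basic_publish(exchange="", routing_key="request_queue", body=json.dumps(request))

# Check the response queue a few times; the service answers asynchronously.
body = None
for _ in range(30):
    _method, _properties, body = channel.basic_get(queue="response_queue", auto_ack=True)
    if body is not None:
        break
    time.sleep(1)

if body is not None:
    response = json.loads(body)
    records = response.get("data", [])
    passed = [rec for rec in records if rec["filters"]["allPassed"]]
    print(f"{len(passed)} of {len(records)} detected images passed all filters")

connection.close()
```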
Deleted file:
@@ -1,40 +0,0 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>com.atlassian.bamboo</groupId>
        <artifactId>bamboo-specs-parent</artifactId>
        <version>7.1.2</version>
        <relativePath/>
    </parent>

    <artifactId>bamboo-specs</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <sonar.skip>true</sonar.skip>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.atlassian.bamboo</groupId>
            <artifactId>bamboo-specs-api</artifactId>
        </dependency>
        <dependency>
            <groupId>com.atlassian.bamboo</groupId>
            <artifactId>bamboo-specs</artifactId>
        </dependency>

        <!-- Test dependencies -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <!-- run 'mvn test' to perform offline validation of the plan -->
    <!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>
Deleted file:
@@ -1,182 +0,0 @@
package buildjob;

import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;

/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "image-prediction";
    private static final String SERVICE_NAME_BASE = "image-prediction-base";

    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");

    /**
     * Run main to publish plan on Bamboo
     */
    public static void main(final String[] args) throws Exception {
        //By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");

        Plan plan = new PlanSpec().createDockerBuildPlan();
        bambooServer.publish(plan);
        PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);
    }

    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
                .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .loggedInUserPermissions(PermissionType.VIEW)
                .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }

    private Project project() {
        return new Project()
                .name("RED")
                .key(new BambooKey("RED"));
    }

    public Plan createDockerBuildPlan() {
        return new Plan(
                project(),
                SERVICE_NAME, new BambooKey(SERVICE_KEY))
                .description("Docker build for image-prediction.")
                // .variables()
                .stages(new Stage("Build Stage")
                        .jobs(
                            new Job("Build Job", new BambooKey("BUILD"))
                                .tasks(
                                    new CleanWorkingDirectoryTask()
                                        .description("Clean working directory.")
                                        .enabled(true),
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new VcsCheckoutTask()
                                        .description("Checkout redai_image research repository.")
                                        .checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
                                    new ScriptTask()
                                        .description("Set config and keys.")
                                        .inlineBody("mkdir -p ~/.ssh\n" +
                                            "echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
                                            "echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
                                            "echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
                                            "chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
                                    new ScriptTask()
                                        .description("Build Docker container.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
                                        .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                                        .volume("/var/run/docker.sock", "/var/run/docker.sock")),
                            new Job("Sonar Job", new BambooKey("SONAR"))
                                .tasks(
                                    new CleanWorkingDirectoryTask()
                                        .description("Clean working directory.")
                                        .enabled(true),
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new VcsCheckoutTask()
                                        .description("Checkout redai_image repository.")
                                        .checkoutItems(new CheckoutItem().repository("RR / redai_image").path("redai_image")),
                                    new ScriptTask()
                                        .description("Set config and keys.")
                                        .inlineBody("mkdir -p ~/.ssh\n" +
                                            "echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d >> ~/.ssh/id_rsa\n" +
                                            "echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
                                            "echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
                                            "chmod 600 ~/.ssh/config ~/.ssh/id_rsa"),
                                    new ScriptTask()
                                        .description("Run Sonarqube scan.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
                                        .argument(SERVICE_NAME))
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                                        .volume("/var/run/docker.sock", "/var/run/docker.sock"))),
                    new Stage("Licence Stage")
                        .jobs(
                            new Job("Git Tag Job", new BambooKey("GITTAG"))
                                .tasks(
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new ScriptTask()
                                        .description("Build git tag.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/git-tag.sh"),
                                    new InjectVariablesTask()
                                        .description("Inject git tag.")
                                        .path("git.tag")
                                        .namespace("g")
                                        .scope(InjectVariablesScope.LOCAL),
                                    new VcsTagTask()
                                        .description("${bamboo.g.gitTag}")
                                        .tagName("${bamboo.g.gitTag}")
                                        .defaultRepository())
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
                            new Job("Licence Job", new BambooKey("LICENCE"))
                                .enabled(false)
                                .tasks(
                                    new VcsCheckoutTask()
                                        .description("Checkout default repository.")
                                        .checkoutItems(new CheckoutItem().defaultRepository()),
                                    new ScriptTask()
                                        .description("Build licence.")
                                        .location(Location.FILE)
                                        .fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
                                .dockerConfiguration(
                                    new DockerConfiguration()
                                        .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                                        .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                                        .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
                .linkedRepositories("RR / " + SERVICE_NAME)
                .linkedRepositories("RR / redai_image")
                .triggers(new BitbucketServerTrigger())
                .planBranchManagement(new PlanBranchManagement()
                        .createForVcsBranch()
                        .delete(new BranchCleanup()
                                .whenInactiveInRepositoryAfterDays(14))
                        .notificationForCommitters());
    }

}
Deleted file:
@@ -1,19 +0,0 @@
#!/bin/bash
set -e

if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
then
    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
        -f ${bamboo_build_working_directory}/pom.xml \
        versions:set \
        -DnewVersion=${bamboo_version_tag}

    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
        -f ${bamboo_build_working_directory}/pom.xml \
        -B clean deploy \
        -e -DdeployAtEnd=true \
        -Dmaven.wagon.http.ssl.insecure=true \
        -Dmaven.wagon.http.ssl.allowall=true \
        -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
        -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi
Deleted file:
@@ -1,19 +0,0 @@
#!/bin/bash
set -e

SERVICE_NAME=$1
SERVICE_NAME_BASE=$2

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
dvc pull

echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
Deleted file:
@@ -1,9 +0,0 @@
#!/bin/bash
set -e

if [[ "${bamboo_version_tag}" = "dev" ]]
then
    echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
else
    echo "gitTag=${bamboo_version_tag}" > git.tag
fi
Deleted file:
@@ -1,51 +0,0 @@
#!/bin/bash
set -e

export JAVA_HOME=/usr/bin/sonar-scanner/jre

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

echo "dev setup for unit test and coverage 💖"

pip install -e .
pip install -r requirements.txt

SERVICE_NAME=$1

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
    --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
    --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"

if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    /usr/bin/sonar-scanner/bin/sonar-scanner \
        -Dsonar.projectKey=RED_$SERVICE_NAME \
        -Dsonar.sources=image_prediction \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
        -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
        -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
        -Dsonar.python.coverage.reportPaths=reports/coverage.xml

else
    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
    /usr/bin/sonar-scanner/bin/sonar-scanner \
        -Dsonar.projectKey=RED_$SERVICE_NAME \
        -Dsonar.sources=image_prediction \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
        -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
        -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
        -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
        -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
        -Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi
Deleted file:
@@ -1,16 +0,0 @@
package buildjob;


import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;

public class PlanSpecTest {
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        Plan plan = new PlanSpec().createDockerBuildPlan();

        EntityPropertiesBuilders.build(plan);
    }
}
11  banner.txt  (new file)
@@ -0,0 +1,11 @@
+----------------------------------------------------+
| ___ |
| __/_ `. .-"""-. |
|_._ _,-'""`-._ \_,` | \-' / )`-')|
|(,-.`._,'( |\`-/| "") `"` \ ((`"` |
| `-.-' \ )-`( , o o) ___Y , .'7 /| |
| `- \`_`"'- (_,___/...-` (_/_/ |
| |
+----------------------------------------------------+
| Image Classification Service |
+----------------------------------------------------+
28  config.yaml  (deleted)
@@ -1,28 +0,0 @@
webserver:
  host: $SERVER_HOST|"127.0.0.1" # webserver address
  port: $SERVER_PORT|5000 # webserver port
  mode: $SERVER_MODE|production # webserver mode: {development, production}

service:
  logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for service logger
  batch_size: $BATCH_SIZE|32 # Number of images in memory simultaneously
  verbose: $VERBOSE|True # Service prints document processing progress to stdout
  run_id: $RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the model from


# These variables control filters that are applied to either images, image metadata or model predictions. The filter
# result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the filters returned values did not meet its specified
# required value.
filters:

  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible

  image_width_to_height_quotient: # Image width to height ratio
    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible

  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
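The removed config.yaml relied on EnvYAML's "$VARIABLE|default" placeholders, which is why every setting could be overridden simply by exporting an environment variable of the same name. A minimal sketch of that mechanism, with a made-up override value:

```python
# Minimal sketch of the EnvYAML override mechanism used by the removed config.yaml.
import os

from envyaml import EnvYAML

os.environ["BATCH_SIZE"] = "16"      # overrides the "$BATCH_SIZE|32" default
config = EnvYAML("config.yaml")      # resolves $VAR|default placeholders on load

print(config["service.batch_size"])  # -> 16
print(config["webserver.port"])      # -> 5000, falls back to the default since SERVER_PORT is unset
```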
68  config/pyinfra.toml  (new file)
@@ -0,0 +1,68 @@

[asyncio]
max_concurrent_tasks = 10

[dynamic_tenant_queues]
enabled = true

[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"

[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"

[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"

[webserver]
host = "0.0.0.0"
port = 8080

[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"

tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds

service_request_queue_prefix = "image_request_queue"
service_request_exchange_name = "image_request_exchange"
service_response_exchange_name = "image_response_exchange"
service_dlq_name = "image_dlq"

[storage]
backend = "s3"

[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"

[storage.azure]
container = "redaction"
connection_string = ""

[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

[kubernetes]
pod_name = "test_pod"
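The rabbitmq comments above state an operational constraint: connection_sleep has to divide the heartbeat interval. A small illustrative check of the shipped values, assuming the third-party tomli package, since Python 3.10 has no standard-library TOML parser:

```python
# Illustrative only: load config/pyinfra.toml and sanity-check the RabbitMQ timing values.
import tomli  # third-party; assumed here because Python 3.10 ships no stdlib TOML reader

with open("config/pyinfra.toml", "rb") as handle:
    config = tomli.load(handle)

rabbitmq = config["rabbitmq"]
heartbeat = rabbitmq["heartbeat"]
connection_sleep = rabbitmq["connection_sleep"]

# connection_sleep should divide the heartbeat interval (see the comment in the config).
assert heartbeat % connection_sleep == 0, "connection_sleep must divide heartbeat"
print(f"RabbitMQ heartbeat={heartbeat}s, connection_sleep={connection_sleep}s")
```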
42  config/settings.toml  (new file)
@@ -0,0 +1,42 @@
[logging]
level = "INFO"

[service]
# Print document processing progress to stdout
verbose = false
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"

# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5

# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75

[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0

# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10

# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4

[filters.overrides.logo.image_to_page_quotient]
min = 0.06
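The [filters.overrides.*] tables layer class-specific limits on top of the global filter settings. The diff does not show how the service merges them, so the following sketch only illustrates the intended precedence (global values first, then per-label overrides):

```python
# Illustrative sketch of per-class filter overrides (not the service's actual code):
# class-specific values take precedence over the global filter settings.
global_filters = {"image_to_page_quotient": {"min": 0.05, "max": 0.75}}
overrides = {
    "signature": {"image_to_page_quotient": {"max": 0.4}},
    "logo": {"image_to_page_quotient": {"min": 0.06}},
}


def effective_filters(label: str) -> dict:
    """Merge the global filters with the overrides for one predicted class."""
    merged = {name: dict(values) for name, values in global_filters.items()}
    for name, values in overrides.get(label, {}).items():
        merged.setdefault(name, {}).update(values)
    return merged


print(effective_filters("signature"))  # {'image_to_page_quotient': {'min': 0.05, 'max': 0.4}}
print(effective_filters("other"))      # falls back to the global values
```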
1  data/.gitignore  (vendored, new file)
@@ -0,0 +1 @@
/mlruns
Deleted file:
@@ -1,4 +0,0 @@
outs:
- md5: 6d0186c1f25e889d531788f168fa6cf0
  size: 16727296
  path: base_weights.h5
@@ -1,5 +1,5 @@
 outs:
-- md5: d1c708270bab6fcd344d4a8b05d1103d.dir
-  size: 150225383
-  nfiles: 178
+- md5: ad061d607f615afc149643f62dbf37cc.dir
+  size: 166952700
+  nfiles: 179
   path: mlruns
1  doc/tests.drawio  (new file)
@@ -0,0 +1 @@
<mxfile host="app.diagrams.net" modified="2022-03-17T15:35:10.371Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36" etag="b-CbBXg6FXQ9T3Px-oLc" version="17.1.1" type="device"><diagram id="tS3WR_Pr6QhNVK3FqSUP" name="Page-1">1ZZRT6QwEMc/DY8mQHdRX93z9JLbmNzGmNxbQ0daLQzpDrL46a/IsCzinneJcd0XaP+dtsN/fkADscg3V06WeokKbBCHahOIb0Ecnydzf22FphPmyXknZM6oTooGYWWegcWQ1cooWI8CCdGSKcdiikUBKY006RzW47B7tONdS5nBRFil0k7VO6NId+rZPBz0azCZ7neOQh7JZR/MwlpLhfWOJC4DsXCI1LXyzQJs613vSzfv+57RbWIOCvqXCZqW9PBref27aZ7xsQ5vTn/cnvAqT9JW/MCwJuNzR8dZU9Nb4bAqFLSrhYG4qLUhWJUybUdrX3uvacqt70W+yeuCI9jsTTja2uDxAcyBXONDeILonWN04hn366EQUR+jd4qQsCa59tl26cEe32CH/sOt+TueoCONGRbS/kQs2YkHIGoYbFkRvuUTqAmFr1zyu2LlUvhLdjG/HtJlQO/VfOq6AyvJPI3z+HAL4wlwpbp/2V0qODxzUTJmLjo4c8nEkxaWFXcLLPzt4ithKI4BQzHBMOc/l8UvAeLrj9/hQTw9NhBnxwDibB+IB+ZvdvZ5/PnucAx6Gds5S4rLPw==</diagram></mxfile>
Deleted file:
@@ -1,40 +0,0 @@
"""Implements a config object with dot-indexing syntax."""


from envyaml import EnvYAML

from image_prediction.locations import CONFIG_FILE


def _get_item_and_maybe_make_dotindexable(container, item):
    ret = container[item]
    return DotIndexable(ret) if isinstance(ret, dict) else ret


class DotIndexable:
    def __init__(self, x):
        self.x = x

    def __getattr__(self, item):
        return _get_item_and_maybe_make_dotindexable(self.x, item)

    def __setitem__(self, key, value):
        self.x[key] = value

    def __repr__(self):
        return self.x.__repr__()


class Config:
    def __init__(self, config_path):
        self.__config = EnvYAML(config_path)

    def __getattr__(self, item):
        if item in self.__config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)

    def __getitem__(self, item):
        return self.__getattr__(item)


CONFIG = Config(CONFIG_FILE)
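For reference, the removed Config wrapper gave dot-indexed access to the (likewise removed) config.yaml; a short illustration of how the other removed modules used it:

```python
# Illustration of the removed dot-indexing wrapper (values refer to the removed config.yaml).
from image_prediction.config import CONFIG

print(CONFIG.webserver.port)      # nested keys resolved through DotIndexable; 5000 unless SERVER_PORT is exported
print(CONFIG.service.batch_size)  # the same pattern appears in the removed predictor and response modules
```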
Deleted file:
@@ -1,14 +0,0 @@
from os import path

MODULE_DIR = path.dirname(path.abspath(__file__))
PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR)
REPO_ROOT_DIR = path.dirname(path.dirname(PACKAGE_ROOT_DIR))

DOCKER_COMPOSE_FILE = path.join(REPO_ROOT_DIR, "docker-compose.yaml")

CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
LOG_FILE = "/tmp/log.log"

DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
MLRUNS_DIR = path.join(DATA_DIR, "mlruns")
BASE_WEIGHTS = path.join(DATA_DIR, "base_weights.h5")
Deleted file:
@@ -1,116 +0,0 @@
import logging
from itertools import chain
from operator import itemgetter
from typing import List, Dict, Iterable

import numpy as np

from image_prediction.config import CONFIG
from image_prediction.locations import MLRUNS_DIR, BASE_WEIGHTS
from incl.redai_image.redai.redai.backend.model.model_handle import ModelHandle
from incl.redai_image.redai.redai.backend.pdf.image_extraction import extract_and_stitch
from incl.redai_image.redai.redai.utils.mlflow_reader import MlflowModelReader
from incl.redai_image.redai.redai.utils.shared import chunk_iterable


class Predictor:
    """`ModelHandle` wrapper. Forwards to wrapped model handle for prediction and produces structured output that is
    interpretable independently of the wrapped model (e.g. with regard to a .classes_ attribute).
    """

    def __init__(self, model_handle: ModelHandle = None):
        """Initializes a ServiceEstimator.

        Args:
            model_handle: ModelHandle object to forward to for prediction. By default, a model handle is loaded from the
                mlflow database via CONFIG.service.run_id.
        """
        try:
            if model_handle is None:
                reader = MlflowModelReader(run_id=CONFIG.service.run_id, mlruns_dir=MLRUNS_DIR)
                self.model_handle = reader.get_model_handle(BASE_WEIGHTS)
            else:
                self.model_handle = model_handle

            self.classes = self.model_handle.model.classes_
            self.classes_readable = np.array(self.model_handle.classes)
            self.classes_readable_aligned = self.classes_readable[self.classes[list(range(len(self.classes)))]]
        except Exception as e:
            logging.info(f"Service estimator initialization failed: {e}")

    def __make_predictions_human_readable(self, probs: np.ndarray) -> List[Dict[str, float]]:
        """Translates an n x m matrix of probabilities over classes into an n-element list of mappings from classes to
        probabilities.

        Args:
            probs: probability matrix (items x classes)

        Returns:
            list of mappings from classes to probabilities.
        """
        classes = np.argmax(probs, axis=1)
        classes = self.classes[classes]
        classes_readable = [self.model_handle.classes[c] for c in classes]
        return classes_readable

    def predict(self, images: List, probabilities: bool = False, **kwargs):
        """Gathers predictions for list of images. Assigns each image a class and optionally a probability distribution
        over all classes.

        Args:
            images (List[PIL.Image]) : Images to gather predictions for.
            probabilities: Whether to return dictionaries of the following form instead of strings:
                {
                    "class": predicted class,
                    "probabilities": {
                        "class 1" : class 1 probability,
                        "class 2" : class 2 probability,
                        ...
                    }
                }

        Returns:
            By default the return value is a list of classes (meaningful class name strings). Alternatively a list of
            dictionaries with an additional probability field for estimated class probabilities per image can be
            returned.
        """
        X = self.model_handle.prep_images(list(images))

        probs_per_item = self.model_handle.model.predict_proba(X, **kwargs).astype(float)
        classes = self.__make_predictions_human_readable(probs_per_item)

        class2prob_per_item = [dict(zip(self.classes_readable_aligned, probs)) for probs in probs_per_item]
        class2prob_per_item = [
            dict(sorted(c2p.items(), key=itemgetter(1), reverse=True)) for c2p in class2prob_per_item
        ]

        predictions = [{"class": c, "probabilities": c2p} for c, c2p in zip(classes, class2prob_per_item)]

        return predictions if probabilities else classes


def extract_image_metadata_pairs(pdf_path: str, **kwargs):
    def image_is_large_enough(metadata: dict):
        x1, x2, y1, y2 = itemgetter("x1", "x2", "y1", "y2")(metadata)

        return abs(x1 - x2) > 2 and abs(y1 - y2) > 2

    yield from extract_and_stitch(pdf_path, convert_to_rgb=True, filter_fn=image_is_large_enough, **kwargs)


def classify_images(predictor, image_metadata_pairs: Iterable, batch_size: int = CONFIG.service.batch_size):
    def process_chunk(chunk):
        images, metadata = zip(*chunk)
        predictions = predictor.predict(images, probabilities=True)
        return predictions, metadata

    def predict(image_metadata_pair_generator):
        chunks = chunk_iterable(image_metadata_pair_generator, n=batch_size)
        return map(chain.from_iterable, zip(*map(process_chunk, chunks)))

    try:
        predictions, metadata = predict(image_metadata_pairs)
        return predictions, metadata

    except ValueError:
        return [], []
Deleted file:
@@ -1,71 +0,0 @@
"""Defines functions for constructing service responses."""


from itertools import starmap
from operator import itemgetter

import numpy as np

from image_prediction.config import CONFIG


def build_response(predictions: list, metadata: list) -> list:
    return list(starmap(build_image_info, zip(predictions, metadata)))


def build_image_info(prediction: dict, metadata: dict) -> dict:
    def compute_geometric_quotient():
        page_area_sqrt = np.sqrt(abs(page_width * page_height))
        image_area_sqrt = np.sqrt(abs(x2 - x1) * abs(y2 - y1))
        return image_area_sqrt / page_area_sqrt

    page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height"
    )(metadata)

    quotient = compute_geometric_quotient()

    min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
    max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
    min_image_width_to_height_quotient_breached = bool(
        width / height < CONFIG.filters.image_width_to_height_quotient.min
    )
    max_image_width_to_height_quotient_breached = bool(
        width / height > CONFIG.filters.image_width_to_height_quotient.max
    )

    min_confidence_breached = bool(max(prediction["probabilities"].values()) < CONFIG.filters.min_confidence)
    prediction["label"] = prediction.pop("class")  # "class" as field name causes problem for Java objectmapper
    prediction["probabilities"] = {klass: np.round(prob, 6) for klass, prob in prediction["probabilities"].items()}

    image_info = {
        "classification": prediction,
        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": metadata["page_idx"] + 1},
        "geometry": {"width": width, "height": height},
        "filters": {
            "geometry": {
                "imageSize": {
                    "quotient": quotient,
                    "tooLarge": max_image_to_page_quotient_breached,
                    "tooSmall": min_image_to_page_quotient_breached,
                },
                "imageFormat": {
                    "quotient": width / height,
                    "tooTall": min_image_width_to_height_quotient_breached,
                    "tooWide": max_image_width_to_height_quotient_breached,
                },
            },
            "probability": {"unconfident": min_confidence_breached},
            "allPassed": not any(
                [
                    max_image_to_page_quotient_breached,
                    min_image_to_page_quotient_breached,
                    min_image_width_to_height_quotient_breached,
                    max_image_width_to_height_quotient_breached,
                    min_confidence_breached,
                ]
            ),
        },
    }

    return image_info
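The geometric quotient computed by the removed build_image_info helper is simply the square root of the image area divided by the square root of the page area. A quick worked example using the positions from the README's sample record; the page size is an assumed, roughly A4-sized value in points and is not taken from the repository:

```python
# Worked example of the geometric quotient: sqrt(image area) / sqrt(page area).
from math import sqrt

page_width, page_height = 595.0, 842.0          # assumed page size in points, for illustration only
x1, x2, y1, y2 = 475.95, 533.4, 796.47, 827.62  # positions from the README example record

image_area_sqrt = sqrt(abs(x2 - x1) * abs(y2 - y1))
page_area_sqrt = sqrt(abs(page_width * page_height))
quotient = image_area_sqrt / page_area_sqrt

print(round(quotient, 4))  # ~0.06, inside the global 0.05 to 0.75 bounds, so the size filter passes
```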
Deleted submodule reference:
@@ -1 +0,0 @@
Subproject commit 4c3b26d7673457aaa99e0663dad6950cd36da967
7267 poetry.lock (generated, Normal file)
File diff suppressed because it is too large
73 pyproject.toml (Normal file)
@@ -0,0 +1,73 @@
[tool.poetry]
name = "image-classification-service"
version = "2.17.0"
description = ""
authors = ["Team Research <research@knecon.com>"]
readme = "README.md"
packages = [{ include = "image_prediction", from = "src" }]

[tool.poetry.dependencies]
python = ">=3.10,<3.11"
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
# see RED-9948.
pyinfra = { version = "3.4.2", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
dvc = "^2.34.0"
dvc-ssh = "^2.20.0"
dvc-azure = "^2.21.2"
Flask = "^2.1.1"
requests = "^2.27.1"
iteration-utilities = "^0.11.0"
waitress = "^2.1.1"
envyaml = "^1.10.211231"
dependency-check = "^0.6.0"
mlflow = "^1.24.0"
numpy = "^1.22.3"
tqdm = "^4.64.0"
pandas = "^1.4.2"
# FIXME: Our current model significantly changes the prediction behaviour when using newer tensorflow (/ protobuf)
# versions, which is introduced by pyinfra updates using newer protobuf versions, see RED-9948.
tensorflow = "2.9.0"
protobuf = "^3.20"
pytest = "^7.1.0"
funcy = "^2"
PyMuPDF = "^1.19.6"
fpdf = "^1.7.2"
coverage = "^6.3.2"
Pillow = "^9.1.0"
pdf2image = "^1.16.0"
frozendict = "^2.3.0"
fsspec = "^2022.11.0"
PyMonad = "^2.4.0"
pdfnetpython3 = "9.4.2"
loguru = "^0.7.0"
cyclonedx-bom = "^4.5.0"

[tool.poetry.group.dev.dependencies]
pytest = "^7.0.1"
pymonad = "^2.4.0"
pylint = "^2.17.4"
ipykernel = "^6.23.2"

[tool.pytest.ini_options]
testpaths = ["test"]
addopts = "--ignore=data"
filterwarnings = ["ignore:.*:DeprecationWarning"]

[[tool.poetry.source]]
name = "PyPI"
priority = "primary"

[[tool.poetry.source]]
name = "gitlab-research"
url = "https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi/simple"
priority = "explicit"

[[tool.poetry.source]]
name = "gitlab-red"
url = "https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi/simple"
priority = "explicit"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
@@ -1,21 +0,0 @@
Flask==2.0.2
requests==2.27.1
iteration-utilities==0.11.0
dvc==2.9.3
dvc[ssh]
frozendict==2.3.0
waitress==2.0.0
envyaml~=1.8.210417
dependency-check==0.6.*
envyaml~=1.8.210417
mlflow~=1.20.2
numpy~=1.19.3
PDFNetPython3~=9.1.0
tqdm~=4.62.2
pandas~=1.3.1
mlflow~=1.20.2
tensorflow~=2.5.0
PDFNetPython3~=9.1.0
Pillow~=8.3.2
PyYAML~=5.4.1
scikit_learn~=0.24.2
46 scripts/debug/debug.py (Normal file)
@@ -0,0 +1,46 @@
"""Script to debug RED-9948. The predictions unexpectedly changed for some images, and we need to understand why."""

import json
import random
from pathlib import Path

import numpy as np
import tensorflow as tf
from kn_utils.logging import logger

from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline


def process_pdf(pipeline, pdf_path, page_range=None):
    with open(pdf_path, "rb") as f:
        logger.info(f"Processing {pdf_path}")
        predictions = list(pipeline(f.read(), page_range=page_range))

    return predictions


def ensure_seeds():
    seed = 42
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)


def debug_info():
    devices = tf.config.list_physical_devices()
    print("Available devices:", devices)


if __name__ == "__main__":
    # For in-container debugging, copy the file and adjust the path.
    debug_file_path = Path(__file__).parents[2] / "test" / "data" / "RED-9948" / "SYNGENTA_EFSA_sanitisation_GFL_v2"
    ensure_seeds()
    debug_info()

    pipeline = load_pipeline(verbose=True, batch_size=CONFIG.service.batch_size)
    predictions = process_pdf(pipeline, debug_file_path)
    # This is the image that has the wrong prediction mentioned in RED-9948. The prediction should be inconclusive,
    # and the allPassed flag should be false.
    predictions = [x for x in predictions if x["representation"] == "FA30F080F0C031CE17E8CF237"]
    print(json.dumps(predictions, indent=2))
30 scripts/devenvsetup.sh (Normal file)
@@ -0,0 +1,30 @@
#!/bin/bash
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3

# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created

# cd $latest_dir

pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version

pip install --upgrade pip
pip install poetry

poetry config installer.max-workers 10
# research package registry
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
# redactmanager package registry
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}

poetry env use $(pyenv which python)
poetry install --with=dev
poetry update

source .venv/bin/activate
6 scripts/docker_build_run.sh (Normal file)
@@ -0,0 +1,6 @@
docker build --platform linux/amd64 -t image-classification-service:$(poetry version -s)-dev \
    -f Dockerfile \
    --build-arg GITLAB_USER=$GITLAB_USER \
    --build-arg GITLAB_ACCESS_TOKEN=$GITLAB_ACCESS_TOKEN \
    . && \
docker run -it --rm image-classification-service:$(poetry version -s)-dev
3 scripts/docker_tag_push.sh (Normal file)
@@ -0,0 +1,3 @@
docker tag image-classification-service:$(poetry version -s)-dev $NEXUS_REGISTRY/red/image-classification-service:$(poetry version -s)-dev

docker push $NEXUS_REGISTRY/red/image-classification-service:$(poetry version -s)-dev
6 scripts/k8s_startup_probe.py (Normal file)
@@ -0,0 +1,6 @@
from pyinfra.k8s_probes import startup
from loguru import logger

if __name__ == "__main__":
    logger.debug("running health check")
    startup.run_checks()
58 scripts/keras_MnWE.py (Normal file)
@@ -0,0 +1,58 @@
import multiprocessing

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers


def process(predict_fn_wrapper):
    # We observed memory doesn't get properly deallocated unless we do this:
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(
        target=predict_fn_wrapper,
        args=(return_dict,),
    )
    p.start()
    p.join()
    try:
        return dict(return_dict)["result"]
    except KeyError:
        pass


def make_model():
    inputs = keras.Input(shape=(784,))
    dense = layers.Dense(64, activation="relu")
    x = dense(inputs)
    outputs = layers.Dense(10)(x)
    model = keras.Model(inputs=inputs, outputs=outputs, name="mnist_model")
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=keras.optimizers.RMSprop(),
        metrics=["accuracy"],
    )
    return model


def make_predict_fn():
    # Keras bug: doesn't work in outer scope
    model = make_model()

    def predict(*args):
        # service_estimator = make_model()
        return model.predict(np.random.random(size=(1, 784)))

    return predict


def make_predict_fn_wrapper(predict_fn):
    def predict_fn_wrapper(return_dict):
        return_dict["result"] = predict_fn()

    return predict_fn_wrapper


if __name__ == "__main__":
    predict_fn = make_predict_fn()
    print(process(make_predict_fn_wrapper(predict_fn)))
@@ -6,7 +6,7 @@ import requests

 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--pdf_path", required=True)
+    parser.add_argument("pdf_path")
     args = parser.parse_args()

     return args
58 scripts/run_pipeline.py (Normal file)
@@ -0,0 +1,58 @@
import argparse
import json
import os
from glob import glob

from image_prediction.config import CONFIG
from image_prediction.pipeline import load_pipeline
from image_prediction.utils import get_logger
from image_prediction.utils.pdf_annotation import annotate_pdf

logger = get_logger()

logger.setLevel("DEBUG")


def parse_args():
    parser = argparse.ArgumentParser()

    parser.add_argument("input", help="pdf file or directory")
    parser.add_argument("--print", "-p", help="print output to terminal", action="store_true", default=False)
    parser.add_argument("--page_interval", "-i", help="page interval [i, j), min index = 0", nargs=2, type=int)

    args = parser.parse_args()

    return args


def process_pdf(pipeline, pdf_path, page_range=None):
    with open(pdf_path, "rb") as f:
        logger.info(f"Processing {pdf_path}")
        predictions = list(pipeline(f.read(), page_range=page_range))

    annotate_pdf(
        pdf_path, predictions, os.path.join("/tmp", os.path.basename(pdf_path.replace(".pdf", "_annotated.pdf")))
    )

    return predictions


def main(args):
    pipeline = load_pipeline(
        verbose=CONFIG.service.verbose,
        batch_size=CONFIG.service.batch_size,
        tolerance=CONFIG.service.image_stiching_tolerance,
    )

    if os.path.isfile(args.input):
        pdf_paths = [args.input]
    else:
        pdf_paths = glob(os.path.join(args.input, "*.pdf"))
    page_range = range(*args.page_interval) if args.page_interval else None

    for pdf_path in pdf_paths:
        predictions = process_pdf(pipeline, pdf_path, page_range=page_range)
        if args.print:
            print(pdf_path)
            print(json.dumps(predictions, indent=2))


if __name__ == "__main__":
    args = parse_args()
    main(args)
15 scripts/run_tests.sh (Executable file)
@@ -0,0 +1,15 @@
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001

pip install dvc
pip install 'dvc[ssh]'
echo "Pulling dvc data"
dvc pull

docker build -f Dockerfile_tests -t image-prediction-tests .

rnd=$(date +"%s")
name=image-prediction-tests-${rnd}

echo "running tests container"

docker run --rm --name $name -v $PWD:$PWD -w $PWD -v /var/run/docker.sock:/var/run/docker.sock image-prediction-tests
13 setup.py
@@ -1,13 +0,0 @@
#!/usr/bin/env python

from distutils.core import setup

setup(
    name="image_prediction",
    version="0.1.0",
    description="",
    author="",
    author_email="",
    url="",
    packages=["image_prediction"],
)
@@ -1,15 +0,0 @@
#!/bin/bash
set -e

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
dvc pull

git submodule update --init --recursive

docker build -f Dockerfile_base -t image-prediction-base .
docker build -f Dockerfile -t image-prediction .
@@ -1,4 +0,0 @@
sonar.exclusions=bamboo-specs/**, **/test_data/**
sonar.c.file.suffixes=-
sonar.cpp.file.suffixes=-
sonar.objc.file.suffixes=-
13
src/image_prediction/__init__.py
Normal file
13
src/image_prediction/__init__.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# log config
|
||||||
|
LOG_FORMAT = "%(asctime)s [%(levelname)s] - [%(filename)s -> %(funcName)s() -> %(lineno)s] : %(message)s"
|
||||||
|
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
||||||
|
stream_handler = logging.StreamHandler(sys.stdout)
|
||||||
|
stream_handler_format = logging.Formatter(LOG_FORMAT, datefmt=DATE_FORMAT)
|
||||||
|
stream_handler.setFormatter(stream_handler_format)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.propagate = False
|
||||||
|
logger.addHandler(stream_handler)
|
||||||
35
src/image_prediction/classifier/classifier.py
Normal file
35
src/image_prediction/classifier/classifier.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
from typing import List, Union, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL.Image import Image
|
||||||
|
from funcy import rcompose
|
||||||
|
|
||||||
|
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
|
||||||
|
from image_prediction.label_mapper.mapper import LabelMapper
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class Classifier:
|
||||||
|
def __init__(self, estimator_adapter: EstimatorAdapter, label_mapper: LabelMapper):
|
||||||
|
"""Abstraction layer over different estimator backends (e.g. keras or scikit-learn). For each backend to be used
|
||||||
|
an EstimatorAdapter must be implemented.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
estimator_adapter: adapter for a given estimator backend
|
||||||
|
"""
|
||||||
|
self.__estimator_adapter = estimator_adapter
|
||||||
|
self.__label_mapper = label_mapper
|
||||||
|
self.__pipe = rcompose(self.__estimator_adapter, self.__label_mapper)
|
||||||
|
|
||||||
|
def predict(self, batch: Union[np.array, Tuple[Image]]) -> List[str]:
|
||||||
|
|
||||||
|
if isinstance(batch, np.ndarray) and batch.shape[0] == 0:
|
||||||
|
return []
|
||||||
|
|
||||||
|
return self.__pipe(batch)
|
||||||
|
|
||||||
|
def __call__(self, batch: np.array) -> List[str]:
|
||||||
|
logger.debug("Classifier.predict")
|
||||||
|
return self.predict(batch)
|
||||||
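A minimal assembly sketch for this class, assuming a hypothetical `dummy_estimator` callable in place of a real keras model and reusing the repository's `EstimatorAdapter` and `ProbabilityMapper`:

import numpy as np

from image_prediction.classifier.classifier import Classifier
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
from image_prediction.label_mapper.mappers.probability import ProbabilityMapper

def dummy_estimator(batch):
    # Stand-in for a keras model: returns one probability row per input image.
    return np.tile([0.7, 0.2, 0.1], (len(batch), 1))

classifier = Classifier(EstimatorAdapter(dummy_estimator), ProbabilityMapper(["logo", "photo", "signature"]))
# The adapter runs the estimator; the mapper turns each probability row into a label/probabilities dict.
print(list(classifier(np.zeros((2, 224, 224, 3)))))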
32
src/image_prediction/classifier/image_classifier.py
Normal file
32
src/image_prediction/classifier/image_classifier.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
from itertools import chain
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from PIL.Image import Image
|
||||||
|
from funcy import rcompose, chunks
|
||||||
|
|
||||||
|
from image_prediction.classifier.classifier import Classifier
|
||||||
|
from image_prediction.estimator.preprocessor.preprocessor import Preprocessor
|
||||||
|
from image_prediction.estimator.preprocessor.preprocessors.identity import IdentityPreprocessor
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ImageClassifier:
|
||||||
|
"""Combines a classifier with a preprocessing pipeline: Receives images, chunks into batches, converts to tensors,
|
||||||
|
applies transformations and finally sends to internal classifier.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, classifier: Classifier, preprocessor: Preprocessor = None):
|
||||||
|
self.estimator = classifier
|
||||||
|
self.preprocessor = preprocessor if preprocessor else IdentityPreprocessor()
|
||||||
|
self.pipe = rcompose(self.preprocessor, self.estimator)
|
||||||
|
|
||||||
|
def predict(self, images: Iterable[Image], batch_size=16):
|
||||||
|
batches = chunks(batch_size, images)
|
||||||
|
predictions = chain.from_iterable(map(self.pipe, batches))
|
||||||
|
return predictions
|
||||||
|
|
||||||
|
def __call__(self, images: Iterable[Image], batch_size=16):
|
||||||
|
logger.debug("ImageClassifier.predict")
|
||||||
|
yield from self.predict(images, batch_size=batch_size)
|
||||||
16
src/image_prediction/compositor/compositor.py
Normal file
16
src/image_prediction/compositor/compositor.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from funcy import rcompose
|
||||||
|
|
||||||
|
from image_prediction.transformer.transformer import Transformer
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerCompositor(Transformer):
|
||||||
|
def __init__(self, formatter: Transformer, *formatters: Transformer):
|
||||||
|
formatters = (formatter, *formatters)
|
||||||
|
self.pipe = rcompose(*formatters)
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
logger.debug("TransformerCompositor.transform")
|
||||||
|
return self.pipe(obj)
|
||||||
7
src/image_prediction/config.py
Normal file
7
src/image_prediction/config.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pyinfra.config.loader import load_settings
|
||||||
|
|
||||||
|
from image_prediction.locations import PROJECT_ROOT_DIR
|
||||||
|
|
||||||
|
CONFIG = load_settings(root_path=PROJECT_ROOT_DIR, settings_path="config")
|
||||||
43
src/image_prediction/default_objects.py
Normal file
43
src/image_prediction/default_objects.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
from funcy import juxt
|
||||||
|
|
||||||
|
from image_prediction.classifier.classifier import Classifier
|
||||||
|
from image_prediction.classifier.image_classifier import ImageClassifier
|
||||||
|
from image_prediction.compositor.compositor import TransformerCompositor
|
||||||
|
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
|
||||||
|
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
|
||||||
|
from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
|
||||||
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
|
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
|
||||||
|
from image_prediction.label_mapper.mappers.probability import ProbabilityMapper
|
||||||
|
from image_prediction.model_loader.loader import ModelLoader
|
||||||
|
from image_prediction.model_loader.loaders.mlflow import MlflowConnector
|
||||||
|
from image_prediction.redai_adapter.mlflow import MlflowModelReader
|
||||||
|
from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
|
||||||
|
from image_prediction.transformer.transformers.response import ResponseTransformer
|
||||||
|
|
||||||
|
|
||||||
|
def get_mlflow_model_loader(mlruns_dir):
|
||||||
|
model_loader = ModelLoader(MlflowConnector(MlflowModelReader(mlruns_dir)))
|
||||||
|
return model_loader
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_classifier(model_loader, model_identifier):
|
||||||
|
model, classes = juxt(model_loader.load_model, model_loader.load_classes)(model_identifier)
|
||||||
|
return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes)))
|
||||||
|
|
||||||
|
|
||||||
|
def get_extractor(**kwargs):
|
||||||
|
image_extractor = ParsablePDFImageExtractor(**kwargs)
|
||||||
|
|
||||||
|
return image_extractor
|
||||||
|
|
||||||
|
|
||||||
|
def get_formatter():
|
||||||
|
formatter = TransformerCompositor(
|
||||||
|
PDFNetCoordinateTransformer(), EnumFormatter(), ResponseTransformer(), Snake2CamelCaseKeyFormatter()
|
||||||
|
)
|
||||||
|
return formatter
|
||||||
|
|
||||||
|
|
||||||
|
def get_encoder():
|
||||||
|
return HashEncoder()
|
||||||
13
src/image_prediction/encoder/encoder.py
Normal file
13
src/image_prediction/encoder/encoder.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
import abc
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
|
|
||||||
|
class Encoder(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def encode(self, images: Iterable[Image]):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, images: Iterable[Image], batch_size=16):
|
||||||
|
yield from self.encode(images)
|
||||||
0
src/image_prediction/encoder/encoders/__init__.py
Normal file
0
src/image_prediction/encoder/encoders/__init__.py
Normal file
26
src/image_prediction/encoder/encoders/hash_encoder.py
Normal file
26
src/image_prediction/encoder/encoders/hash_encoder.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from image_prediction.encoder.encoder import Encoder
|
||||||
|
|
||||||
|
|
||||||
|
class HashEncoder(Encoder):
|
||||||
|
def encode(self, images: Iterable[Image.Image]):
|
||||||
|
yield from map(hash_image, images)
|
||||||
|
|
||||||
|
def __call__(self, images: Iterable[Image.Image], batch_size=16):
|
||||||
|
yield from self.encode(images)
|
||||||
|
|
||||||
|
|
||||||
|
def hash_image(image: Image.Image) -> str:
|
||||||
|
"""See: https://stackoverflow.com/a/49692185/3578468"""
|
||||||
|
image = image.resize((10, 10), Image.ANTIALIAS)
|
||||||
|
image = image.convert("L")
|
||||||
|
pixel_data = list(image.getdata())
|
||||||
|
avg_pixel = sum(pixel_data) / len(pixel_data)
|
||||||
|
bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
|
||||||
|
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
|
||||||
|
# Note: For each 4 leading zeros, the hex representation will be shorter by one character.
|
||||||
|
# To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
|
||||||
|
return hex_representation.zfill(25)
|
||||||
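A small usage sketch of `hash_image`, assuming the Pillow version pinned in pyproject.toml (where `Image.ANTIALIAS` still exists); the input image here is generated rather than extracted from a PDF:

from PIL import Image

from image_prediction.encoder.encoders.hash_encoder import hash_image

# A flat grey image: every pixel equals the average, so every bit of the 100-bit fingerprint is 1.
image = Image.new("RGB", (64, 64), color=(128, 128, 128))
representation = hash_image(image)
# The hash is always padded to 25 hex characters, so visually identical images compare equal as strings.
print(len(representation), representation)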
0
src/image_prediction/estimator/__init__.py
Normal file
0
src/image_prediction/estimator/__init__.py
Normal file
0
src/image_prediction/estimator/adapter/__init__.py
Normal file
0
src/image_prediction/estimator/adapter/__init__.py
Normal file
15
src/image_prediction/estimator/adapter/adapter.py
Normal file
15
src/image_prediction/estimator/adapter/adapter.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class EstimatorAdapter:
|
||||||
|
def __init__(self, estimator):
|
||||||
|
self.estimator = estimator
|
||||||
|
|
||||||
|
def predict(self, batch):
|
||||||
|
return self.estimator(batch)
|
||||||
|
|
||||||
|
def __call__(self, batch):
|
||||||
|
logger.debug("EstimatorAdapter.predict")
|
||||||
|
return self.predict(batch)
|
||||||
10
src/image_prediction/estimator/preprocessor/preprocessor.py
Normal file
10
src/image_prediction/estimator/preprocessor/preprocessor.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
|
||||||
|
class Preprocessor(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def preprocess(self, batch):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, batch):
|
||||||
|
return self.preprocess(batch)
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
from image_prediction.estimator.preprocessor.preprocessor import Preprocessor
|
||||||
|
from image_prediction.estimator.preprocessor.utils import images_to_batch_tensor
|
||||||
|
|
||||||
|
|
||||||
|
class BasicPreprocessor(Preprocessor):
|
||||||
|
"""Converts images to tensors"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def preprocess(images):
|
||||||
|
return images_to_batch_tensor(images)
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
from image_prediction.estimator.preprocessor.preprocessor import Preprocessor
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityPreprocessor(Preprocessor):
|
||||||
|
@staticmethod
|
||||||
|
def preprocess(images):
|
||||||
|
return images
|
||||||
|
|
||||||
|
def __call__(self, images):
|
||||||
|
return self.preprocess(images)
|
||||||
10
src/image_prediction/estimator/preprocessor/utils.py
Normal file
10
src/image_prediction/estimator/preprocessor/utils.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import numpy as np
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
|
|
||||||
|
def image_to_normalized_tensor(image: Image) -> np.ndarray:
|
||||||
|
return np.array(image) / 255
|
||||||
|
|
||||||
|
|
||||||
|
def images_to_batch_tensor(images) -> np.ndarray:
|
||||||
|
return np.array(list(map(image_to_normalized_tensor, images)))
|
||||||
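A short sketch of `images_to_batch_tensor`; it assumes all images in the batch share the same dimensions so the stacked array is regular:

import numpy as np
from PIL import Image

from image_prediction.estimator.preprocessor.utils import images_to_batch_tensor

# Two same-sized images become one (2, 32, 32, 3) batch with values scaled to [0, 1].
images = [Image.new("RGB", (32, 32), color=(255, 0, 0)), Image.new("RGB", (32, 32), color=(0, 0, 255))]
batch = images_to_batch_tensor(images)
print(batch.shape, batch.max())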
42
src/image_prediction/exceptions.py
Normal file
42
src/image_prediction/exceptions.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
class UnknownEstimatorAdapter(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownImageExtractor(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownModelLoader(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownDatabaseType(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownLabelFormat(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class UnexpectedLabelFormat(ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class IncorrectInstantiation(RuntimeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class IntentionalTestException(RuntimeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidBox(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ParsingError(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class BadXref(ValueError):
|
||||||
|
pass
|
||||||
13
src/image_prediction/extraction.py
Normal file
13
src/image_prediction/extraction.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
|
||||||
|
|
||||||
|
|
||||||
|
def extract_images_from_pdf(pdf, extractor=None):
|
||||||
|
|
||||||
|
if not extractor:
|
||||||
|
extractor = ParsablePDFImageExtractor()
|
||||||
|
|
||||||
|
try:
|
||||||
|
images_extracted, metadata_extracted = zip(*extractor(pdf))
|
||||||
|
return images_extracted, metadata_extracted
|
||||||
|
except ValueError:
|
||||||
|
return [], []
|
||||||
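A minimal usage sketch of `extract_images_from_pdf`; the path `example.pdf` is hypothetical and stands for any parsable PDF:

from pathlib import Path

from image_prediction.extraction import extract_images_from_pdf

pdf_bytes = Path("example.pdf").read_bytes()
images, metadata = extract_images_from_pdf(pdf_bytes)
# Each image is a PIL Image; each metadatum carries page and bounding-box info keyed by the Info enum.
for image, metadatum in zip(images, metadata):
    print(image.size, metadatum)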
60
src/image_prediction/flask.py
Normal file
60
src/image_prediction/flask.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
from prometheus_client import generate_latest, CollectorRegistry, Summary
|
||||||
|
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
from image_prediction.utils.process_wrapping import wrap_in_process
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def make_prediction_server(predict_fn: Callable):
|
||||||
|
app = Flask(__name__)
|
||||||
|
registry = CollectorRegistry(auto_describe=True)
|
||||||
|
metric = Summary(
    "redactmanager_imageClassification_seconds", "Time spent on image-service classification.", registry=registry
)
|
||||||
|
|
||||||
|
@app.route("/ready", methods=["GET"])
|
||||||
|
def ready():
|
||||||
|
resp = jsonify("OK")
|
||||||
|
resp.status_code = 200
|
||||||
|
return resp
|
||||||
|
|
||||||
|
@app.route("/health", methods=["GET"])
|
||||||
|
def healthy():
|
||||||
|
resp = jsonify("OK")
|
||||||
|
resp.status_code = 200
|
||||||
|
return resp
|
||||||
|
|
||||||
|
def __failure():
|
||||||
|
response = jsonify("Analysis failed")
|
||||||
|
response.status_code = 500
|
||||||
|
return response
|
||||||
|
|
||||||
|
@app.route("/predict", methods=["POST"])
|
||||||
|
@app.route("/", methods=["POST"])
|
||||||
|
@metric.time()
|
||||||
|
def predict():
|
||||||
|
|
||||||
|
# Tensorflow does not free RAM. Workaround: Run prediction function (which instantiates a model) in sub-process.
|
||||||
|
# See: https://stackoverflow.com/questions/39758094/clearing-tensorflow-gpu-memory-after-model-execution
|
||||||
|
predict_fn_wrapped = wrap_in_process(predict_fn)
|
||||||
|
|
||||||
|
logger.info("Analysing...")
|
||||||
|
predictions = predict_fn_wrapped(request.data)
|
||||||
|
|
||||||
|
if predictions is not None:
|
||||||
|
response = jsonify(predictions)
|
||||||
|
logger.info("Analysis completed.")
|
||||||
|
return response
|
||||||
|
else:
|
||||||
|
logger.error("Analysis failed.")
|
||||||
|
return __failure()
|
||||||
|
|
||||||
|
@app.route("/prometheus", methods=["GET"])
|
||||||
|
def prometheus():
|
||||||
|
return generate_latest(registry=registry)
|
||||||
|
|
||||||
|
return app
|
||||||
0
src/image_prediction/formatter/__init__.py
Normal file
0
src/image_prediction/formatter/__init__.py
Normal file
15
src/image_prediction/formatter/formatter.py
Normal file
15
src/image_prediction/formatter/formatter.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
from image_prediction.transformer.transformer import Transformer
|
||||||
|
|
||||||
|
|
||||||
|
class Formatter(Transformer):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def format(self, obj):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def __call__(self, obj):
|
||||||
|
return self.format(obj)
|
||||||
11
src/image_prediction/formatter/formatters/camel_case.py
Normal file
11
src/image_prediction/formatter/formatters/camel_case.py
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from image_prediction.formatter.formatters.key_formatter import KeyFormatter
|
||||||
|
|
||||||
|
|
||||||
|
class Snake2CamelCaseKeyFormatter(KeyFormatter):
|
||||||
|
def format_key(self, key):
|
||||||
|
|
||||||
|
if isinstance(key, str):
|
||||||
|
head, *tail = key.split("_")
|
||||||
|
return head + "".join(map(str.title, tail))
|
||||||
|
else:
|
||||||
|
return key
|
||||||
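A short usage sketch of this key formatter; keys are rewritten recursively while values are left untouched (the payload below is invented):

from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter

formatter = Snake2CamelCaseKeyFormatter()
# Keys are converted inside nested dicts and lists as well.
payload = {"page_width": 595, "filters": [{"all_passed": True, "image_to_page_quotient": 0.4}]}
print(formatter(payload))
# {'pageWidth': 595, 'filters': [{'allPassed': True, 'imageToPageQuotient': 0.4}]}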
23
src/image_prediction/formatter/formatters/enum.py
Normal file
23
src/image_prediction/formatter/formatters/enum.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from image_prediction.formatter.formatters.key_formatter import KeyFormatter
|
||||||
|
|
||||||
|
|
||||||
|
class EnumFormatter(KeyFormatter):
|
||||||
|
def format_key(self, key):
|
||||||
|
return key.value if isinstance(key, Enum) else key
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class ReverseEnumFormatter(KeyFormatter):
|
||||||
|
def __init__(self, enum):
|
||||||
|
self.enum = enum
|
||||||
|
self.reverse_enum = {e.value: e for e in enum}
|
||||||
|
|
||||||
|
def format_key(self, key):
|
||||||
|
return self.reverse_enum.get(key, key)
|
||||||
|
|
||||||
|
def transform(self, obj):
|
||||||
|
raise NotImplementedError
|
||||||
6
src/image_prediction/formatter/formatters/identity.py
Normal file
6
src/image_prediction/formatter/formatters/identity.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from image_prediction.formatter.formatter import Formatter
|
||||||
|
|
||||||
|
|
||||||
|
class IdentityFormatter(Formatter):
|
||||||
|
def format(self, obj):
|
||||||
|
return obj
|
||||||
28
src/image_prediction/formatter/formatters/key_formatter.py
Normal file
28
src/image_prediction/formatter/formatters/key_formatter.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
import abc
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from image_prediction.formatter.formatter import Formatter
|
||||||
|
|
||||||
|
|
||||||
|
class KeyFormatter(Formatter):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def format_key(self, key):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __format(self, data):
|
||||||
|
|
||||||
|
# If we wanted to do this properly, we would need handlers for all expected types and dispatch based
|
||||||
|
# on a type comparison. This is too much engineering for the limited use-case of this class though.
|
||||||
|
if isinstance(data, Iterable) and not isinstance(data, dict) and not isinstance(data, str):
|
||||||
|
f = map(self.__format, data)
|
||||||
|
return type(data)(f) if not isinstance(data, map) else f
|
||||||
|
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
return data
|
||||||
|
|
||||||
|
keys_formatted = list(map(self.format_key, data))
|
||||||
|
|
||||||
|
return dict(zip(keys_formatted, map(self.__format, data.values())))
|
||||||
|
|
||||||
|
def format(self, data):
|
||||||
|
return self.__format(data)
|
||||||
0
src/image_prediction/image_extractor/__init__.py
Normal file
0
src/image_prediction/image_extractor/__init__.py
Normal file
19
src/image_prediction/image_extractor/extractor.py
Normal file
19
src/image_prediction/image_extractor/extractor.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import abc
|
||||||
|
from collections import namedtuple
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
ImageMetadataPair = namedtuple("ImageMetadataPair", ["image", "metadata"])
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ImageExtractor(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def extract(self, obj) -> Iterable[ImageMetadataPair]:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, obj, **kwargs):
|
||||||
|
logger.debug("ImageExtractor.extract")
|
||||||
|
return self.extract(obj, **kwargs)
|
||||||
7
src/image_prediction/image_extractor/extractors/mock.py
Normal file
7
src/image_prediction/image_extractor/extractors/mock.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
|
|
||||||
|
|
||||||
|
class ImageExtractorMock(ImageExtractor):
|
||||||
|
def extract(self, image_container):
|
||||||
|
for i, image in enumerate(image_container):
|
||||||
|
yield ImageMetadataPair(image, {"image_id": i})
|
||||||
300
src/image_prediction/image_extractor/extractors/parsable.py
Normal file
300
src/image_prediction/image_extractor/extractors/parsable.py
Normal file
@ -0,0 +1,300 @@
|
|||||||
|
import atexit
|
||||||
|
import json
|
||||||
|
import traceback
|
||||||
|
from _operator import itemgetter
|
||||||
|
from functools import partial, lru_cache
|
||||||
|
from itertools import chain, starmap, filterfalse, tee
|
||||||
|
from operator import itemgetter, truth
|
||||||
|
from typing import Iterable, Iterator, List, Union
|
||||||
|
|
||||||
|
import fitz
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
from funcy import merge, pluck, compose, rcompose, remove, keep
|
||||||
|
from scipy.stats import gmean
|
||||||
|
|
||||||
|
from image_prediction.config import CONFIG
|
||||||
|
from image_prediction.exceptions import InvalidBox
|
||||||
|
from image_prediction.formatter.formatters.enum import EnumFormatter
|
||||||
|
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
|
||||||
|
from image_prediction.info import Info
|
||||||
|
from image_prediction.stitching.stitching import stitch_pairs
|
||||||
|
from image_prediction.stitching.utils import validate_box
|
||||||
|
from image_prediction.transformer.transformers.response import compute_geometric_quotient
|
||||||
|
from image_prediction.utils import get_logger
|
||||||
|
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class ParsablePDFImageExtractor(ImageExtractor):
|
||||||
|
def __init__(self, verbose=False, tolerance=0):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Args:
|
||||||
|
verbose: Whether to show progressbar
|
||||||
|
tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
|
||||||
|
together
|
||||||
|
"""
|
||||||
|
self.doc: fitz.Document = None
|
||||||
|
self.verbose = verbose
|
||||||
|
self.tolerance = tolerance
|
||||||
|
|
||||||
|
def extract(self, pdf: bytes, page_range: range = None):
|
||||||
|
self.doc = fitz.Document(stream=pdf)
|
||||||
|
|
||||||
|
pages = extract_pages(self.doc, page_range) if page_range else self.doc
|
||||||
|
|
||||||
|
image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))
|
||||||
|
|
||||||
|
yield from image_metadata_pairs
|
||||||
|
|
||||||
|
def __process_images_on_page(self, page: fitz.Page):
|
||||||
|
metadata = extract_valid_metadata(self.doc, page)
|
||||||
|
images = get_images_on_page(self.doc, metadata)
|
||||||
|
|
||||||
|
clear_caches()
|
||||||
|
|
||||||
|
image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
|
||||||
|
# TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
|
||||||
|
# validation here. Invalid images can then be split into a different stream and joined with the intact images
|
||||||
|
# again for the formatting step.
|
||||||
|
image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
|
||||||
|
image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)
|
||||||
|
|
||||||
|
yield from image_metadata_pairs
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
|
||||||
|
def validate_image_is_not_corrupt(image: Image.Image, metadata: dict):
|
||||||
|
"""See RED-5148: Some images are corrupt and cannot be processed by the image classifier. This function
|
||||||
|
filters out such images by trying to resize and convert them to RGB. If this fails, the image is considered
|
||||||
|
corrupt and is dropped.
|
||||||
|
TODO: find cleaner solution
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
image.resize((100, 100)).convert("RGB")
|
||||||
|
return ImageMetadataPair(image, metadata)
|
||||||
|
except Exception:
|
||||||
|
metadata = json.dumps(EnumFormatter()(metadata), indent=2)
|
||||||
|
logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
def filter_small_images_on_scanned_pages(image_metadata_pairs) -> Iterable[ImageMetadataPair]:
|
||||||
|
"""See RED-9746: Small images on scanned pages should be dropped, so they are not classified. This is a
|
||||||
|
heuristic to filter out images that are too small in relation to the page size if they are on a scanned page.
|
||||||
|
|
||||||
|
The ratio is computed as the geometric mean of the width and height of the image divided by the geometric mean
|
||||||
|
of the width and height of the page. If the ratio is below the threshold, the image is dropped.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def image_is_a_scanned_page(image_metadata_pair: ImageMetadataPair) -> bool:
|
||||||
|
tolerance = CONFIG.filters.is_scanned_page.tolerance
|
||||||
|
width_ratio = image_metadata_pair.metadata[Info.WIDTH] / image_metadata_pair.metadata[Info.PAGE_WIDTH]
|
||||||
|
height_ratio = (
|
||||||
|
image_metadata_pair.metadata[Info.HEIGHT] / image_metadata_pair.metadata[Info.PAGE_HEIGHT]
|
||||||
|
)
|
||||||
|
return width_ratio >= 1 - tolerance and height_ratio >= 1 - tolerance
|
||||||
|
|
||||||
|
def image_fits_geometric_mean_ratio(image_metadata_pair: ImageMetadataPair) -> bool:
|
||||||
|
min_ratio = CONFIG.filters.image_to_page_quotient.min
|
||||||
|
metadatum = image_metadata_pair.metadata
|
||||||
|
image_gmean = gmean([metadatum[Info.WIDTH], metadatum[Info.HEIGHT]])
|
||||||
|
page_gmean = gmean([metadatum[Info.PAGE_WIDTH], metadatum[Info.PAGE_HEIGHT]])
|
||||||
|
ratio = image_gmean / page_gmean
|
||||||
|
return ratio >= min_ratio
|
||||||
|
|
||||||
|
pairs, pairs_copy = tee(image_metadata_pairs)
|
||||||
|
|
||||||
|
if any(map(image_is_a_scanned_page, pairs_copy)):
|
||||||
|
logger.debug("Scanned page detected, filtering out small images ...")
|
||||||
|
return filter(image_fits_geometric_mean_ratio, pairs)
|
||||||
|
else:
|
||||||
|
return pairs
|
||||||
|
|
||||||
|
image_metadata_pairs = filter_small_images_on_scanned_pages(image_metadata_pairs)
|
||||||
|
|
||||||
|
return filter(truth, starmap(validate_image_is_not_corrupt, image_metadata_pairs))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages(doc, page_range):
|
||||||
|
page_range = range(page_range.start + 1, page_range.stop + 1)
|
||||||
|
pages = map(doc.load_page, page_range)
|
||||||
|
|
||||||
|
yield from pages
|
||||||
|
|
||||||
|
|
||||||
|
def get_images_on_page(doc, metadata):
|
||||||
|
xrefs = pluck(Info.XREF, metadata)
|
||||||
|
images = map(partial(xref_to_image, doc), xrefs)
|
||||||
|
|
||||||
|
yield from images
|
||||||
|
|
||||||
|
|
||||||
|
def extract_valid_metadata(doc: fitz.Document, page: fitz.Page):
|
||||||
|
metadata = get_metadata_for_images_on_page(page)
|
||||||
|
metadata = filter_valid_metadata(metadata)
|
||||||
|
metadata = add_alpha_channel_info(doc, metadata)
|
||||||
|
|
||||||
|
return list(metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def get_metadata_for_images_on_page(page: fitz.Page):
|
||||||
|
metadata = map(get_image_metadata, get_image_infos(page))
|
||||||
|
metadata = add_page_metadata(page, metadata)
|
||||||
|
|
||||||
|
yield from metadata
|
||||||
|
|
||||||
|
|
||||||
|
def filter_valid_metadata(metadata):
|
||||||
|
yield from compose(
|
||||||
|
# TODO: Disabled for now, since the backend currently needs the metadata and the hash of every image, even
# scanned pages. In the future, this should be resolved differently, e.g. by filtering all page-sized images
# and giving the user the ability to reclassify false positives with a separate call.
# filter_out_page_sized_images,
|
||||||
|
filter_out_tiny_images,
|
||||||
|
filter_out_invalid_metadata,
|
||||||
|
)(metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_out_invalid_metadata(metadata):
|
||||||
|
def __validate_box(box):
|
||||||
|
try:
|
||||||
|
return validate_box(box)
|
||||||
|
except InvalidBox as err:
|
||||||
|
logger.debug(f"Dropping invalid metadatum, reason: {err}")
|
||||||
|
|
||||||
|
yield from keep(__validate_box, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_out_page_sized_images(metadata):
|
||||||
|
yield from remove(breaches_image_to_page_quotient, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def filter_out_tiny_images(metadata):
|
||||||
|
yield from filterfalse(tiny, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def get_image_infos(page: fitz.Page) -> List[dict]:
|
||||||
|
return page.get_image_info(xrefs=True)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def xref_to_image(doc, xref) -> Union[Image.Image, None]:
|
||||||
|
# NOTE: image extraction is done via pixmap to array, as this method is twice as fast as extraction via bytestream
|
||||||
|
try:
|
||||||
|
pixmap = fitz.Pixmap(doc, xref)
|
||||||
|
array = convert_pixmap_to_array(pixmap)
|
||||||
|
return Image.fromarray(array)
|
||||||
|
except ValueError:
|
||||||
|
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
def convert_pixmap_to_array(pixmap: fitz.Pixmap):
|
||||||
|
array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(pixmap.h, pixmap.w, pixmap.n)
|
||||||
|
array = _normalize_channels(array)
|
||||||
|
return array
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_channels(array: np.ndarray):
|
||||||
|
if array.shape[-1] == 1:
|
||||||
|
array = array[:, :, 0]
|
||||||
|
elif array.shape[-1] == 4:
|
||||||
|
array = array[..., :3]
|
||||||
|
elif array.shape[-1] != 3:
|
||||||
|
logger.warning(f"Unexpected image format: {array.shape}.")
|
||||||
|
raise ValueError(f"Unexpected image format: {array.shape}.")
|
||||||
|
|
||||||
|
return array
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_metadata(image_info):
|
||||||
|
xref, coords = itemgetter("xref", "bbox")(image_info)
|
||||||
|
x1, y1, x2, y2 = map(rounder, coords)
|
||||||
|
|
||||||
|
width = abs(x2 - x1)
|
||||||
|
height = abs(y2 - y1)
|
||||||
|
|
||||||
|
return {
|
||||||
|
Info.WIDTH: width,
|
||||||
|
Info.HEIGHT: height,
|
||||||
|
Info.X1: x1,
|
||||||
|
Info.X2: x2,
|
||||||
|
Info.Y1: y1,
|
||||||
|
Info.Y2: y2,
|
||||||
|
Info.XREF: xref,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def add_page_metadata(page, metadata):
|
||||||
|
yield from map(partial(merge, get_page_metadata(page)), metadata)
|
||||||
|
|
||||||
|
|
||||||
|
def add_alpha_channel_info(doc, metadata):
|
||||||
|
def add_alpha_value_to_metadatum(metadatum):
|
||||||
|
alpha = metadatum_to_alpha_value(metadatum)
|
||||||
|
return {**metadatum, Info.ALPHA: alpha}
|
||||||
|
|
||||||
|
xref_to_alpha = partial(has_alpha_channel, doc)
|
||||||
|
metadatum_to_alpha_value = compose(xref_to_alpha, itemgetter(Info.XREF))
|
||||||
|
|
||||||
|
yield from map(add_alpha_value_to_metadatum, metadata)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def load_image_handle_from_xref(doc, xref):
|
||||||
|
try:
|
||||||
|
return doc.extract_image(xref)
|
||||||
|
except ValueError:
|
||||||
|
logger.debug(f"Xref {xref} is invalid, skipping extraction ...")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
rounder = rcompose(round, int)
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_metadata(page):
|
||||||
|
page_width, page_height = map(rounder, page.mediabox_size)
|
||||||
|
|
||||||
|
return {
|
||||||
|
Info.PAGE_WIDTH: page_width,
|
||||||
|
Info.PAGE_HEIGHT: page_height,
|
||||||
|
Info.PAGE_IDX: page.number,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def has_alpha_channel(doc, xref):
|
||||||
|
maybe_image = load_image_handle_from_xref(doc, xref)
|
||||||
|
maybe_smask = maybe_image["smask"] if maybe_image else None
|
||||||
|
|
||||||
|
if maybe_smask:
|
||||||
|
return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
return bool(fitz.Pixmap(doc, xref).alpha)
|
||||||
|
except ValueError:
|
||||||
|
logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def tiny(metadata):
|
||||||
|
return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4
|
||||||
|
|
||||||
|
|
||||||
|
def clear_caches():
|
||||||
|
get_image_infos.cache_clear()
|
||||||
|
load_image_handle_from_xref.cache_clear()
|
||||||
|
xref_to_image.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
|
atexit.register(clear_caches)
|
||||||
|
|
||||||
|
|
||||||
|
def breaches_image_to_page_quotient(metadatum):
|
||||||
|
page_width, page_height, x1, x2, y1, y2, width, height = itemgetter(
|
||||||
|
Info.PAGE_WIDTH, Info.PAGE_HEIGHT, Info.X1, Info.X2, Info.Y1, Info.Y2, Info.WIDTH, Info.HEIGHT
|
||||||
|
)(metadatum)
|
||||||
|
geometric_quotient = compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1)
|
||||||
|
quotient_breached = bool(geometric_quotient > CONFIG.filters.image_to_page_quotient.max)
|
||||||
|
return quotient_breached
|
||||||
15
src/image_prediction/info.py
Normal file
15
src/image_prediction/info.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class Info(Enum):
|
||||||
|
PAGE_WIDTH = "page_width"
|
||||||
|
PAGE_HEIGHT = "page_height"
|
||||||
|
PAGE_IDX = "page_idx"
|
||||||
|
WIDTH = "width"
|
||||||
|
HEIGHT = "height"
|
||||||
|
X1 = "x1"
|
||||||
|
X2 = "x2"
|
||||||
|
Y1 = "y1"
|
||||||
|
Y2 = "y2"
|
||||||
|
ALPHA = "alpha"
|
||||||
|
XREF = "xref"
|
||||||
0
src/image_prediction/label_mapper/__init__.py
Normal file
0
src/image_prediction/label_mapper/__init__.py
Normal file
10
src/image_prediction/label_mapper/mapper.py
Normal file
10
src/image_prediction/label_mapper/mapper.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
|
||||||
|
class LabelMapper(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def map_labels(self, items):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def __call__(self, items):
|
||||||
|
return self.map_labels(items)
|
||||||
20
src/image_prediction/label_mapper/mappers/numeric.py
Normal file
20
src/image_prediction/label_mapper/mappers/numeric.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from typing import Mapping, Iterable
|
||||||
|
|
||||||
|
from image_prediction.exceptions import UnexpectedLabelFormat
|
||||||
|
from image_prediction.label_mapper.mapper import LabelMapper
|
||||||
|
|
||||||
|
|
||||||
|
class IndexMapper(LabelMapper):
|
||||||
|
def __init__(self, labels: Mapping[int, str]):
|
||||||
|
self.__labels = labels
|
||||||
|
|
||||||
|
def __validate_index_label_format(self, index_label: int) -> None:
|
||||||
|
if not 0 <= index_label < len(self.__labels):
|
||||||
|
raise UnexpectedLabelFormat(f"Received index label '{index_label}' that has no associated string label.")
|
||||||
|
|
||||||
|
def __map_label(self, index_label: int) -> str:
|
||||||
|
self.__validate_index_label_format(index_label)
|
||||||
|
return self.__labels[index_label]
|
||||||
|
|
||||||
|
def map_labels(self, index_labels: Iterable[int]) -> Iterable[str]:
|
||||||
|
return map(self.__map_label, index_labels)
|
||||||
39
src/image_prediction/label_mapper/mappers/probability.py
Normal file
39
src/image_prediction/label_mapper/mappers/probability.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
from enum import Enum
|
||||||
|
from operator import itemgetter
|
||||||
|
from typing import Mapping, Iterable
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from funcy import rcompose, rpartial
|
||||||
|
|
||||||
|
from image_prediction.exceptions import UnexpectedLabelFormat
|
||||||
|
from image_prediction.label_mapper.mapper import LabelMapper
|
||||||
|
|
||||||
|
|
||||||
|
class ProbabilityMapperKeys(Enum):
|
||||||
|
LABEL = "label"
|
||||||
|
PROBABILITIES = "probabilities"
|
||||||
|
|
||||||
|
|
||||||
|
class ProbabilityMapper(LabelMapper):
|
||||||
|
def __init__(self, labels: Mapping[int, str]):
|
||||||
|
self.__labels = labels
|
||||||
|
# String conversion in the middle due to floating point precision issues.
|
||||||
|
# See: https://stackoverflow.com/questions/56820/round-doesnt-seem-to-be-rounding-properly
|
||||||
|
self.__rounder = rcompose(rpartial(round, 4), str, float)
|
||||||
|
|
||||||
|
def __validate_array_label_format(self, probabilities: np.ndarray) -> None:
|
||||||
|
if not len(probabilities) == len(self.__labels):
|
||||||
|
raise UnexpectedLabelFormat(
    f"Received {len(probabilities)} probabilities for {len(self.__labels)} labels."
)
|
||||||
|
|
||||||
|
def __map_array(self, probabilities: np.ndarray) -> dict:
|
||||||
|
self.__validate_array_label_format(probabilities)
|
||||||
|
cls2prob = dict(
|
||||||
|
sorted(zip(self.__labels, list(map(self.__rounder, probabilities))), key=itemgetter(1), reverse=True)
|
||||||
|
)
|
||||||
|
most_likely = [*cls2prob][0]
|
||||||
|
return {ProbabilityMapperKeys.LABEL: most_likely, ProbabilityMapperKeys.PROBABILITIES: cls2prob}
|
||||||
|
|
||||||
|
def map_labels(self, probabilities: Iterable[np.ndarray]) -> Iterable[dict]:
|
||||||
|
return map(self.__map_array, probabilities)
|
||||||
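A minimal usage sketch of `ProbabilityMapper`; the probability rows are invented and must have exactly one entry per label:

import numpy as np

from image_prediction.label_mapper.mappers.probability import ProbabilityMapper

mapper = ProbabilityMapper(["logo", "photo", "signature"])
rows = [np.array([0.1, 0.2, 0.7]), np.array([0.5, 0.3, 0.2])]
for result in mapper(rows):
    # Keys are ProbabilityMapperKeys enum members; EnumFormatter later turns them into plain strings.
    print(result)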
18
src/image_prediction/locations.py
Normal file
18
src/image_prediction/locations.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
"""Defines constant paths relative to the module root path."""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# FIXME: move these paths to config, only depending on 'ROOT_PATH' environment variable.
|
||||||
|
MODULE_DIR = Path(__file__).resolve().parents[0]
|
||||||
|
PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]
|
||||||
|
PROJECT_ROOT_DIR = PACKAGE_ROOT_DIR.parents[0]
|
||||||
|
|
||||||
|
CONFIG_FILE = PROJECT_ROOT_DIR / "config" / "settings.toml"
|
||||||
|
BANNER_FILE = PROJECT_ROOT_DIR / "banner.txt"
|
||||||
|
|
||||||
|
DATA_DIR = PROJECT_ROOT_DIR / "data"
|
||||||
|
MLRUNS_DIR = str(DATA_DIR / "mlruns")
|
||||||
|
|
||||||
|
TEST_DIR = PROJECT_ROOT_DIR / "test"
|
||||||
|
TEST_DATA_DIR = TEST_DIR / "data"
|
||||||
|
TEST_DATA_DIR_DVC = TEST_DIR / "data.dvc"
|
||||||
0
src/image_prediction/model_loader/__init__.py
Normal file
0
src/image_prediction/model_loader/__init__.py
Normal file
7
src/image_prediction/model_loader/database/connector.py
Normal file
7
src/image_prediction/model_loader/database/connector.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
|
||||||
|
class DatabaseConnector(abc.ABC):
|
||||||
|
@abc.abstractmethod
|
||||||
|
def get_object(self, identifier):
|
||||||
|
raise NotImplementedError
|
||||||
@ -0,0 +1,9 @@
|
|||||||
|
from image_prediction.model_loader.database.connector import DatabaseConnector
|
||||||
|
|
||||||
|
|
||||||
|
class DatabaseConnectorMock(DatabaseConnector):
|
||||||
|
def __init__(self, store: dict):
|
||||||
|
self.store = store
|
||||||
|
|
||||||
|
def get_object(self, identifier):
|
||||||
|
return self.store[identifier]
|
||||||
18
src/image_prediction/model_loader/loader.py
Normal file
18
src/image_prediction/model_loader/loader.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
from image_prediction.model_loader.database.connector import DatabaseConnector
|
||||||
|
|
||||||
|
|
||||||
|
class ModelLoader:
|
||||||
|
def __init__(self, database_connector: DatabaseConnector):
|
||||||
|
self.database_connector = database_connector
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def __get_object(self, identifier):
|
||||||
|
return self.database_connector.get_object(identifier)
|
||||||
|
|
||||||
|
def load_model(self, identifier):
|
||||||
|
return self.__get_object(identifier)["model"]
|
||||||
|
|
||||||
|
def load_classes(self, identifier):
|
||||||
|
return self.__get_object(identifier)["classes"]
|
||||||
10
src/image_prediction/model_loader/loaders/mlflow.py
Normal file
10
src/image_prediction/model_loader/loaders/mlflow.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
from image_prediction.model_loader.database.connector import DatabaseConnector
|
||||||
|
from image_prediction.redai_adapter.mlflow import MlflowModelReader
|
||||||
|
|
||||||
|
|
||||||
|
class MlflowConnector(DatabaseConnector):
|
||||||
|
def __init__(self, mlflow_reader: MlflowModelReader):
|
||||||
|
self.mlflow_reader = mlflow_reader
|
||||||
|
|
||||||
|
def get_object(self, run_id):
|
||||||
|
return self.mlflow_reader[run_id]
|
||||||
105 src/image_prediction/pipeline.py Normal file
@@ -0,0 +1,105 @@
import os
from functools import lru_cache, partial
from itertools import chain, tee
from typing import Iterable, Any

from funcy import rcompose, first, compose, second, chunks, identity, rpartial
from kn_utils.logging import logger
from tqdm import tqdm

from image_prediction.config import CONFIG
from image_prediction.default_objects import (
    get_formatter,
    get_mlflow_model_loader,
    get_image_classifier,
    get_extractor,
    get_encoder,
)
from image_prediction.locations import MLRUNS_DIR
from image_prediction.utils.generic import lift, starlift

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"


@lru_cache(maxsize=None)
def load_pipeline(**kwargs):
    logger.info(f"Loading pipeline with kwargs: {kwargs}")
    model_loader = get_mlflow_model_loader(MLRUNS_DIR)
    model_identifier = CONFIG.service.mlflow_run_id

    pipeline = Pipeline(model_loader, model_identifier, **kwargs)

    return pipeline


def parallel(*fs):
    return lambda *args: (f(a) for f, a in zip(fs, args))


def star(f):
    return lambda x: f(*x)


class Pipeline:
    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs):
        self.verbose = verbose

        extract = get_extractor(**kwargs)
        classifier = get_image_classifier(model_loader, model_identifier)
        reformat = get_formatter()
        represent = get_encoder()

        split = compose(star(parallel(*map(lift, (first, first, second)))), rpartial(tee, 3))
        classify = compose(chain.from_iterable, lift(classifier), partial(chunks, batch_size))
        pairwise_apply = compose(star, parallel)
        join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))

        #                        />--classify--\
        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
        #                        \>--identity--/

        self.pipe = rcompose(
            extract,  # ... image-metadata pairs as a stream
            split,  # ... into an image stream and a metadata stream
            pairwise_apply(classify, represent, identity),  # ... apply functions to the streams pairwise
            join,  # ... the streams by zipping
            reformat,  # ... the items
            filter_duplicates,  # ... filter out duplicate images
        )

    def __call__(self, pdf: bytes, page_range: range = None):
        yield from tqdm(
            self.pipe(pdf, page_range=page_range),
            desc="Processing images from document",
            unit=" images",
            disable=not self.verbose,
        )


def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
    """Filter out duplicate images based on `position` (image coordinates) and page number, preferring the one
    with `allPassed` set to True.

    See RED-10765 (RM-241), "Removed redactions reappear", for why this is necessary.
    """
    keep = dict()
    for image_meta in metadata:
        key: tuple[int, int, int, int, int] = (
            image_meta["position"]["x1"],
            image_meta["position"]["x2"],
            image_meta["position"]["y1"],
            image_meta["position"]["y2"],
            image_meta["position"]["pageNumber"],
        )
        if key in keep:
            logger.warning(
                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
            )
            if image_meta["filters"]["allPassed"]:
                logger.warning("Keeping the current image since its allPassed flag is set to True")
                keep[key] = image_meta
            else:
                logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
        else:
            keep[key] = image_meta

    yield from keep.values()
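A small usage sketch of filter_duplicates (editor's illustration, not part of this commit); the field names follow the keys the function reads, and the "id" field is only there to tell the two entries apart.

# Illustration only: filter_duplicates keeps one entry per (x1, x2, y1, y2, pageNumber),
# preferring the duplicate whose filters["allPassed"] is True.
from image_prediction.pipeline import filter_duplicates

position = {"x1": 10, "x2": 110, "y1": 20, "y2": 220, "pageNumber": 3}
items = [
    {"position": position, "filters": {"allPassed": False}, "id": "first"},
    {"position": position, "filters": {"allPassed": True}, "id": "second"},  # same spot, passes filters
]

kept = list(filter_duplicates(items))
assert len(kept) == 1 and kept[0]["id"] == "second"  # a warning is logged for the duplicate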
0 src/image_prediction/redai_adapter/__init__.py Normal file
Some files were not shown because too many files have changed in this diff.