Compare commits

..

No commits in common. "master" and "1.1.1" have entirely different histories.

143 changed files with 1131 additions and 42306 deletions

View File

@ -1,8 +1,6 @@
[core]
remote = azure_remote
remote = vector
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/image-prediction/
port = 22
['remote "azure_remote"']
url = azure://image-classification-dvc/

5
.gitignore vendored
View File

@ -1,8 +1,7 @@
.vscode/
*.h5
*venv
/venv/
.idea/
src/data
!.gitignore
*.project
@ -173,4 +172,4 @@ fabric.properties
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
# End of https://www.toptal.com/developers/gitignore/api/linux,pycharm

View File

@ -1,51 +0,0 @@
include:
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/dvc.gitlab-ci.yml"
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
variables:
NEXUS_PROJECT_DIR: red
IMAGENAME: "${CI_PROJECT_NAME}"
INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough try setting this to fastest or fast.
CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
# TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.
stages:
- data
- setup
- tests
- sonarqube
- versioning
- build
- integration-tests
- release
docker-build:
extends: .docker-build
needs:
- job: dvc-pull
artifacts: true
- !reference [.needs-versioning, needs] # leave this line as is
###################
# INTEGRATION TESTS
trigger-integration-tests:
extends: .integration-tests
# ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
# needs:
# - job: docker-build::model_name
# artifacts: true
rules:
- when: never
#########
# RELEASE
release:
extends: .release
needs:
- !reference [.needs-versioning, needs] # leave this line as is

View File

@ -1 +0,0 @@
3.10

View File

@ -1,73 +1,21 @@
FROM python:3.10-slim AS builder
FROM image-prediction-base
ARG GITLAB_USER
ARG GITLAB_ACCESS_TOKEN
WORKDIR /app/service
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
COPY src src
COPY data data
COPY image_prediction image_prediction
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY config.yaml config.yaml
COPY banner.txt banner.txt
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
# Install dependencies differing from base image.
RUN python3 -m pip install -r requirements.txt
ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
ARG VERSION=dev
LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
WORKDIR /app
###########
# ENV SETUP
ENV PYTHONDONTWRITEBYTECODE=true
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN apt-get update && \
apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN curl -sSL https://install.python-poetry.org | python3 -
RUN poetry --version
COPY pyproject.toml poetry.lock ./
RUN poetry config virtualenvs.create true && \
poetry config virtualenvs.in-project true && \
poetry config installer.max-workers 10 && \
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry install --without=dev -vv --no-interaction --no-root
###############
# WORKING IMAGE
FROM python:3.10-slim
WORKDIR /app
# COPY SOURCE CODE FROM BUILDER IMAGE
COPY --from=builder /app /app
# COPY BILL OF MATERIALS (BOM)
COPY bom.json /bom.json
ENV PATH="/app/.venv/bin:$PATH"
###################
# COPY SOURCE CODE
COPY ./src ./src
COPY ./config ./config
COPY ./data ./data
COPY banner.txt ./
RUN python3 -m pip install -e .
EXPOSE 5000
EXPOSE 8080
CMD [ "python", "src/serve.py"]
CMD ["python3", "src/serve.py"]

25
Dockerfile_base Normal file
View File

@ -0,0 +1,25 @@
# Builder stage: creates the virtual environment and installs the Python dependencies.
FROM python:3.8 AS builder1

# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"

# Upgrade pip.
RUN python -m pip install --upgrade pip

# Make a directory for the service files and copy only the requirements file into it.
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt

# Install dependencies.
RUN python3 -m pip install -r requirements.txt


# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.8

WORKDIR /app/
COPY --from=builder1 /app .
# Put the copied virtual environment first on PATH so `python`/`pip` resolve to it.
ENV PATH="/app/venv/bin:$PATH"

WORKDIR /app/service

View File

@ -1,40 +1,20 @@
FROM python:3.10
ARG BASE_ROOT="nexus.iqser.com:5001/red/"
ARG VERSION_TAG="dev"
ARG USERNAME
ARG TOKEN
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
ARG VERSION=dev
FROM ${BASE_ROOT}image-prediction:${VERSION_TAG}
LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
WORKDIR /app/service
WORKDIR /app
COPY src src
COPY data data
COPY image_prediction image_prediction
COPY setup.py setup.py
COPY requirements.txt requirements.txt
COPY config.yaml config.yaml
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN curl -sSL https://install.python-poetry.org | python3 -
COPY ./data ./data
COPY ./test ./test
COPY ./config ./config
COPY ./src ./src
COPY pyproject.toml poetry.lock banner.txt config.yaml./
RUN poetry config virtualenvs.create false && \
poetry config installer.max-workers 10 && \
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
poetry install --without=dev -vv --no-interaction --no-root
EXPOSE 5000
EXPOSE 8080
# Install module & dependencies
RUN python3 -m pip install -e .
RUN python3 -m pip install -r requirements.txt
RUN apt update --yes
RUN apt install vim --yes

View File

@ -2,11 +2,8 @@
Build base image
```bash
docker build -t image-classification-image --progress=plain --no-cache \
-f Dockerfile \
--build-arg USERNAME=$GITLAB_USER \
--build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
.
docker build -f Dockerfile_base -t image-prediction-base .
docker build -f Dockerfile -t image-prediction .
```
### Usage

40
bamboo-specs/pom.xml Normal file
View File

@ -0,0 +1,40 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>7.1.2</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<sonar.skip>true</sonar.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

View File

@ -0,0 +1,178 @@
package buildjob;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "image-prediction";

    private static final String SERVICE_NAME_BASE = "image-prediction-base";

    // Locale.ROOT keeps the derived key identical on every agent; the default-locale variant of
    // toUpperCase() maps 'i' to a dotted capital I under e.g. a Turkish locale.
    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase(java.util.Locale.ROOT).replaceAll("[-_]", "");

    private static final String DOCKER_SOCKET = "/var/run/docker.sock";

    /**
     * Run main to publish plan on Bamboo
     */
    public static void main(final String[] args) throws Exception {
        //By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");

        PlanSpec planSpec = new PlanSpec();

        Plan plan = planSpec.createDockerBuildPlan();
        bambooServer.publish(plan);

        PlanPermissions planPermission = planSpec.createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);
    }

    /**
     * Builds the permission set that is published for the given plan.
     *
     * @param planIdentifier identifier of the plan the permissions apply to
     * @return the plan permissions for the plan's project key and plan key
     */
    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
                .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .loggedInUserPermissions(PermissionType.VIEW)
                .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }

    /** Returns the Bamboo project ("RED") the plan belongs to. */
    private Project project() {
        return new Project()
                .name("RED")
                .key(new BambooKey("RED"));
    }

    /**
     * Assembles the complete plan: build, Sonar and licence/tag stages, linked repositories,
     * trigger and branch management.
     *
     * @return the plan definition
     */
    public Plan createDockerBuildPlan() {
        return new Plan(
                project(),
                SERVICE_NAME, new BambooKey(SERVICE_KEY))
                .description("Docker build for image-prediction.")
                .stages(buildStage(), sonarStage(), licenceStage())
                .linkedRepositories("RR / " + SERVICE_NAME)
                .linkedRepositories("RR / redai_image")
                .triggers(new BitbucketServerTrigger())
                .planBranchManagement(new PlanBranchManagement()
                        .createForVcsBranch()
                        .delete(new BranchCleanup()
                                .whenInactiveInRepositoryAfterDays(14))
                        .notificationForCommitters());
    }

    /** Stage that runs docker-build.sh with the service and base image names as arguments. */
    private Stage buildStage() {
        return new Stage("Build Stage")
                .jobs(
                        new Job("Build Job", new BambooKey("BUILD"))
                                .tasks(
                                        cleanWorkingDirectoryTask(),
                                        checkoutDefaultRepositoryTask(),
                                        sshSetupTask(),
                                        new ScriptTask()
                                                .description("Build Docker container.")
                                                .location(Location.FILE)
                                                .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
                                                .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
                                .dockerConfiguration(
                                        new DockerConfiguration()
                                                .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                                                .volume(DOCKER_SOCKET, DOCKER_SOCKET)));
    }

    /** Stage that runs sonar-scan.sh with the service name as argument. */
    private Stage sonarStage() {
        return new Stage("Sonar Stage")
                .jobs(
                        new Job("Sonar Job", new BambooKey("SONAR"))
                                .tasks(
                                        cleanWorkingDirectoryTask(),
                                        checkoutDefaultRepositoryTask(),
                                        sshSetupTask(),
                                        new ScriptTask()
                                                .description("Run Sonarqube scan.")
                                                .location(Location.FILE)
                                                .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
                                                .argument(SERVICE_NAME))
                                .dockerConfiguration(
                                        new DockerConfiguration()
                                                .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                                                .volume(DOCKER_SOCKET, DOCKER_SOCKET)));
    }

    /** Stage holding the git tag job and the (currently disabled) licence job. */
    private Stage licenceStage() {
        return new Stage("Licence Stage")
                .jobs(
                        new Job("Git Tag Job", new BambooKey("GITTAG"))
                                .tasks(
                                        checkoutDefaultRepositoryTask(),
                                        new ScriptTask()
                                                .description("Build git tag.")
                                                .location(Location.FILE)
                                                .fileFromPath("bamboo-specs/src/main/resources/scripts/git-tag.sh"),
                                        // git-tag.sh writes "gitTag=..." into git.tag; it is injected as ${bamboo.g.gitTag}.
                                        new InjectVariablesTask()
                                                .description("Inject git tag.")
                                                .path("git.tag")
                                                .namespace("g")
                                                .scope(InjectVariablesScope.LOCAL),
                                        new VcsTagTask()
                                                .description("${bamboo.g.gitTag}")
                                                .tagName("${bamboo.g.gitTag}")
                                                .defaultRepository())
                                .dockerConfiguration(
                                        new DockerConfiguration()
                                                .image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
                        new Job("Licence Job", new BambooKey("LICENCE"))
                                .enabled(false)
                                .tasks(
                                        checkoutDefaultRepositoryTask(),
                                        new ScriptTask()
                                                .description("Build licence.")
                                                .location(Location.FILE)
                                                .fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
                                .dockerConfiguration(
                                        new DockerConfiguration()
                                                .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                                                .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                                                .volume(DOCKER_SOCKET, DOCKER_SOCKET)));
    }

    /** Task that empties the working directory before checkout. */
    private CleanWorkingDirectoryTask cleanWorkingDirectoryTask() {
        return new CleanWorkingDirectoryTask()
                .description("Clean working directory.")
                .enabled(true);
    }

    /** Task that checks out the plan's default repository. */
    private VcsCheckoutTask checkoutDefaultRepositoryTask() {
        return new VcsCheckoutTask()
                .description("Checkout default repository.")
                .checkoutItems(new CheckoutItem().defaultRepository());
    }

    /**
     * Task that installs the agent's SSH key and the SSH client config for vector.iqser.com.
     * The key file is overwritten ('>') rather than appended to, so re-running the task in a
     * home directory that already holds a key cannot produce a concatenated, invalid key file.
     */
    private ScriptTask sshSetupTask() {
        return new ScriptTask()
                .description("Set config and keys.")
                .inlineBody("mkdir -p ~/.ssh\n" +
                        "echo \"${bamboo.bamboo_agent_ssh}\" | base64 -d > ~/.ssh/id_rsa\n" +
                        "echo \"host vector.iqser.com\" > ~/.ssh/config\n" +
                        "echo \" user bamboo-agent\" >> ~/.ssh/config\n" +
                        "chmod 600 ~/.ssh/config ~/.ssh/id_rsa");
    }
}

View File

@ -0,0 +1,19 @@
#!/bin/bash
set -e

# Sets the project version to the release tag and deploys it to the release repository.
# Nothing is done for "dev" builds.
if [[ "${bamboo_version_tag}" != "dev" ]]
then
    # Quoted so that paths containing spaces are passed to mvn as a single argument.
    mvn_bin="${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn"
    pom_file="${bamboo_build_working_directory}/pom.xml"

    "${mvn_bin}" \
        -f "${pom_file}" \
        versions:set \
        -DnewVersion="${bamboo_version_tag}"

    # NOTE(review): the maven.wagon.http.ssl.* flags below disable TLS certificate checks for the upload.
    "${mvn_bin}" \
        -f "${pom_file}" \
        -B clean deploy \
        -e -DdeployAtEnd=true \
        -Dmaven.wagon.http.ssl.insecure=true \
        -Dmaven.wagon.http.ssl.allowall=true \
        -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
        -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi

View File

@ -0,0 +1,20 @@
#!/bin/bash
set -e

# Usage: docker-build.sh SERVICE_NAME SERVICE_NAME_BASE
# Pulls the DVC-tracked data, builds the base and service images and pushes the service image.
# Fail early with a clear message instead of building an image with an empty name.
SERVICE_NAME=${1:?usage: docker-build.sh SERVICE_NAME SERVICE_NAME_BASE}
SERVICE_NAME_BASE=${2:?usage: docker-build.sh SERVICE_NAME SERVICE_NAME_BASE}

REGISTRY="nexus.iqser.com:5001"
IMAGE="${REGISTRY}/red/${SERVICE_NAME}:${bamboo_version_tag}"

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

# The [ssh] extra installs dvc itself together with its SSH remote support.
pip install 'dvc[ssh]'

echo "Pulling dvc data"
dvc pull

# NOTE(review): this appends registry credentials to pip.conf in the working directory.
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf

docker build -f Dockerfile_base -t "${SERVICE_NAME_BASE}" .
docker build -f Dockerfile -t "${IMAGE}" .

echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin "${REGISTRY}"
docker push "${IMAGE}"

View File

@ -0,0 +1,9 @@
#!/bin/bash
set -e

# Writes the tag name as "gitTag=<value>" into git.tag.
# "dev" builds are tagged <branch>_<build number>; every other build uses the version tag itself.
case "${bamboo_version_tag}" in
    dev)
        git_tag="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
        ;;
    *)
        git_tag="${bamboo_version_tag}"
        ;;
esac

echo "gitTag=${git_tag}" > git.tag

View File

@ -0,0 +1,57 @@
#!/bin/bash
set -e

# Usage: sonar-scan.sh SERVICE_NAME
# Runs the test suite, a dependency check and then a SonarQube scan (branch or pull-request mode).

export JAVA_HOME=/usr/bin/sonar-scanner/jre

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install dependency-check
python3 -m pip install coverage

echo "coverage report generation"

bash run_tests.sh

if [ ! -f reports/coverage.xml ]
then
    # Say why the build stops instead of exiting silently.
    echo "reports/coverage.xml not found after running run_tests.sh" >&2
    exit 1
fi

SERVICE_NAME=$1

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
    --disableAssembly -s . -o reports --project "$SERVICE_NAME" --exclude ".git/**" --exclude "venv/**" \
    --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"

# Properties shared by the branch scan and the pull-request scan.
sonar_args=(
    "-Dsonar.projectKey=RED_${SERVICE_NAME}"
    "-Dsonar.sources=image_prediction"
    "-Dsonar.host.url=https://sonarqube.iqser.com"
    "-Dsonar.login=${bamboo_sonarqube_api_token_secret}"
    "-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json"
    "-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml"
    "-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html"
    "-Dsonar.python.coverage.reportPaths=reports/coverage.xml"
)

if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    sonar_args+=("-Dsonar.branch.name=${bamboo_planRepository_1_branch}")
else
    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
    sonar_args+=(
        "-Dsonar.pullrequest.key=${bamboo_repository_pr_key}"
        "-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch}"
        "-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch}"
    )
fi

/usr/bin/sonar-scanner/bin/sonar-scanner "${sonar_args[@]}"

View File

@ -0,0 +1,16 @@
package buildjob;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;
/**
 * Offline validation of the plan defined in {@link PlanSpec}.
 */
public class PlanSpecTest {

    /**
     * Creates the plan and builds its entity properties; a PropertiesValidationException fails the test.
     */
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        final PlanSpec planSpec = new PlanSpec();
        EntityPropertiesBuilders.build(planSpec.createDockerBuildPlan());
    }
}

33697
bom.json

File diff suppressed because it is too large Load Diff

26
config.yaml Normal file
View File

@ -0,0 +1,26 @@
webserver:
  host: $SERVER_HOST|"127.0.0.1" # webserver address
  port: $SERVER_PORT|5000 # webserver port

service:
  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
  verbose: $VERBOSE|True # Service prints document processing progress to stdout
  batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from

# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
# NOTE(review): nesting reconstructed from the key names; confirm `filters` is a top-level key.
filters:
  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible
  image_width_to_height_quotient: # Image width to height ratio
    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible
  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence

View File

@ -1,68 +0,0 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true
[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"
[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"
[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"
[webserver]
host = "0.0.0.0"
port = 8080
[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too big, since queue interactions (like receiving new messages) only happen at these intervals
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "image_request_queue"
service_request_exchange_name = "image_request_exchange"
service_response_exchange_name = "image_response_exchange"
service_dlq_name = "image_dlq"
[storage]
backend = "s3"
[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"
[storage.azure]
container = "redaction"
connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
[kubernetes]
pod_name = "test_pod"

View File

@ -1,42 +0,0 @@
[logging]
level = "INFO"
[service]
# Print document processing progress to stdout
verbose = false
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"
# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5
# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75
[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0
# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10
# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4
[filters.overrides.logo.image_to_page_quotient]
min = 0.06

View File

@ -0,0 +1,40 @@
"""Implements a config object with dot-indexing syntax."""
from envyaml import EnvYAML
from image_prediction.locations import CONFIG_FILE
def _get_item_and_maybe_make_dotindexable(container, item):
    """Return ``container[item]``, wrapping it in a DotIndexable when it is a dict."""
    ret = container[item]
    return DotIndexable(ret) if isinstance(ret, dict) else ret


class DotIndexable:
    """Wraps a mapping so that its keys can be read as attributes (``obj.key``) or items (``obj["key"]``).

    Nested dicts are wrapped on access, so lookups can be chained (``obj.a.b``).
    A missing key raises KeyError for both access styles.
    """

    def __init__(self, x):
        self.x = x

    def __getattr__(self, item):
        # __getattr__ only runs when normal lookup fails. If `x` itself is not set yet (an instance
        # created by copy/pickle before its state is restored) reading `self.x` below would re-enter
        # this method forever. Special-method probes such as `__setstate__` must also report
        # "missing" via AttributeError so that hasattr() works.
        if item == "x" or (item.startswith("__") and item.endswith("__")):
            raise AttributeError(item)
        return _get_item_and_maybe_make_dotindexable(self.x, item)

    def __repr__(self):
        return self.x.__repr__()

    def __getitem__(self, item):
        # Item access goes straight to the mapping so that keys named like the guarded
        # attribute names above (e.g. "x") stay reachable.
        return _get_item_and_maybe_make_dotindexable(self.x, item)
class Config:
    """Read-only view on the YAML config file with attribute (``cfg.key``) and item (``cfg["key"]``) access.

    Top-level dict values are returned wrapped in DotIndexable. A key that is not present in the
    config yields None instead of raising.
    """

    def __init__(self, config_path):
        self.__config = EnvYAML(config_path)

    def __getattr__(self, item):
        # `self.__config` is stored under the mangled name `_Config__config`. If it is not set yet
        # (instance created without __init__, e.g. by copy/pickle) the lookup below would re-enter
        # this method without end, so report the attribute as missing instead.
        if item == "_Config__config":
            raise AttributeError(item)
        if item in self.__config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)
        # Unknown keys deliberately resolve to None.
        return None

    def __getitem__(self, item):
        return self.__getattr__(item)
CONFIG = Config(CONFIG_FILE)

View File

@ -3,7 +3,6 @@ from funcy import juxt
from image_prediction.classifier.classifier import Classifier
from image_prediction.classifier.image_classifier import ImageClassifier
from image_prediction.compositor.compositor import TransformerCompositor
from image_prediction.encoder.encoders.hash_encoder import HashEncoder
from image_prediction.estimator.adapter.adapter import EstimatorAdapter
from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
from image_prediction.formatter.formatters.enum import EnumFormatter
@ -37,7 +36,3 @@ def get_formatter():
PDFNetCoordinateTransformer(), EnumFormatter(), ResponseTransformer(), Snake2CamelCaseKeyFormatter()
)
return formatter
def get_encoder():
return HashEncoder()

View File

@ -32,11 +32,3 @@ class IntentionalTestException(RuntimeError):
class InvalidBox(Exception):
pass
class ParsingError(Exception):
pass
class BadXref(ValueError):
pass

View File

@ -1,20 +1,39 @@
import multiprocessing
import traceback
from typing import Callable
from flask import Flask, request, jsonify
from prometheus_client import generate_latest, CollectorRegistry, Summary
from image_prediction.utils import get_logger
from image_prediction.utils.process_wrapping import wrap_in_process
logger = get_logger()
def run_in_process(func):
    """Run `func` (called without arguments) in a new process and block until that process ends."""
    worker = multiprocessing.Process(target=func)
    worker.start()
    worker.join()
def wrap_in_process(func_to_wrap):
    """Return a wrapper that executes `func_to_wrap` in a child process.

    The wrapper forwards its arguments, waits for the child to finish and returns the value the
    call produced there. If the call raised, the traceback is logged and the wrapper returns None.
    """

    def build_function_and_run_in_process(*args, **kwargs):
        # The manager runs its own helper process; the `with` block shuts it down as soon as the
        # result has been read instead of leaving it alive until garbage collection.
        with multiprocessing.Manager() as manager:
            return_dict = manager.dict()

            def func():
                try:
                    return_dict["result"] = func_to_wrap(*args, **kwargs)
                except Exception:
                    # Only real errors are swallowed; SystemExit/KeyboardInterrupt still end the child.
                    logger.error(traceback.format_exc())

            run_in_process(func)

            # Must be read while the manager is still running.
            return return_dict.get("result", None)

    return build_function_and_run_in_process
def make_prediction_server(predict_fn: Callable):
app = Flask(__name__)
registry = CollectorRegistry(auto_describe=True)
metric = Summary(
f"redactmanager_imageClassification_seconds", f"Time spent on image-service classification.", registry=registry
)
@app.route("/ready", methods=["GET"])
def ready():
@ -35,7 +54,6 @@ def make_prediction_server(predict_fn: Callable):
@app.route("/predict", methods=["POST"])
@app.route("/", methods=["POST"])
@metric.time()
def predict():
# Tensorflow does not free RAM. Workaround: Run prediction function (which instantiates a model) in sub-process.
@ -53,8 +71,4 @@ def make_prediction_server(predict_fn: Callable):
logger.error("Analysis failed.")
return __failure()
@app.route("/prometheus", methods=["GET"])
def prometheus():
return generate_latest(registry=registry)
return app

View File

@ -0,0 +1,186 @@
import atexit
import io
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter
from typing import List
import fitz
from PIL import Image
from funcy import rcompose, merge, pluck, curry, compose
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
from image_prediction.utils import get_logger
from image_prediction.utils.generic import lift
logger = get_logger()
class ParsablePDFImageExtractor(ImageExtractor):
    """Extracts images together with their metadata from a PDF given as bytes."""

    def __init__(self, verbose=False, tolerance=0):
        """
        Args:
            verbose: Whether to show progressbar
            tolerance: The tolerance in pixels for the distance images beyond which they will not be stitched together
        """
        self.doc: fitz.fitz.Document = None
        self.verbose = verbose
        self.tolerance = tolerance

    def extract(self, pdf: bytes, page_range: range = None):
        """Lazily yield the image/metadata pairs of the document, page by page.

        Args:
            pdf: The PDF file content.
            page_range: Pages to process; when falsy (None or an empty range) all pages are processed.
        """
        self.doc = fitz.Document(stream=pdf)

        if page_range:
            pages = extract_pages(self.doc, page_range)
        else:
            pages = self.doc

        for page in pages:
            yield from self.__process_images_on_page(page)

    def __process_images_on_page(self, page: fitz.fitz.Page):
        """Yield the stitched image/metadata pairs found on `page`."""
        images = get_images_on_page(self.doc, page)
        metadata = get_metadata_for_images_on_page(self.doc, page)
        # Both objects above are lazy generators, so the caches are cleared here before they are consumed below.
        clear_caches()

        # Keep only pairs in which both the image and its metadata are truthy.
        pairs = [ImageMetadataPair(image, meta) for image, meta in zip(images, metadata) if image and meta]

        yield from stitch_pairs(pairs, tolerance=self.tolerance)
def extract_pages(doc, page_range):
    """Yield `doc.load_page(i)` for every i from `page_range.start + 1` up to and including `page_range.stop`.

    The step of `page_range` is not taken into account.
    """
    # NOTE(review): start and stop are both shifted by one before loading — confirm this matches
    # the page numbering expected by `doc.load_page`.
    for page_number in range(page_range.start + 1, page_range.stop + 1):
        yield doc.load_page(page_number)
@lru_cache(maxsize=None)
def get_images_on_page(doc, page: fitz.Page):
    """Yield the image (or None) for each image info entry on `page`, in entry order."""
    # NOTE(review): this is a generator function, so lru_cache stores the generator object itself;
    # a second call with the same arguments gets the same, possibly already consumed, generator.
    for image_info in get_image_infos(page):
        yield xref_to_image(doc, image_info["xref"])
def get_metadata_for_images_on_page(doc, page: fitz.Page):
    """Yield one metadata dict per image on `page`.

    Steps, applied lazily in this order: build box metadata, validate coordinates, drop tiny images,
    validate sizes, merge in page metadata, merge in alpha channel info.
    """
    box_metadata = map(get_image_metadata, get_image_infos(page))
    checked = validate_size_and_passthrough(
        filter_out_tiny_images(validate_coords_and_passthrough(box_metadata))
    )
    with_page_info = add_page_metadata(page, checked)

    yield from add_alpha_channel_info(doc, page, with_page_info)
@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    """Return `page.get_image_info(xrefs=True)`; the result is cached per page object."""
    image_infos = page.get_image_info(xrefs=True)
    return image_infos
@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
    """Open the image stored under `xref` as a PIL image; return None when nothing could be loaded."""
    image_handle = load_image_handle_from_xref(doc, xref)
    if not image_handle:
        return None

    return Image.open(io.BytesIO(image_handle["image"]))
def get_image_metadata(image_info):
    """Build the box metadata (width, height and corner coordinates) from the rounded `bbox` of `image_info`."""
    left, top, right, bottom = (rounder(coordinate) for coordinate in image_info["bbox"])

    return {
        Info.WIDTH: abs(right - left),
        Info.HEIGHT: abs(bottom - top),
        Info.X1: left,
        Info.X2: right,
        Info.Y1: top,
        Info.Y2: bottom,
    }
def validate_coords_and_passthrough(metadata):
    """Yield the result of `validate_box_coords` for each metadata entry."""
    for entry in metadata:
        yield validate_box_coords(entry)
def filter_out_tiny_images(metadata):
    """Yield only those metadata entries for which `tiny` is falsy."""
    for entry in metadata:
        if not tiny(entry):
            yield entry
def validate_size_and_passthrough(metadata):
    """Yield the result of `validate_box_size` for each metadata entry."""
    for entry in metadata:
        yield validate_box_size(entry)
def add_page_metadata(page, metadata):
    """Yield each metadata entry merged with the page metadata of `page`."""
    # The page metadata is computed once and reused for every entry.
    page_metadata = get_page_metadata(page)
    for entry in metadata:
        yield merge(page_metadata, entry)
def add_alpha_channel_info(doc, page, metadata):
    """Yield each metadata entry merged with an `{Info.ALPHA: ...}` mapping for the image at the same position."""

    # page -> "xref" value of every image info entry on the page
    page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
    # xref -> result of has_alpha_channel(doc, xref)
    xref_to_alpha = partial(has_alpha_channel, doc)
    # NOTE(review): `lift` is a project helper; presumably it applies a function element-wise — confirm.
    page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
    # alpha value -> {Info.ALPHA: alpha value}
    alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
    page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)

    # Alpha mappings and metadata entries are paired by position; zip stops at the shorter of the two.
    metadata = starmap(merge, zip(page_to_alpha_mapping_per_image(page), metadata))

    yield from metadata
@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
    """Return `doc.extract_image(xref)`; the result is cached per (doc, xref)."""
    image_handle = doc.extract_image(xref)
    return image_handle
rounder = rcompose(round, int)
def get_page_metadata(page):
    """Return the rounded media box size and the page number of ``page``."""
    width, height = (rounder(extent) for extent in page.mediabox_size)
    page_metadata = {
        Info.PAGE_WIDTH: width,
        Info.PAGE_HEIGHT: height,
        Info.PAGE_IDX: page.number,
    }
    return page_metadata
def has_alpha_channel(doc, xref):
    """Report whether the image stored under ``xref`` carries alpha information.

    If the extracted image handle has a truthy ``"smask"`` entry, the image
    counts as having alpha when the soft mask can be extracted or when a
    pixmap of the soft mask has an alpha channel. Without a soft mask, the
    pixmap of the image itself is inspected.

    Args:
        doc: The document containing the image.
        xref: Cross-reference number of the image.

    Returns:
        bool: ``True`` if alpha information was found, ``False`` otherwise,
        including the case where ``xref`` is rejected as invalid by
        ``fitz.Pixmap``.
    """
    maybe_image = load_image_handle_from_xref(doc, xref)
    maybe_smask = maybe_image["smask"] if maybe_image else None

    if maybe_smask:
        # Short-circuit: only build a pixmap for the soft mask when extracting
        # it did not already settle the answer. The previous ``any([...])``
        # evaluated both operands eagerly, so the pixmap was always built and
        # any error from it propagated even when the result was already known.
        return doc.extract_image(maybe_smask) is not None or bool(fitz.Pixmap(doc, maybe_smask).alpha)

    try:
        return bool(fitz.Pixmap(doc, xref).alpha)
    except ValueError:
        logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
        return False
def tiny(metadata):
    """Return ``True`` when width times height of the image is at most 4."""
    area = metadata[Info.WIDTH] * metadata[Info.HEIGHT]
    return area <= 4
def clear_caches():
    """Empty the caches of all memoised helpers of this module."""
    cached_functions = (
        get_image_infos,
        load_image_handle_from_xref,
        get_images_on_page,
        xref_to_image,
    )
    for cached_function in cached_functions:
        cached_function.cache_clear()


# Drop the cached pages, handles and images when the interpreter exits.
atexit.register(clear_caches)

View File

@ -12,4 +12,3 @@ class Info(Enum):
Y1 = "y1"
Y2 = "y2"
ALPHA = "alpha"
XREF = "xref"

View File

@ -0,0 +1,17 @@
"""Defines constant paths relative to the module root path."""
from pathlib import Path
# Directory containing this module.
MODULE_DIR = Path(__file__).resolve().parent

# Directory one level above the module directory.
PACKAGE_ROOT_DIR = MODULE_DIR.parent

# Files located directly in the package root.
CONFIG_FILE = PACKAGE_ROOT_DIR.joinpath("config.yaml")
BANNER_FILE = PACKAGE_ROOT_DIR.joinpath("banner.txt")

# Data locations; note that MLRUNS_DIR is a plain string, not a Path.
DATA_DIR = PACKAGE_ROOT_DIR.joinpath("data")
MLRUNS_DIR = str(DATA_DIR.joinpath("mlruns"))

TEST_DATA_DIR = PACKAGE_ROOT_DIR.joinpath("test", "data")

View File

@ -0,0 +1,64 @@
import os
from functools import partial
from itertools import chain, tee
from funcy import rcompose, first, compose, second, chunks, identity
from tqdm import tqdm
from image_prediction.config import CONFIG
from image_prediction.default_objects import get_formatter, get_mlflow_model_loader, get_image_classifier, get_extractor
from image_prediction.locations import MLRUNS_DIR
from image_prediction.utils.generic import lift, starlift
# Environment variable read by TensorFlow's native logging; level "3"
# presumably suppresses everything below fatal - confirm against the TF docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
def load_pipeline(**kwargs):
    """Build a ``Pipeline`` for the model run configured in ``CONFIG``.

    The model loader is created for ``MLRUNS_DIR`` and the run id is read from
    ``CONFIG.service.mlflow_run_id``; ``kwargs`` are forwarded to ``Pipeline``.
    """
    loader = get_mlflow_model_loader(MLRUNS_DIR)
    run_id = CONFIG.service.mlflow_run_id
    return Pipeline(loader, run_id, **kwargs)
def parallel(*fs):
    """Return a function applying the i-th function to its i-th argument.

    The returned function yields the results lazily as a generator; surplus
    functions or arguments are ignored, as with ``zip``.
    """

    def apply_pairwise(*args):
        return (fn(arg) for fn, arg in zip(fs, args))

    return apply_pairwise
def star(f):
    """Return a one-argument function that calls ``f`` with the argument unpacked."""

    def call_unpacked(x):
        return f(*x)

    return call_unpacked
class Pipeline:
    """Callable composing extraction, batched classification and formatting.

    Calling an instance with the bytes of a PDF lazily yields one formatted
    item per extracted image.
    """

    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
        """Assemble the processing pipe.

        Args:
            model_loader: Passed to ``get_image_classifier`` to load the model.
            model_identifier: Identifies the model to load via ``model_loader``.
            batch_size: Number of images handed to the classifier per chunk.
            verbose: When falsy, the progress bar in ``__call__`` is disabled.
            **kwargs: Forwarded to ``get_extractor``.
        """
        self.verbose = verbose

        extract = get_extractor(**kwargs)
        classifier = get_image_classifier(model_loader, model_identifier)
        reformat = get_formatter()

        # Duplicate the pair stream with ``tee`` and project one copy onto the
        # first and the other onto the second element of every pair.
        split = compose(star(parallel(*map(lift, (first, second)))), tee)
        # Chunk the image stream, classify chunk by chunk, flatten the results.
        classify = compose(chain.from_iterable, lift(classifier), partial(chunks, batch_size))
        # Apply one function per stream to a pair of streams.
        pairwise_apply = compose(star, parallel)
        # Zip predictions with metadata and merge each pair into a single dict.
        join = compose(starlift(lambda prd, mdt: {"classification": prd, **mdt}), star(zip))

        #                       +>--classify--v
        # --extract-->--split--|              |--join-->reformat
        #                       +>--identity--^

        self.pipe = rcompose(
            extract,  # ... image-metadata-pairs as a stream
            split,  # ... into an image stream and a metadata stream
            pairwise_apply(classify, identity),  # ... apply functions to the streams pairwise
            join,  # ... the streams by zipping
            reformat,  # ... the items
        )

    def __call__(self, pdf: bytes, page_range: range = None):
        """Yield the pipe's results for ``pdf``, wrapped in a tqdm progress bar.

        Args:
            pdf: The document as bytes.
            page_range: Forwarded to the pipe as ``page_range``; defaults to
                ``None``.
        """
        yield from tqdm(
            self.pipe(pdf, page_range=page_range),
            desc="Processing images from document",
            unit=" images",
            disable=not self.verbose,
        )

View File

@ -1,5 +1,4 @@
import math
from dynaconf import Dynaconf
from operator import itemgetter
from image_prediction.config import CONFIG
@ -16,45 +15,39 @@ class ResponseTransformer(Transformer):
def build_image_info(data: dict) -> dict:
def compute_geometric_quotient():
page_area_sqrt = math.sqrt(abs(page_width * page_height))
image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
return image_area_sqrt / page_area_sqrt
page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
"page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
)(data)
classification = data["classification"]
label = classification["label"]
representation = data["representation"]
geometric_quotient = round(compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1), 4)
min_image_to_page_quotient_breached = bool(
geometric_quotient < get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "min")
)
max_image_to_page_quotient_breached = bool(
geometric_quotient > get_class_specific_filter_value(label, CONFIG, "image_to_page_quotient", "max")
)
quotient = round(compute_geometric_quotient(), 4)
min_image_to_page_quotient_breached = bool(quotient < CONFIG.filters.image_to_page_quotient.min)
max_image_to_page_quotient_breached = bool(quotient > CONFIG.filters.image_to_page_quotient.max)
min_image_width_to_height_quotient_breached = bool(
width / height < get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "min")
width / height < CONFIG.filters.image_width_to_height_quotient.min
)
max_image_width_to_height_quotient_breached = bool(
width / height > get_class_specific_filter_value(label, CONFIG, "image_width_to_height_quotient", "max")
width / height > CONFIG.filters.image_width_to_height_quotient.max
)
min_confidence_breached = bool(
max(classification["probabilities"].values())
< get_class_specific_filter_value(label, CONFIG, "confidence", "min")
)
classification = data["classification"]
min_confidence_breached = bool(max(classification["probabilities"].values()) < CONFIG.filters.min_confidence)
image_info = {
"classification": classification,
"representation": representation,
"position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": data["page_idx"] + 1},
"geometry": {"width": width, "height": height},
"alpha": alpha,
"filters": {
"geometry": {
"imageSize": {
"quotient": geometric_quotient,
"quotient": quotient,
"tooLarge": max_image_to_page_quotient_breached,
"tooSmall": min_image_to_page_quotient_breached,
},
@ -78,23 +71,3 @@ def build_image_info(data: dict) -> dict:
}
return image_info
def compute_geometric_quotient(page_width, page_height, x2, x1, y2, y1):
    """Return the square root of the image area over the square root of the page area."""
    image_area = abs(x2 - x1) * abs(y2 - y1)
    page_area = abs(page_width * page_height)
    return math.sqrt(image_area) / math.sqrt(page_area)
def get_class_specific_filter_value(label: str, settings: Dynaconf, filter_type: str, bound: str = None):
    """Look up a filter value, preferring a label-specific override.

    Args:
        label: Class label whose override is looked up under
            ``settings.filters.overrides``.
        settings: Settings object holding the ``filters`` section.
        filter_type: Name of the filter within ``filters``.
        bound: Optional sub-key of the filter (e.g. ``"min"``/``"max"``). When
            omitted, the filter entry itself is returned.

    Returns:
        The override for ``label`` if one exists, otherwise the general value
        from ``settings.filters``.

    Raises:
        KeyError: If the general value is missing from ``settings.filters`` too.
    """
    try:
        override = settings.filters.overrides[label][filter_type]
        value = override[bound] if bound else override
    except KeyError:
        # No override for this label: fall back to the general filter value.
        # The previous fallback always indexed with ``bound``, which broke the
        # bound-less lookup that the override branch supports.
        general = settings.filters[filter_type]
        return general[bound] if bound else general

    logger.warning(f"Using {label=} specific {bound=} {filter_type=} {value=}.")
    return value

View File

@ -4,7 +4,8 @@ from image_prediction.locations import BANNER_FILE
def show_banner():
banner = load_banner()
with open(BANNER_FILE) as f:
banner = "\n" + "".join(f.readlines()) + "\n"
logger = logging.getLogger(__name__)
logger.propagate = False
@ -18,9 +19,3 @@ def show_banner():
logger.addHandler(handler)
logger.info(banner)
def load_banner():
    """Return the contents of ``BANNER_FILE`` framed by a leading and trailing newline."""
    with open(BANNER_FILE) as banner_file:
        contents = banner_file.read()
    return f"\n{contents}\n"

View File

@ -0,0 +1,15 @@
from itertools import starmap
from funcy import iterate, first, curry, map
def until(cond, func, *args, **kwargs):
    """Return the first value produced by ``iterate(func, ...)`` that satisfies ``cond``."""
    candidates = iterate(func, *args, **kwargs)
    return first(filter(cond, candidates))
def lift(fn):
    """Partially apply the curried ``map`` to ``fn``.

    The result is presumably a function that maps ``fn`` over the iterable it
    is called with - confirm against ``curry``'s arity detection.
    """
    curried_map = curry(map)
    return curried_map(fn)
def starlift(fn):
    """Partially apply the curried ``starmap`` to ``fn``.

    The result is presumably a function that star-maps ``fn`` over the
    iterable of argument tuples it is called with - confirm against
    ``curry``'s arity detection.
    """
    curried_starmap = curry(starmap)
    return curried_starmap(fn)

View File

@ -0,0 +1,27 @@
import logging
from image_prediction.config import CONFIG
def make_logger_getter():
    """Configure the ``"imclf"`` logger once and return a getter for it.

    The logger does not propagate to its ancestors and writes through a single
    stream handler; both use the level from ``CONFIG.service.logging_level``.
    """
    level = CONFIG.service.logging_level

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(level)
    stream_handler.setFormatter(
        logging.Formatter("%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    )

    logger = logging.getLogger("imclf")
    logger.propagate = False
    logger.addHandler(stream_handler)
    logger.setLevel(level)

    def get_logger():
        return logger

    return get_logger


# Module-level accessor for the configured logger.
get_logger = make_logger_getter()

View File

@ -56,8 +56,7 @@ def annotate_image(doc, image_info):
def init():
PDFNet.Initialize(
# "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
"Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
"Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
)

0
incl/__init__.py Normal file
View File

7267
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -1,73 +0,0 @@
[tool.poetry]
name = "image-classification-service"
version = "2.17.0"
description = ""
authors = ["Team Research <research@knecon.com>"]
readme = "README.md"
packages = [{ include = "image_prediction", from = "src" }]
[tool.poetry.dependencies]
python = ">=3.10,<3.11"
# FIXME: This should be recent pyinfra, but the recent protobuf packages are not compatible with tensorflow 2.9.0, also
# see RED-9948.
pyinfra = { version = "3.4.2", source = "gitlab-research" }
kn-utils = { version = ">=0.4.0", source = "gitlab-research" }
dvc = "^2.34.0"
dvc-ssh = "^2.20.0"
dvc-azure = "^2.21.2"
Flask = "^2.1.1"
requests = "^2.27.1"
iteration-utilities = "^0.11.0"
waitress = "^2.1.1"
envyaml = "^1.10.211231"
dependency-check = "^0.6.0"
mlflow = "^1.24.0"
numpy = "^1.22.3"
tqdm = "^4.64.0"
pandas = "^1.4.2"
# FIXME: Our current model's prediction behaviour changes significantly with newer tensorflow (/ protobuf)
# versions, which are introduced by pyinfra updates that require newer protobuf versions; see RED-9948.
tensorflow = "2.9.0"
protobuf = "^3.20"
pytest = "^7.1.0"
funcy = "^2"
PyMuPDF = "^1.19.6"
fpdf = "^1.7.2"
coverage = "^6.3.2"
Pillow = "^9.1.0"
pdf2image = "^1.16.0"
frozendict = "^2.3.0"
fsspec = "^2022.11.0"
PyMonad = "^2.4.0"
pdfnetpython3 = "9.4.2"
loguru = "^0.7.0"
cyclonedx-bom = "^4.5.0"
[tool.poetry.group.dev.dependencies]
pytest = "^7.0.1"
pymonad = "^2.4.0"
pylint = "^2.17.4"
ipykernel = "^6.23.2"
[tool.pytest.ini_options]
testpaths = ["test"]
addopts = "--ignore=data"
filterwarnings = ["ignore:.*:DeprecationWarning"]
[[tool.poetry.source]]
name = "PyPI"
priority = "primary"
[[tool.poetry.source]]
name = "gitlab-research"
url = "https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi/simple"
priority = "explicit"
[[tool.poetry.source]]
name = "gitlab-red"
url = "https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi/simple"
priority = "explicit"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

5
pytest.ini Normal file
View File

@ -0,0 +1,5 @@
; pytest configuration, read by pytest from the repository root.
[pytest]
; Directories pytest must not recurse into during test collection.
norecursedirs = incl
; One filter per indented continuation line.
filterwarnings =
    ignore:.*:DeprecationWarning

Some files were not shown because too many files have changed in this diff Show More