Compare commits
3 Commits
master ... release/3.

| Author | SHA1 | Date |
|---|---|---|
|  | 9292c1f6c6 |  |
|  | 788af1df62 |  |
|  | 097479bc38 |  |
@@ -1,8 +1,6 @@
 [core]
-    remote = azure_remote
+    remote = vector
     autostage = true
 ['remote "vector"']
     url = ssh://vector.iqser.com/research/image-prediction/
     port = 22
-['remote "azure_remote"']
-    url = azure://image-classification-dvc/
.gitignore (vendored, 5 changes)
@@ -1,8 +1,7 @@
 .vscode/
 *.h5
-*venv
+/venv/
 .idea/
-src/data
 
 !.gitignore
 *.project
@@ -173,4 +172,4 @@ fabric.properties
 # https://plugins.jetbrains.com/plugin/12206-codestream
 .idea/codestream.xml
 
 # End of https://www.toptal.com/developers/gitignore/api/linux,pycharm
@@ -1,51 +0,0 @@
include:
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/dvc.gitlab-ci.yml"
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"

variables:
  NEXUS_PROJECT_DIR: red
  IMAGENAME: "${CI_PROJECT_NAME}"
  INTEGRATION_TEST_FILE: "${CI_PROJECT_ID}.pdf"
  FF_USE_FASTZIP: "true" # enable fastzip - a faster zip implementation that also supports level configuration.
  ARTIFACT_COMPRESSION_LEVEL: default # can also be set to fastest, fast, slow and slowest. If just enabling fastzip is not enough, try setting this to fastest or fast.
  CACHE_COMPRESSION_LEVEL: default # same as above, but for caches
  # TRANSFER_METER_FREQUENCY: 5s # will display transfer progress every 5 seconds for artifacts and remote caches. For debugging purposes.

stages:
  - data
  - setup
  - tests
  - sonarqube
  - versioning
  - build
  - integration-tests
  - release

docker-build:
  extends: .docker-build
  needs:
    - job: dvc-pull
      artifacts: true
    - !reference [.needs-versioning, needs] # leave this line as is

###################
# INTEGRATION TESTS
trigger-integration-tests:
  extends: .integration-tests
  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
  # needs:
  #   - job: docker-build::model_name
  #     artifacts: true
  rules:
    - when: never

#########
# RELEASE
release:
  extends: .release
  needs:
    - !reference [.needs-versioning, needs] # leave this line as is
.gitmodules (vendored, new file, 6 lines)
@@ -0,0 +1,6 @@
[submodule "incl/pyinfra"]
    path = incl/pyinfra
    url = ssh://git@git.iqser.com:2222/rr/pyinfra.git
[submodule "incl/pdf2image"]
    path = incl/pdf2image
    url = ssh://git@git.iqser.com:2222/rr/pdf2image.git
@@ -1 +0,0 @@
-3.10
Dockerfile (84 changes)
@@ -1,73 +1,27 @@
-FROM python:3.10-slim AS builder
+FROM image-prediction-base
 
-ARG GITLAB_USER
-ARG GITLAB_ACCESS_TOKEN
+WORKDIR /app/service
 
-ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
-ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
-ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
-ARG POETRY_SOURCE_REF_RED=gitlab-red
-ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
-ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
-ARG VERSION=dev
+COPY src src
+COPY incl/pyinfra incl/pyinfra
+COPY incl/pdf2image incl/pdf2image
+COPY data data
+COPY image_prediction image_prediction
+COPY setup.py setup.py
+COPY requirements.txt requirements.txt
+COPY config.yaml config.yaml
+COPY banner.txt banner.txt
 
-LABEL maintainer="Research <research@knecon.com>"
-LABEL version="${VERSION}"
-
-WORKDIR /app
-
-###########
-# ENV SETUP
-ENV PYTHONDONTWRITEBYTECODE=true
-ENV PYTHONUNBUFFERED=true
-ENV POETRY_HOME=/opt/poetry
-ENV PATH="$POETRY_HOME/bin:$PATH"
-
-RUN apt-get update && \
-    apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -sSL https://install.python-poetry.org | python3 -
-RUN poetry --version
-
-COPY pyproject.toml poetry.lock ./
-
-RUN poetry config virtualenvs.create true && \
-    poetry config virtualenvs.in-project true && \
-    poetry config installer.max-workers 10 && \
-    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
-    poetry install --without=dev -vv --no-interaction --no-root
-
-###############
-# WORKING IMAGE
-FROM python:3.10-slim
-
-WORKDIR /app
-
-# COPY SOURCE CODE FROM BUILDER IMAGE
-COPY --from=builder /app /app
-# COPY BILL OF MATERIALS (BOM)
-COPY bom.json /bom.json
-
-ENV PATH="/app/.venv/bin:$PATH"
-
-###################
-# COPY SOURCE CODE
-COPY ./src ./src
-COPY ./config ./config
-COPY ./data ./data
-COPY banner.txt ./
+# Install dependencies differing from base image.
+RUN python3 -m pip install -r requirements.txt
+RUN python3 -m pip install -r incl/pyinfra/requirements.txt
+RUN python3 -m pip install -r incl/pdf2image/requirements.txt
+
+RUN python3 -m pip install -e .
+RUN python3 -m pip install -e incl/pyinfra
+RUN python3 -m pip install -e incl/pdf2image
 
 EXPOSE 5000
 EXPOSE 8080
 
-CMD [ "python", "src/serve.py"]
+CMD ["python3", "src/serve.py"]
Dockerfile_base (new file, 25 lines)
@@ -0,0 +1,25 @@
FROM python:3.8 as builder1

# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"

# Upgrade pip.
RUN python -m pip install --upgrade pip

# Make a directory for the service files and copy the service repo into the container.
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt

# Install dependencies.
RUN python3 -m pip install -r requirements.txt

# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.8

WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"

WORKDIR /app/service
@@ -1,40 +1,28 @@
-FROM python:3.10
+ARG BASE_ROOT="nexus.iqser.com:5001/red/"
+ARG VERSION_TAG="dev"
 
-ARG USERNAME
-ARG TOKEN
-ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
-ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
-ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
-ARG POETRY_SOURCE_REF_RED=gitlab-red
-ARG VERSION=dev
+FROM ${BASE_ROOT}image-prediction:${VERSION_TAG}
 
-LABEL maintainer="Research <research@knecon.com>"
-LABEL version="${VERSION}"
+WORKDIR /app/service
 
-WORKDIR /app
+COPY src src
+COPY incl/pyinfra incl/pyinfra
+COPY incl/pdf2image incl/pdf2image
+COPY data data
+COPY image_prediction image_prediction
+COPY setup.py setup.py
+COPY requirements.txt requirements.txt
+COPY config.yaml config.yaml
+COPY banner.txt banner.txt
 
-ENV PYTHONUNBUFFERED=true
-ENV POETRY_HOME=/opt/poetry
-ENV PATH="$POETRY_HOME/bin:$PATH"
+# Install module & dependencies
+RUN python3 -m pip install -r requirements.txt
+RUN python3 -m pip install -r incl/pyinfra/requirements.txt
+RUN python3 -m pip install -r incl/pdf2image/requirements.txt
 
-RUN curl -sSL https://install.python-poetry.org | python3 -
-
-COPY ./data ./data
-COPY ./test ./test
-COPY ./config ./config
-COPY ./src ./src
-COPY pyproject.toml poetry.lock banner.txt config.yaml ./
-
-RUN poetry config virtualenvs.create false && \
-    poetry config installer.max-workers 10 && \
-    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${USERNAME} ${TOKEN} && \
-    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
-    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${USERNAME} ${TOKEN} && \
-    poetry install --without=dev -vv --no-interaction --no-root
-
-EXPOSE 5000
-EXPOSE 8080
+RUN python3 -m pip install -e .
+RUN python3 -m pip install -e incl/pyinfra
+RUN python3 -m pip install -e incl/pdf2image
 
 RUN apt update --yes
 RUN apt install vim --yes
@@ -2,11 +2,8 @@
 
 Build base image
 ```bash
-docker build -t image-classification-image --progress=plain --no-cache \
-    -f Dockerfile \
-    --build-arg USERNAME=$GITLAB_USER \
-    --build-arg TOKEN=$GITLAB_ACCESS_TOKEN \
-    .
+docker build -f Dockerfile_base -t image-prediction-base .
+docker build -f Dockerfile -t image-prediction .
 ```
 
 ### Usage
|||||||
40
bamboo-specs/pom.xml
Normal file
40
bamboo-specs/pom.xml
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<groupId>com.atlassian.bamboo</groupId>
|
||||||
|
<artifactId>bamboo-specs-parent</artifactId>
|
||||||
|
<version>7.1.2</version>
|
||||||
|
<relativePath/>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<artifactId>bamboo-specs</artifactId>
|
||||||
|
<version>1.0.0-SNAPSHOT</version>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<sonar.skip>true</sonar.skip>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.atlassian.bamboo</groupId>
|
||||||
|
<artifactId>bamboo-specs-api</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.atlassian.bamboo</groupId>
|
||||||
|
<artifactId>bamboo-specs</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<!-- Test dependencies -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<!-- run 'mvn test' to perform offline validation of the plan -->
|
||||||
|
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
|
||||||
|
</project>
|
||||||
bamboo-specs/src/main/java/buildjob/PlanSpec.java (new file, 180 lines)
@@ -0,0 +1,180 @@
package buildjob;

import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;

import java.time.LocalTime;

import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;

/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "image-prediction";
    private static final String SERVICE_NAME_BASE = "image-prediction-base";

    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "").replaceAll("_", "");

    /**
     * Run main to publish plan on Bamboo
     */
    public static void main(final String[] args) throws Exception {
        // By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");

        Plan plan = new PlanSpec().createDockerBuildPlan();
        bambooServer.publish(plan);
        PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);

        Plan secPlan = new PlanSpec().createSecBuild();
        bambooServer.publish(secPlan);
        PlanPermissions secPlanPermission = new PlanSpec().createPlanPermission(secPlan.getIdentifier());
        bambooServer.publish(secPlanPermission);
    }

    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
            .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .loggedInUserPermissions(PermissionType.VIEW)
            .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }

    private Project project() {
        return new Project()
            .name("RED")
            .key(new BambooKey("RED"));
    }

    public Plan createDockerBuildPlan() {
        return new Plan(
            project(),
            SERVICE_NAME, new BambooKey(SERVICE_KEY))
            .description("Docker build for image-prediction.")
            .stages(
                new Stage("Build Stage")
                    .jobs(
                        new Job("Build Job", new BambooKey("BUILD"))
                            .tasks(
                                new CleanWorkingDirectoryTask()
                                    .description("Clean working directory.")
                                    .enabled(true),
                                new VcsCheckoutTask()
                                    .description("Checkout default repository.")
                                    .checkoutItems(new CheckoutItem().defaultRepository()),
                                new ScriptTask()
                                    .description("Set config and keys.")
                                    .location(Location.FILE)
                                    .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                                new ScriptTask()
                                    .description("Build Docker container.")
                                    .location(Location.FILE)
                                    .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
                                    .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE),
                                new InjectVariablesTask()
                                    .description("Inject git tag.")
                                    .path("git.tag")
                                    .namespace("g")
                                    .scope(InjectVariablesScope.LOCAL),
                                new VcsTagTask()
                                    .description("${bamboo.g.gitTag}")
                                    .tagName("${bamboo.g.gitTag}")
                                    .defaultRepository())
                            .dockerConfiguration(
                                new DockerConfiguration()
                                    .image("nexus.iqser.com:5001/infra/release_build:4.5.0")
                                    .volume("/var/run/docker.sock", "/var/run/docker.sock")),
                        new Job("Licence Job", new BambooKey("LICENCE"))
                            .enabled(false)
                            .tasks(
                                new VcsCheckoutTask()
                                    .description("Checkout default repository.")
                                    .checkoutItems(new CheckoutItem().defaultRepository()),
                                new ScriptTask()
                                    .description("Build licence.")
                                    .location(Location.FILE)
                                    .fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
                            .dockerConfiguration(
                                new DockerConfiguration()
                                    .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                                    .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                                    .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            .linkedRepositories("RR / redai_image")
            .triggers(
                new BitbucketServerTrigger())
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranch()
                    .delete(
                        new BranchCleanup()
                            .whenInactiveInRepositoryAfterDays(14))
                    .notificationForCommitters());
    }

    public Plan createSecBuild() {
        return new Plan(project(), SERVICE_NAME + "-Sec", new BambooKey(SERVICE_KEY + "SEC")).description("Security Analysis Plan")
            .stages(new Stage("Default Stage").jobs(
                new Job("Sonar Job", new BambooKey("SONAR"))
                    .tasks(
                        new CleanWorkingDirectoryTask()
                            .description("Clean working directory.")
                            .enabled(true),
                        new VcsCheckoutTask()
                            .description("Checkout default repository.")
                            .checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask()
                            .description("Set config and keys.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                        new ScriptTask()
                            .description("Run Sonarqube scan.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
                            .argument(SERVICE_NAME))
                    .dockerConfiguration(
                        new DockerConfiguration()
                            .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                            .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            .triggers(
                new ScheduledTrigger()
                    .scheduleOnceDaily(LocalTime.of(23, 00)))
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranchMatching("release.*")
                    .notificationForCommitters());
    }
}
bamboo-specs/src/main/resources/scripts/create-licence.sh (new executable file, 19 lines)
@@ -0,0 +1,19 @@
#!/bin/bash
set -e

if [[ "${bamboo_version_tag}" != "dev" ]]
then
  ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
    -f ${bamboo_build_working_directory}/pom.xml \
    versions:set \
    -DnewVersion=${bamboo_version_tag}

  ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
    -f ${bamboo_build_working_directory}/pom.xml \
    -B clean deploy \
    -e -DdeployAtEnd=true \
    -Dmaven.wagon.http.ssl.insecure=true \
    -Dmaven.wagon.http.ssl.allowall=true \
    -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
    -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi
bamboo-specs/src/main/resources/scripts/docker-build.sh (new executable file, 60 lines)
@@ -0,0 +1,60 @@
#!/bin/bash
set -e

SERVICE_NAME=$1
SERVICE_NAME_BASE=$2
if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
  branchVersion=$(cat version.yaml | grep -Eo "version: .*" | sed -s 's|version: \(.*\)\..*\..*|\1|g')
  latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
  newVersion="$(semver $latestVersion -p -i minor)"
  echo "new release on master with version $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
  # branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
  # latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
  # newVersion="$(semver $latestVersion -p -i patch)"
  # FIXME: obviously not the best solution
  newVersion="1.16.1"
  echo "new release on $bamboo_planRepository_branchName with version $newVersion"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
  newVersion="${bamboo_version_tag}"
  echo "new special version bild with $newVersion"
else
  newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
  echo "gitTag=${newVersion}" > git.tag
  dev_tag="dev"
  echo "dev build with tag $dev_tag"
  python3 -m venv build_venv
  source build_venv/bin/activate
  python3 -m pip install --upgrade pip

  pip install dvc
  pip install 'dvc[ssh]'
  dvc pull

  echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
  echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
  docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
  docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag .
  docker push nexus.iqser.com:5001/red/$SERVICE_NAME:$dev_tag
  exit 0
fi

echo "gitTag=${newVersion}" > git.tag

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
echo "Pulling dvc data"
dvc pull

echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile_base -t $SERVICE_NAME_BASE .
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}
bamboo-specs/src/main/resources/scripts/key-prepare.sh (new executable file, 8 lines)
@@ -0,0 +1,8 @@
#!/bin/bash
set -e

mkdir -p ~/.ssh
echo "${bamboo_agent_ssh}" | base64 -d >> ~/.ssh/id_rsa
echo "host vector.iqser.com" > ~/.ssh/config
echo " user bamboo-agent" >> ~/.ssh/config
chmod 600 ~/.ssh/config ~/.ssh/id_rsa
bamboo-specs/src/main/resources/scripts/sonar-scan.sh (new executable file, 57 lines)
@@ -0,0 +1,57 @@
#!/bin/bash
set -e

export JAVA_HOME=/usr/bin/sonar-scanner/jre

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install dependency-check
python3 -m pip install coverage

echo "coverage report generation"

bash run_tests.sh

if [ ! -f reports/coverage.xml ]
then
  exit 1
fi

SERVICE_NAME=$1

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
  --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
  --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"

if [[ -z "${bamboo_repository_pr_key}" ]]
then
  echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
  /usr/bin/sonar-scanner/bin/sonar-scanner \
    -Dsonar.projectKey=RED_$SERVICE_NAME \
    -Dsonar.sources=image_prediction \
    -Dsonar.host.url=https://sonarqube.iqser.com \
    -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
    -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
    -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
    -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
    -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
    -Dsonar.python.coverage.reportPaths=reports/coverage.xml

else
  echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
  /usr/bin/sonar-scanner/bin/sonar-scanner \
    -Dsonar.projectKey=RED_$SERVICE_NAME \
    -Dsonar.sources=image_prediction \
    -Dsonar.host.url=https://sonarqube.iqser.com \
    -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
    -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
    -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
    -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
    -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
    -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
    -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
    -Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi
bamboo-specs/src/test/java/buildjob/PlanSpecTest.java (new file, 21 lines)
@@ -0,0 +1,21 @@
package buildjob;


import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;

public class PlanSpecTest {
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        Plan plan = new PlanSpec().createDockerBuildPlan();
        EntityPropertiesBuilders.build(plan);
    }

    @Test
    public void checkYourSecPlanOffline() throws PropertiesValidationException {
        Plan secPlan = new PlanSpec().createSecBuild();
        EntityPropertiesBuilders.build(secPlan);
    }
}
config.yaml (new file, 26 lines)
@@ -0,0 +1,26 @@
webserver:
  host: $SERVER_HOST|"127.0.0.1" # webserver address
  port: $SERVER_PORT|5000 # webserver port

service:
  logging_level: $LOGGING_LEVEL_ROOT|INFO # Logging level for service logger
  verbose: $VERBOSE|True # Service prints document processing progress to stdout
  batch_size: $BATCH_SIZE|16 # Number of images in memory simultaneously
  mlflow_run_id: $MLFLOW_RUN_ID|fabfb1f192c745369b88cab34471aba7 # The ID of the mlflow run to load the service_estimator from


# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
filters:

  image_to_page_quotient: # Image size to page size ratio (ratio of geometric means of areas)
    min: $MIN_REL_IMAGE_SIZE|0.05 # Minimum permissible
    max: $MAX_REL_IMAGE_SIZE|0.75 # Maximum permissible

  image_width_to_height_quotient: # Image width to height ratio
    min: $MIN_IMAGE_FORMAT|0.1 # Minimum permissible
    max: $MAX_IMAGE_FORMAT|10 # Maximum permissible

  min_confidence: $MIN_CONFIDENCE|0.5 # Minimum permissible prediction confidence
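The `$VAR|default` entries above follow the substitution syntax of EnvYAML, which the new `image_prediction/config.py` (further down in this diff) uses to load this file: a value is taken from the named environment variable when it is set and falls back to the literal after the pipe otherwise. A minimal sketch of how an override behaves, assuming EnvYAML's standard loading behaviour; the surrounding script is illustrative only:

```python
import os
from envyaml import EnvYAML

# BATCH_SIZE is declared in config.yaml as "$BATCH_SIZE|16"; exporting it
# before loading should override the default of 16.
os.environ["BATCH_SIZE"] = "32"

config = EnvYAML("config.yaml")            # resolves $VAR|default entries on load
print(config["service"]["batch_size"])     # expected: 32 (16 when BATCH_SIZE is unset)
```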
@@ -1,68 +0,0 @@
[asyncio]
max_concurrent_tasks = 10

[dynamic_tenant_queues]
enabled = true

[metrics.prometheus]
enabled = true
prefix = "redactmanager_image_service"

[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Excpects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"

[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_image_service"
exporter = "otlp"

[webserver]
host = "0.0.0.0"
port = 8080

[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divider of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"

tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds

service_request_queue_prefix = "image_request_queue"
service_request_exchange_name = "image_request_exchange"
service_response_exchange_name = "image_response_exchange"
service_dlq_name = "image_dlq"

[storage]
backend = "s3"

[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"

[storage.azure]
container = "redaction"
connection_string = ""

[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

[kubernetes]
pod_name = "test_pod"
@@ -1,42 +0,0 @@
[logging]
level = "INFO"

[service]
# Print document processing progress to stdout
verbose = false
batch_size = 6
image_stiching_tolerance = 1 # in pixels
mlflow_run_id = "fabfb1f192c745369b88cab34471aba7"

# These variables control filters that are applied to either images, image metadata or service_estimator predictions.
# The filter result values are reported in the service responses. For convenience the response to a request contains a
# "filters.allPassed" field, which is set to false if any of the values returned by the filters did not meet its
# specified required value.
[filters.confidence]
# Minimum permissible prediction confidence
min = 0.5

# Image size to page size ratio (ratio of geometric means of areas)
[filters.image_to_page_quotient]
min = 0.05
max = 0.75

[filters.is_scanned_page]
# Minimum permissible image to page ratio tolerance for a page to be considered scanned.
# This is only used for filtering small images on scanned pages and is applied before processing the image, therefore
# superseding the image_to_page_quotient filter that only applies a tag to the image after processing.
tolerance = 0

# Image width to height ratio
[filters.image_width_to_height_quotient]
min = 0.1
max = 10

# put class specific filters here ['signature', 'formula', 'logo']
[filters.overrides.signature.image_to_page_quotient]
max = 0.4

[filters.overrides.logo.image_to_page_quotient]
min = 0.06
image_prediction/config.py (new file, 46 lines)
@@ -0,0 +1,46 @@
"""Implements a config object with dot-indexing syntax."""


from envyaml import EnvYAML

from image_prediction.locations import CONFIG_FILE


def _get_item_and_maybe_make_dotindexable(container, item):
    ret = container[item]
    return DotIndexable(ret) if isinstance(ret, dict) else ret


class DotIndexable:
    def __init__(self, x):
        self.x = x

    def get(self, item, default=None):
        try:
            return _get_item_and_maybe_make_dotindexable(self.x, item)
        except KeyError:
            return default

    def __getattr__(self, item):
        return _get_item_and_maybe_make_dotindexable(self.x, item)

    def __repr__(self):
        return self.x.__repr__()

    def __getitem__(self, item):
        return self.__getattr__(item)


class Config:
    def __init__(self, config_path):
        self.__config = EnvYAML(config_path)

    def __getattr__(self, item):
        if item in self.__config:
            return _get_item_and_maybe_make_dotindexable(self.__config, item)

    def __getitem__(self, item):
        return self.__getattr__(item)


CONFIG = Config(CONFIG_FILE)
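A quick usage sketch of the dot-indexing wrapper above; the keys are taken from config.yaml earlier in this diff, and the snippet itself is illustrative rather than part of the change set:

```python
from image_prediction.config import CONFIG

# Nested keys can be read via attribute access, item access, or .get() with a default.
batch_size = CONFIG.service.batch_size
run_id = CONFIG["service"]["mlflow_run_id"]
min_confidence = CONFIG.filters.get("min_confidence", 0.5)
```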
@@ -1,3 +1,5 @@
+from typing import Iterable
+
 from funcy import juxt
 
 from image_prediction.classifier.classifier import Classifier
@@ -5,6 +7,7 @@ from image_prediction.classifier.image_classifier import ImageClassifier
 from image_prediction.compositor.compositor import TransformerCompositor
 from image_prediction.encoder.encoders.hash_encoder import HashEncoder
 from image_prediction.estimator.adapter.adapter import EstimatorAdapter
+from image_prediction.formatter.formatter import format_image_plus
 from image_prediction.formatter.formatters.camel_case import Snake2CamelCaseKeyFormatter
 from image_prediction.formatter.formatters.enum import EnumFormatter
 from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
@@ -14,6 +17,7 @@ from image_prediction.model_loader.loaders.mlflow import MlflowConnector
 from image_prediction.redai_adapter.mlflow import MlflowModelReader
 from image_prediction.transformer.transformers.coordinate.pdfnet import PDFNetCoordinateTransformer
 from image_prediction.transformer.transformers.response import ResponseTransformer
+from pdf2img.extraction import extract_images_via_metadata
 
 
 def get_mlflow_model_loader(mlruns_dir):
@@ -26,10 +30,17 @@ def get_image_classifier(model_loader, model_identifier):
     return ImageClassifier(Classifier(EstimatorAdapter(model), ProbabilityMapper(classes)))
 
 
-def get_extractor(**kwargs):
+def get_dispatched_extract(**kwargs):
     image_extractor = ParsablePDFImageExtractor(**kwargs)
 
-    return image_extractor
+    def extract(pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
+        if metadata_per_image:
+            image_pluses = extract_images_via_metadata(pdf, metadata_per_image)
+            yield from map(format_image_plus, image_pluses)
+        else:
+            yield from image_extractor.extract(pdf, page_range)
+
+    return extract
 
 
 def get_formatter():
@@ -13,7 +13,7 @@ class HashEncoder(Encoder):
         yield from self.encode(images)
 
 
-def hash_image(image: Image.Image) -> str:
+def hash_image(image: Image.Image):
     """See: https://stackoverflow.com/a/49692185/3578468"""
     image = image.resize((10, 10), Image.ANTIALIAS)
     image = image.convert("L")
@@ -21,6 +21,4 @@ def hash_image(image: Image.Image) -> str:
     avg_pixel = sum(pixel_data) / len(pixel_data)
     bits = "".join(["1" if (px >= avg_pixel) else "0" for px in pixel_data])
     hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
-    # Note: For each 4 leading zeros, the hex representation will be shorter by one character.
-    # To ensure that all hashes have the same length, we pad the hex representation with zeros (also see RED-3813).
-    return hex_representation.zfill(25)
+    return hex_representation
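For context on the padding this hunk removes: `hash_image` builds a 100-bit average hash from a 10 x 10 grayscale thumbnail, and 100 bits fit in at most 25 hex digits, which is why the old code padded with `zfill(25)` so that hashes with leading zero bits kept a fixed length. A small illustrative sketch of that effect (not part of the change set):

```python
# A 100-bit string whose leading bits are zero yields a shorter hex string.
bits = "0" * 7 + "1" * 93
hex_representation = str(hex(int(bits, 2)))[2:][::-1].upper()
print(len(hex_representation))            # 24 -> length varies with leading zeros
print(len(hex_representation.zfill(25)))  # 25 -> fixed length, as in the removed code
```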
@@ -36,7 +36,3 @@ class InvalidBox(Exception):
 
 class ParsingError(Exception):
     pass
-
-
-class BadXref(ValueError):
-    pass
image_prediction/formatter/formatter.py (new file, 35 lines)
@@ -0,0 +1,35 @@
import abc

from image_prediction.image_extractor.extractor import ImageMetadataPair
from image_prediction.info import Info

from image_prediction.transformer.transformer import Transformer
from pdf2img.default_objects.image import ImagePlus


class Formatter(Transformer):
    @abc.abstractmethod
    def format(self, obj):
        raise NotImplementedError

    def transform(self, obj):
        raise NotImplementedError()

    def __call__(self, obj):
        return self.format(obj)


def format_image_plus(image: ImagePlus) -> ImageMetadataPair:
    enum_metadata = {
        Info.PAGE_WIDTH: image.info.pageInfo.width,
        Info.PAGE_HEIGHT: image.info.pageInfo.height,
        Info.PAGE_IDX: image.info.pageInfo.number,
        Info.ALPHA: image.info.alpha,
        Info.WIDTH: image.info.boundingBox.width,
        Info.HEIGHT: image.info.boundingBox.height,
        Info.X1: image.info.boundingBox.x0,
        Info.X2: image.info.boundingBox.x1,
        Info.Y1: image.info.boundingBox.y0,
        Info.Y2: image.info.boundingBox.y1,
    }
    return ImageMetadataPair(image.aspil(), enum_metadata)
image_prediction/image_extractor/extractors/parsable.py (new file, 208 lines)
@@ -0,0 +1,208 @@
import atexit
import io
import json
import traceback
from functools import partial, lru_cache
from itertools import chain, starmap, filterfalse
from operator import itemgetter, truth
from typing import List, Iterable, Iterator

import fitz
from PIL import Image
from funcy import rcompose, merge, pluck, curry, compose

from image_prediction.formatter.formatters.enum import EnumFormatter
from image_prediction.image_extractor.extractor import ImageExtractor, ImageMetadataPair
from image_prediction.info import Info
from image_prediction.stitching.stitching import stitch_pairs
from image_prediction.stitching.utils import validate_box_coords, validate_box_size
from image_prediction.utils import get_logger
from image_prediction.utils.generic import lift

logger = get_logger()


class ParsablePDFImageExtractor(ImageExtractor):
    def __init__(self, verbose=False, tolerance=0):
        """
        Args:
            verbose: Whether to show progressbar
            tolerance: The tolerance in pixels for the distance between images, beyond which they will not be stitched
                together
        """
        self.doc: fitz.fitz.Document = None
        self.verbose = verbose
        self.tolerance = tolerance

    def extract(self, pdf: bytes, page_range: range = None):
        self.doc = fitz.Document(stream=pdf)

        pages = extract_pages(self.doc, page_range) if page_range else self.doc

        image_metadata_pairs = chain.from_iterable(map(self.__process_images_on_page, pages))

        yield from image_metadata_pairs

    def __process_images_on_page(self, page: fitz.fitz.Page):
        images = get_images_on_page(self.doc, page)
        metadata = get_metadata_for_images_on_page(self.doc, page)
        clear_caches()

        image_metadata_pairs = starmap(ImageMetadataPair, filter(all, zip(images, metadata)))
        # TODO: In the future, consider to introduce an image validator as a pipeline component rather than doing the
        #  validation here. Invalid images can then be split into a different stream and joined with the intact images
        #  again for the formatting step.
        image_metadata_pairs = self.__filter_valid_images(image_metadata_pairs)
        image_metadata_pairs = stitch_pairs(list(image_metadata_pairs), tolerance=self.tolerance)

        yield from image_metadata_pairs

    @staticmethod
    def __filter_valid_images(image_metadata_pairs: Iterable[ImageMetadataPair]) -> Iterator[ImageMetadataPair]:
        def validate(image: Image.Image, metadata: dict):
            try:
                # TODO: stand-in heuristic for testing if image is valid => find cleaner solution (RED-5148)
                image.resize((100, 100)).convert("RGB")
                return ImageMetadataPair(image, metadata)
            except (OSError, Exception) as err:
                metadata = json.dumps(EnumFormatter()(metadata), indent=2)
                logger.warning(f"Invalid image encountered. Image metadata:\n{metadata}\n\n{traceback.format_exc()}")
                return None

        return filter(truth, starmap(validate, image_metadata_pairs))


def extract_pages(doc, page_range):
    page_range = range(page_range.start + 1, page_range.stop + 1)
    pages = map(doc.load_page, page_range)

    yield from pages


@lru_cache(maxsize=None)
def get_images_on_page(doc, page: fitz.Page):
    image_infos = get_image_infos(page)
    xrefs = map(itemgetter("xref"), image_infos)
    images = map(partial(xref_to_image, doc), xrefs)

    yield from images


def get_metadata_for_images_on_page(doc, page: fitz.Page):

    metadata = map(get_image_metadata, get_image_infos(page))
    metadata = validate_coords_and_passthrough(metadata)

    metadata = filter_out_tiny_images(metadata)
    metadata = validate_size_and_passthrough(metadata)

    metadata = add_page_metadata(page, metadata)

    metadata = add_alpha_channel_info(doc, page, metadata)

    yield from metadata


@lru_cache(maxsize=None)
def get_image_infos(page: fitz.Page) -> List[dict]:
    return page.get_image_info(xrefs=True)


@lru_cache(maxsize=None)
def xref_to_image(doc, xref) -> Image:
    maybe_image = load_image_handle_from_xref(doc, xref)
    return Image.open(io.BytesIO(maybe_image["image"])) if maybe_image else None


def get_image_metadata(image_info):

    x1, y1, x2, y2 = map(rounder, image_info["bbox"])

    width = abs(x2 - x1)
    height = abs(y2 - y1)

    return {
        Info.WIDTH: width,
        Info.HEIGHT: height,
        Info.X1: x1,
        Info.X2: x2,
        Info.Y1: y1,
        Info.Y2: y2,
    }


def validate_coords_and_passthrough(metadata):
    yield from map(validate_box_coords, metadata)


def filter_out_tiny_images(metadata):
    yield from filterfalse(tiny, metadata)


def validate_size_and_passthrough(metadata):
    yield from map(validate_box_size, metadata)


def add_page_metadata(page, metadata):
    yield from map(partial(merge, get_page_metadata(page)), metadata)


def add_alpha_channel_info(doc, page, metadata):

    page_to_xrefs = compose(curry(pluck)("xref"), get_image_infos)
    xref_to_alpha = partial(has_alpha_channel, doc)
    page_to_alpha_value_per_image = compose(lift(xref_to_alpha), page_to_xrefs)
    alpha_to_dict = compose(dict, lambda a: [(Info.ALPHA, a)])
    page_to_alpha_mapping_per_image = compose(lift(alpha_to_dict), page_to_alpha_value_per_image)

    metadata = starmap(merge, zip(page_to_alpha_mapping_per_image(page), metadata))

    yield from metadata


@lru_cache(maxsize=None)
def load_image_handle_from_xref(doc, xref):
    return doc.extract_image(xref)


rounder = rcompose(round, int)


def get_page_metadata(page):
    page_width, page_height = map(rounder, page.mediabox_size)

    return {
        Info.PAGE_WIDTH: page_width,
        Info.PAGE_HEIGHT: page_height,
        Info.PAGE_IDX: page.number,
    }


def has_alpha_channel(doc, xref):

    maybe_image = load_image_handle_from_xref(doc, xref)
    maybe_smask = maybe_image["smask"] if maybe_image else None

    if maybe_smask:
        return any([doc.extract_image(maybe_smask) is not None, bool(fitz.Pixmap(doc, maybe_smask).alpha)])
    else:
        try:
            return bool(fitz.Pixmap(doc, xref).alpha)
        except ValueError:
            logger.debug(f"Encountered invalid xref `{xref}` in {doc.metadata.get('title', '<no title>')}.")
            return False


def tiny(metadata):
    return metadata[Info.WIDTH] * metadata[Info.HEIGHT] <= 4


def clear_caches():
    get_image_infos.cache_clear()
    load_image_handle_from_xref.cache_clear()
    get_images_on_page.cache_clear()
    xref_to_image.cache_clear()


atexit.register(clear_caches)
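A minimal usage sketch of the extractor above, assuming `ImageMetadataPair` unpacks as an (image, metadata) pair, which its construction from `zip(images, metadata)` suggests; the input file name is a placeholder:

```python
from image_prediction.image_extractor.extractors.parsable import ParsablePDFImageExtractor
from image_prediction.info import Info

extractor = ParsablePDFImageExtractor(verbose=False, tolerance=0)

with open("sample.pdf", "rb") as fh:  # placeholder input document
    pdf_bytes = fh.read()

for image, metadata in extractor.extract(pdf_bytes, page_range=range(0, 3)):
    # Each yielded pair holds a PIL image and an Info-keyed metadata dict.
    print(metadata[Info.PAGE_IDX], metadata[Info.WIDTH], metadata[Info.HEIGHT])
```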
@@ -12,4 +12,3 @@ class Info(Enum):
     Y1 = "y1"
     Y2 = "y2"
     ALPHA = "alpha"
-    XREF = "xref"
image_prediction/locations.py (new file, 17 lines)
@@ -0,0 +1,17 @@
"""Defines constant paths relative to the module root path."""

from pathlib import Path

MODULE_DIR = Path(__file__).resolve().parents[0]

PACKAGE_ROOT_DIR = MODULE_DIR.parents[0]

CONFIG_FILE = PACKAGE_ROOT_DIR / "config.yaml"

BANNER_FILE = PACKAGE_ROOT_DIR / "banner.txt"

DATA_DIR = PACKAGE_ROOT_DIR / "data"

MLRUNS_DIR = str(DATA_DIR / "mlruns")

TEST_DATA_DIR = PACKAGE_ROOT_DIR / "test" / "data"
@ -1,10 +1,9 @@
 import os
-from functools import lru_cache, partial
+from functools import partial
 from itertools import chain, tee
-from typing import Iterable, Any
+from typing import Iterable
 
 from funcy import rcompose, first, compose, second, chunks, identity, rpartial
-from kn_utils.logging import logger
 from tqdm import tqdm
 
 from image_prediction.config import CONFIG
@ -12,8 +11,8 @@ from image_prediction.default_objects import (
     get_formatter,
     get_mlflow_model_loader,
     get_image_classifier,
-    get_extractor,
     get_encoder,
+    get_dispatched_extract,
 )
 from image_prediction.locations import MLRUNS_DIR
 from image_prediction.utils.generic import lift, starlift
@ -21,9 +20,7 @@ from image_prediction.utils.generic import lift, starlift
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 
 
-@lru_cache(maxsize=None)
 def load_pipeline(**kwargs):
-    logger.info(f"Loading pipeline with kwargs: {kwargs}")
     model_loader = get_mlflow_model_loader(MLRUNS_DIR)
     model_identifier = CONFIG.service.mlflow_run_id
 
@ -41,10 +38,10 @@ def star(f):
 
 
 class Pipeline:
-    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=False, **kwargs):
+    def __init__(self, model_loader, model_identifier, batch_size=16, verbose=True, **kwargs):
         self.verbose = verbose
 
-        extract = get_extractor(**kwargs)
+        extract = get_dispatched_extract(**kwargs)
         classifier = get_image_classifier(model_loader, model_identifier)
         reformat = get_formatter()
         represent = get_encoder()
@ -55,7 +52,7 @@ class Pipeline:
         join = compose(starlift(lambda prd, rpr, mdt: {"classification": prd, **mdt, "representation": rpr}), star(zip))
 
         # />--classify--\
-        # --extract-->--split--+->--encode---->+--join-->reformat-->filter_duplicates
+        # --extract-->--split--+->--encode---->+--join-->reformat
         # \>--identity--/
 
         self.pipe = rcompose(
@ -64,42 +61,12 @@ class Pipeline:
             pairwise_apply(classify, represent, identity), # ... apply functions to the streams pairwise
             join, # ... the streams by zipping
             reformat, # ... the items
-            filter_duplicates, # ... filter out duplicate images
         )
 
-    def __call__(self, pdf: bytes, page_range: range = None):
+    def __call__(self, pdf: bytes, page_range: range = None, metadata_per_image: Iterable[dict] = None):
         yield from tqdm(
-            self.pipe(pdf, page_range=page_range),
+            self.pipe(pdf, page_range=page_range, metadata_per_image=metadata_per_image),
            desc="Processing images from document",
            unit=" images",
            disable=not self.verbose,
        )
-
-
-def filter_duplicates(metadata: Iterable[dict[str, Any]]) -> Iterable[dict[str, Any]]:
-    """Filter out duplicate images from the `position` (image coordinates) and `page`, preferring the one with
-    `allPassed` set to True.
-    See RED-10765 (RM-241): Removed redactions reappear for why this is necessary.
-    """
-    keep = dict()
-    for image_meta in metadata:
-        key: tuple[int, int, int, int, int] = (
-            image_meta["position"]["x1"],
-            image_meta["position"]["x2"],
-            image_meta["position"]["y1"],
-            image_meta["position"]["y2"],
-            image_meta["position"]["pageNumber"],
-        )
-        if key in keep:
-            logger.warning(
-                f"Duplicate image found: x1={key[0]}, x2={key[1]}, y1={key[2]}, y2={key[3]}, pageNumber={key[4]}"
-            )
-            if image_meta["filters"]["allPassed"]:
-                logger.warning("Setting the image with allPassed flag set to True")
-                keep[key] = image_meta
-            else:
-                logger.warning("Keeping the previous image since the current image has allPassed flag set to False")
-        else:
-            keep[key] = image_meta
-
-    yield from keep.values()
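For context, a sketch of how the reworked __call__ signature could be driven; the importing module path, the file name, and the metadata values are assumptions, not taken from the diff. metadata_per_image is simply forwarded to the extraction stage at the head of the pipeline.

# Illustrative only; module path, file name and metadata content are assumptions.
from image_prediction.pipeline import load_pipeline

pipeline = load_pipeline()

with open("document.pdf", "rb") as fh:
    pdf_bytes = fh.read()

per_image = [{"source": "embedded"}]  # hypothetical extra metadata, one dict per image
for image_info in pipeline(pdf_bytes, page_range=range(0, 3), metadata_per_image=per_image):
    print(image_info["classification"]["label"])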
153 image_prediction/transformer/transformers/response.py Normal file
@ -0,0 +1,153 @@
+import json
+import math
+import os
+from functools import lru_cache
+from operator import itemgetter
+
+from funcy import first
+
+from image_prediction.config import CONFIG
+from image_prediction.exceptions import ParsingError
+from image_prediction.transformer.transformer import Transformer
+from image_prediction.utils import get_logger
+
+logger = get_logger()
+
+
+class ResponseTransformer(Transformer):
+    def transform(self, data):
+        logger.debug("ResponseTransformer.transform")
+        return build_image_info(data)
+
+
+def build_image_info(data: dict) -> dict:
+    def compute_geometric_quotient():
+        page_area_sqrt = math.sqrt(abs(page_width * page_height))
+        image_area_sqrt = math.sqrt(abs(x2 - x1) * abs(y2 - y1))
+        return image_area_sqrt / page_area_sqrt
+
+    page_width, page_height, x1, x2, y1, y2, width, height, alpha = itemgetter(
+        "page_width", "page_height", "x1", "x2", "y1", "y2", "width", "height", "alpha"
+    )(data)
+
+    classification = data["classification"]
+    label = classification["label"]
+    representation = data["representation"]
+
+    geometric_quotient = round(compute_geometric_quotient(), 4)
+
+    min_image_to_page_quotient_breached = bool(
+        geometric_quotient < get_class_specific_min_image_to_page_quotient(label)
+    )
+    max_image_to_page_quotient_breached = bool(
+        geometric_quotient > get_class_specific_max_image_to_page_quotient(label)
+    )
+
+    min_image_width_to_height_quotient_breached = bool(
+        width / height < get_class_specific_min_image_width_to_height_quotient(label)
+    )
+    max_image_width_to_height_quotient_breached = bool(
+        width / height > get_class_specific_max_image_width_to_height_quotient(label)
+    )
+
+    min_confidence_breached = bool(
+        max(classification["probabilities"].values()) < get_class_specific_min_classification_confidence(label)
+    )
+
+    image_info = {
+        "classification": classification,
+        "representation": representation,
+        "position": {"x1": x1, "x2": x2, "y1": y1, "y2": y2, "pageNumber": data["page_idx"] + 1},
+        "geometry": {"width": width, "height": height},
+        "alpha": alpha,
+        "filters": {
+            "geometry": {
+                "imageSize": {
+                    "quotient": geometric_quotient,
+                    "tooLarge": max_image_to_page_quotient_breached,
+                    "tooSmall": min_image_to_page_quotient_breached,
+                },
+                "imageFormat": {
+                    "quotient": round(width / height, 4),
+                    "tooTall": min_image_width_to_height_quotient_breached,
+                    "tooWide": max_image_width_to_height_quotient_breached,
+                },
+            },
+            "probability": {"unconfident": min_confidence_breached},
+            "allPassed": not any(
+                [
+                    max_image_to_page_quotient_breached,
+                    min_image_to_page_quotient_breached,
+                    min_image_width_to_height_quotient_breached,
+                    max_image_width_to_height_quotient_breached,
+                    min_confidence_breached,
+                ]
+            ),
+        },
+    }
+
+    return image_info
+
+
+def get_class_specific_min_image_to_page_quotient(label, table=None):
+    return get_class_specific_value(
+        "REL_IMAGE_SIZE", label, "min", CONFIG.filters.image_to_page_quotient.min, table=table
+    )
+
+
+def get_class_specific_max_image_to_page_quotient(label, table=None):
+    return get_class_specific_value(
+        "REL_IMAGE_SIZE", label, "max", CONFIG.filters.image_to_page_quotient.max, table=table
+    )
+
+
+def get_class_specific_min_image_width_to_height_quotient(label, table=None):
+    return get_class_specific_value(
+        "IMAGE_FORMAT", label, "min", CONFIG.filters.image_width_to_height_quotient.min, table=table
+    )
+
+
+def get_class_specific_max_image_width_to_height_quotient(label, table=None):
+    return get_class_specific_value(
+        "IMAGE_FORMAT", label, "max", CONFIG.filters.image_width_to_height_quotient.max, table=table
+    )
+
+
+def get_class_specific_min_classification_confidence(label, table=None):
+    return get_class_specific_value("CONFIDENCE", label, "min", CONFIG.filters.min_confidence, table=table)
+
+
+def get_class_specific_value(prefix, label, bound, fallback_value, table=None):
+    def fallback():
+        return fallback_value
+
+    def success():
+        threshold_map = parse_env_var(prefix, table=table) or {}
+        value = threshold_map.get(label, {}).get(bound)
+        if value:
+            logger.debug(f"Using class '{label}' specific {bound} {prefix.lower().replace('_', '-')} value.")
+        return value
+
+    assert bound in ["min", "max"]
+
+    return success() or fallback()
+
+
+@lru_cache(maxsize=None)
+def parse_env_var(prefix, table=None):
+    table = table or os.environ
+    head = first(filter(lambda s: s == prefix, table))
+    if head:
+        try:
+            return parse_env_var_value(table[head])
+        except ParsingError as err:
+            logger.warning(err)
+    else:
+        return None
+
+
+def parse_env_var_value(env_var_value):
+    try:
+        return json.loads(env_var_value)
+    except Exception as err:
+        raise ParsingError(f"Failed to parse {env_var_value}") from err
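The threshold helpers above fall back to CONFIG values unless an environment variable named after the prefix (REL_IMAGE_SIZE, IMAGE_FORMAT, CONFIDENCE) carries a JSON mapping from label to bounds. A minimal sketch of such an override, illustrative only; the label "graphic" and the numbers are made up:

# Illustrative only; the label "graphic" and the threshold values are made up.
import json
import os

os.environ["REL_IMAGE_SIZE"] = json.dumps({"graphic": {"min": 0.05, "max": 0.9}})

# With the variable set, get_class_specific_min_image_to_page_quotient("graphic")
# resolves to 0.05; any other label still falls back to the CONFIG value.
# Note: parse_env_var is wrapped in lru_cache, so the variable must be set
# before the first lookup for a given prefix.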
15 image_prediction/utils/generic.py Normal file
@ -0,0 +1,15 @@
+from itertools import starmap
+
+from funcy import iterate, first, curry, map
+
+
+def until(cond, func, *args, **kwargs):
+    return first(filter(cond, iterate(func, *args, **kwargs)))
+
+
+def lift(fn):
+    return curry(map)(fn)
+
+
+def starlift(fn):
+    return curry(starmap)(fn)
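Illustrative usage of the helpers above (not part of the diff; the commented results assume the funcy semantics of iterate and curry):

# Illustrative only; relies on the helpers defined in generic.py above.
from image_prediction.utils.generic import lift, starlift, until

double_all = lift(lambda x: x * 2)
print(list(double_all([1, 2, 3])))        # [2, 4, 6]

add_pairs = starlift(lambda a, b: a + b)
print(list(add_pairs([(1, 2), (3, 4)])))  # [3, 7]

# until() keeps applying func to its own result and returns the first value
# that satisfies cond: 3 -> 6 -> 12 -> ... -> 192.
print(until(lambda x: x > 100, lambda x: x * 2, 3))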
27 image_prediction/utils/logger.py Normal file
@ -0,0 +1,27 @@
+import logging
+
+from image_prediction.config import CONFIG
+
+
+def make_logger_getter():
+    logger = logging.getLogger("imclf")
+    logger.propagate = False
+
+    handler = logging.StreamHandler()
+    handler.setLevel(CONFIG.service.logging_level)
+
+    log_format = "%(asctime)s %(levelname)-8s %(message)s"
+    formatter = logging.Formatter(log_format, datefmt="%Y-%m-%d %H:%M:%S")
+
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+
+    logger.setLevel(CONFIG.service.logging_level)
+
+    def get_logger():
+        return logger
+
+    return get_logger
+
+
+get_logger = make_logger_getter()
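make_logger_getter builds one configured "imclf" logger and returns a closure over it, so every caller shares the same handler and level, and propagate = False keeps records from also reaching the root logger. A minimal, illustrative usage sketch:

# Illustrative only; uses the get_logger exported by logger.py above.
from image_prediction.utils.logger import get_logger

logger = get_logger()
logger.info("image prediction service started")  # formatted by the shared StreamHandler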
@ -56,8 +56,7 @@ def annotate_image(doc, image_info):
 
 def init():
     PDFNet.Initialize(
-        # "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
-        "Knecon AG:OEM:DDA-R::WL+:AMS(20270129):EA5FDFB23C7F36B9C2AE606F4F0D9197DE1FB649119F9730B622FABEF5C7"
+        "Knecon AG(en.knecon.swiss):OEM:DDA-R::WL+:AMS(20211029):BECC974307DAB4F34B513BC9B2531B24496F6FCB83CD8AC574358A959730B622FABEF5C7"
     )
 
 
Some files were not shown because too many files have changed in this diff.