diff --git a/.gitlab-ci.yml.bak b/.gitlab-ci.backup.yml similarity index 100% rename from .gitlab-ci.yml.bak rename to .gitlab-ci.backup.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d2c7989..1c16d00 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,12 +1,17 @@ include: - project: "Gitlab/gitlab" - ref: main - file: "/ci-templates/research/versioning-build+azure_model-test-release.gitlab-ci.yml" + ref: 0.3.0 + file: "/ci-templates/research/dvc-versioning-build-release.gitlab-ci.yml" variables: - NEXUS_PROJECT_DIR: ff + NEXUS_PROJECT_DIR: red IMAGENAME: "${CI_PROJECT_NAME}" +################################# +# temp. disable integration tests, b/c they don't cover the CV analysis case yet +trigger integration tests: + rules: + - when: never ###### # DOCS @@ -15,6 +20,7 @@ pages: before_script: - !reference [.activate-venv, script] script: + - cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/ - sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force - sphinx-build -b html docs/source/ public/ -E -a artifacts: @@ -30,51 +36,18 @@ pages: # MAKE SURE TO SWITCH OUT ALL YOUR MODEL NAMES + VERSIONS # name the job after the model it's using in the build, keep the prefix referencing `docker-build::` -docker-build::keyword-extraction-multi: - extends: .docker-build - variables: - MODEL_NAME: ${MODEL_MULTI_NAME} - MODEL_VERSION: ${MODEL_MULTI_VERSION} - -docker-build::cv-analysis-service: - extends: .docker-build +release build: + stage: release needs: - - !reference [.needs-versioning, needs] - - job: docker-build::cv-analysis-service - variables: - MODEL_NAME: ${MODEL_EN_NAME} - MODEL_VERSION: ${MODEL_EN_VERSION} - -docker-build::keyword-extraction-de: - extends: .docker-build - needs: - - !reference [.needs-versioning, needs] - - job: docker-build::cv-analysis-service - variables: - MODEL_NAME: ${MODEL_DE_NAME} - MODEL_VERSION: ${MODEL_DE_VERSION} - - -################### -# INTEGRATION 
TESTS -trigger-integration-tests: - extends: .integration-tests - needs: - ###### UPDATE/EDIT ###### - # YOU NEED ONLY TO DEFINE ONE - # reason is that we want to have one built image to use with the integration tests - # this should be the same image you uploaded test data for - - job: docker-build::cv-analysis-service - artifacts: true - rules: - - when: never # temp. disable integration tests - -######### -# RELEASE -release: - extends: .release - needs: - - !reference [.release, needs] # LEAVE THIS LINE AS IS - ###### UPDATE/EDIT ###### - # DEFINE ONE BUILD JOB THAT NEEDS TO EXIST BEFORE RELEASE - - job: docker-build::cv-analysis-service + - job: set custom version + artifacts: true + optional: true + - job: calculate patch version + artifacts: true + optional: true + - job: calculate minor version + artifacts: true + optional: true + - job: build docker nexus + artifacts: true +################################# diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1155696..3c633ba 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - args: [--unsafe] + args: [--unsafe] # needed for .gitlab-ci.yml - id: check-toml - id: detect-private-key - id: check-added-large-files diff --git a/Makefile b/Makefile index 7498375..c989ab2 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,8 @@ export DOCKER=docker export DOCKERFILE=Dockerfile -export IMAGE_NAME=keyword_extraction_service-image -export CONTAINER_NAME=keyword_extraction_service-container +export IMAGE_NAME=cv_analysis_service-image +export CONTAINER_NAME=cv_analysis_service-container export HOST_PORT=9999 export CONTAINER_PORT=9999 export PYTHON_VERSION=python3.10 @@ -88,4 +88,4 @@ sphinx_html: poetry run sphinx-build -b html docs/source/ docs/build/html -E -a sphinx_apidoc: - poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force + cp 
./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/ && poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force diff --git a/docs/build/html/.doctrees/README.doctree b/docs/build/html/.doctrees/README.doctree index 91c38e3..834b670 100644 Binary files a/docs/build/html/.doctrees/README.doctree and b/docs/build/html/.doctrees/README.doctree differ diff --git a/docs/build/html/.doctrees/environment.pickle b/docs/build/html/.doctrees/environment.pickle index 58ecfcf..ba3b85c 100644 Binary files a/docs/build/html/.doctrees/environment.pickle and b/docs/build/html/.doctrees/environment.pickle differ diff --git a/docs/build/html/.doctrees/index.doctree b/docs/build/html/.doctrees/index.doctree index f5b0768..acc570b 100644 Binary files a/docs/build/html/.doctrees/index.doctree and b/docs/build/html/.doctrees/index.doctree differ diff --git a/docs/build/html/README.html b/docs/build/html/README.html index 2e810dc..7557b96 100644 --- a/docs/build/html/README.html +++ b/docs/build/html/README.html @@ -8,7 +8,7 @@ - Keyword-Service — CV Analysis Service 2.5.1 documentation + cv-analysis - Visual (CV-Based) Document Parsing — CV Analysis Service 2.5.1 documentation @@ -44,7 +44,7 @@ - + @@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH @@ -259,7 +259,7 @@ document.write(` @@ -343,7 +343,7 @@ document.write(` - + @@ -360,193 +360,163 @@ document.write(`
-
-

Keyword-Service#

-

Service to get keywords of a paragraph or whole document.

- -
+
+

cv-analysis - Visual (CV-Based) Document Parsing#

+

parse_pdf() +This repository implements computer vision based approaches for detecting and parsing visual features such as tables or +previous redactions in documents.

+

API#

-
-

REST#

-

The service provides endpoints to extract keywords from a text and to embed a text. For details, download -OpenAPI schema and view it in a browser.

-
-
-

RabbitMQ#

-

The service listens to a queue and processes the messages. This method is ment to be used for extracting keywords from -whole documents. All RabbitMQ parameters including the queue names are set in environment variables, refer to the -service respective HELM chart for more information.

-

The input message should be a JSON object with the following structure:

+

Input message:

{
-  "targetFilePath": string,
-  "responseFilePath": string
+  "targetFilePath": {
+    "pdf": "absolute file path",
+    "vlp_output": "absolute file path"
+  },
+  "responseFilePath": "absolute file path",
+  "operation": "table_image_inference"
 }
 
-

The service downloads the file specified in targetFilePath. Supported data structures for the target file are:

-
    -
  • simplified text data (signifier key: paragraphs)

  • -
  • structure object data (signifier key: structureObjects)

  • -
-

As a response, the service uploads a JSON-structured file (as defined in responseFilePath) with the result under the -data key. The structure of the response file is as follows:

-
{
-    "targetFilePath"
-:
-    string,
-        "responseFilePath"
-:
-    string,
-        // and eventually further fields if present in the input message      
-        "data"
-:
-    [
+

The response is uploaded to the storage location specified in the responseFilePath field. The structure is as follows:

+
{
+  ...,
+  "data": [
+    {
+      'pageNum': 0,
+      'bbox': {
+        'x1': 55.3407,
+        'y1': 247.0246,
+        'x2': 558.5602,
+        'y2': 598.0585
+      },
+      'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
+      'label': 'table',
+      'tableLines': [
         {
-            "keywords": Array[string],
-            "paragraphId": int,
-            "embedding": Array[float]  // 384 dimensions
-        }
-    ]
+          'x1': 0,
+          'y1': 16,
+          'x2': 1399,
+          'y2': 16
+        },
+        ...
+      ],
+      'imageInfo': {
+        'height': 693,
+        'width': 1414
+      }
+    },
+    ...
+  ]
 }
 
-

Note that

-
    -
  • the embedding key is optional and can be omitted. The service will not calculate the embedding if the environment -variable MODEL__COMPUTE_EMBEDDINGS is set to false.

  • -
  • the service also computes the keywords for the whole document. In this case, the paragraphId is set to -1.

  • -
-
-
-

Service Configuration#

-

The service is configured via environment variables. The following variables are available:

-

| Variable | Description | Default | -| —————————————— | ———————————————————————————– | ——- | -| LOGGING__LEVEL | Logging level | INFO | -| MODEL__MAX_KEYWORDS_PER_PARAGRAPH | Maximum number of keywords per paragraph | 5 | -| MODLE__MAX_KEYWORDS_PER_DOCUMENT | Maximum number of keywords per document, when set to 0, no keywords are extracted | 0 | -| MODEL__COMPUTE_EMBEDDINGS | Whether to compute keyword embeddings or not | true | -| MODEL__PREPROCESSING__MIN_PARAGRAPH_LENGTH | Minimum number of characters in a paragraph to be considered for keyword extraction | 1 | -| MODEL__POSTPROCESSING__FILTER_SUBWORDS | Whether to filter out subwords from the keywords or not | true |

-

NOTE that these variables are subject to change. For the most recent configuration, refer to the service respective -HELM chart.

-
-
-

Language#

-

Currently, there is an english, a german and a multi-language model for keyword extraction. The models are uploaded to -mlflow and can -be set in the Dockerfile when building the container:

-

example for german model:

-
ENV AZURE_RESOURCE_GROUP="mriedl"
-ENV AZURE_AML_WORKSPACE="azureml-ws"
-ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
-ENV AZURE_AML_MODEL_VERSION="1"
-
-
-

and example for english model:

-
ENV AZURE_RESOURCE_GROUP="mriedl"
-ENV AZURE_AML_WORKSPACE="azureml-ws"
-ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
-ENV AZURE_AML_MODEL_VERSION="1"
+
+

Installation#

+
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
+cd cv-analysis
+
+python -m venv env
+source env/bin/activate
+
+pip install -e .
+pip install -r requirements.txt
+
+dvc pull
 

Usage#

-

Two Options:

-
    -
  1. REST: Send text per request to endpoint, endpoint returns keywords

  2. -
  3. Queue: Service gets text from queue, model calculates keywords, save keywords in queue

  4. -
-

To test the REST endpoint you have to set up an environment and do poetry install ( -see https://gitlab.knecon.com/knecon/research/template-python-project for details for setting up poetry) -Then run

-
python ./src/serve.py 
-
-
-

You don’t need to start a queue for that, just ignore the AMQP Error. -Port and host are set in settings.toml . -You can use the FastAPI under 127.0.0.1:8001/docs to send request to endpoint.

-

You can also test the service with docker:

-
-

Run Docker Commands#

-
docker build -t ${IMAGE_NAME} -f Dockerfile --build-arg GITLAB_USER=${GITLAB_USER} \
-                                            --build-arg GITLAB_ACCESS_TOKEN=${GITLAB_ACCESS_TOKEN} \
-                                            --build-arg AZURE_TENANT_ID=${AZURE_TENANT_ID} \
-                                            --build-arg AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID} \
-                                            --build-arg AZURE_CLIENT_ID=${AZURE_CLIENT_ID} \
-                                            --build-arg AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET} \
-                                            --build-arg AZURE_AML_MODEL_VERSION=${AZURE_AML_MODEL_VERSION} \
-                                            --build-arg AZURE_AML_MODEL_NAME=${AZURE_AML_MODEL_NAME} \
-                                            --build-arg AZURE_RESOURCE_GROUP=${AZURE_RESOURCE_GROUP} \
-                                            --build-arg AZURE_AML_WORKSPACE=${AZURE_AML_WORKSPACE}
-
-
-
docker run --net=host -it --rm --name ${CONTAINER_NAME} ${IMAGE_NAME}
+
+

As an API#

+

The module provides functions for the individual tasks that all return some kind of collection of points, depending on +the specific task.

+
+

Redaction Detection (API)#

+

The snippet below shows how to find the outlines of previous redactions.

+
from cv_analysis.redaction_detection import find_redactions
+import pdf2image
+import numpy as np
+
+pdf_path = ...
+page_index = ...
+
+page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
+page = np.array(page)
+
+redaction_contours = find_redactions(page)
 
-
-

Run locally#

-

First you need to download the model from mlflow. This can be done with the “src/ml_flow/download_model.py” script. -This scripts downloads a model and copies config and model data to the specific locations, such that the model can -be loaded.

-

For running/testing the keyword extraction locally you can use the src/tests/test_process.py script.

-

Model ist stored and loaded via DVC, you need the connection string under -https://portal.azure.com/#@knecon.com/resource/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourceGroups/taas-rg/providers/Microsoft.Storage/storageAccounts/taassaracer/keys

-
-
-

Upload models to ML Flow#

-

To upload the models to mlflow, you can use following script: src/mlflow/upload_model.py -For authentication following environment variables need to be set:

-
#AZURE_TENANT_ID=""
-#AZURE_SUBSCRIPTION_ID=""
-#AZURE_CLIENT_ID=""
-#AZURE_CLIENT_SECRET=""
+
+

As a CLI Tool#

+

Core API functionalities can be used through a CLI.

+
+

Table Parsing#

+

The table parsing utility detects and segments tables into individual cells.

+
python scripts/annotate.py data/test_pdf.pdf 7 --type table
 
-

Additional settings (resource group, experiment name, etc.) can be specified in the config ( -./src/mlflow/config/azure_config.toml). -The upload_model.py has the following parameters:

-
options:
-  -h, --help            show this help message and exit
-  -a AZURE_CONFIG, --azure_config AZURE_CONFIG
-                        Location of the configuration file for Azure (default: src/mlflow/config/azure_config.toml)
-  -b BASE_CONFIG, --base_config BASE_CONFIG
-                        Location of the basic training configuration (default: src/mlflow/config/settings_de.toml)
-  
+

The below image shows a parsed table, where each table cell has been detected individually.

+

Table Parsing Demonstration

+
+
+

Redaction Detection (CLI)#

+

The redaction detection utility detects previous redactions in PDFs (filled black rectangles).

+
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
 
-

the base config contains all information for the models used. Examples for German and -English are placed in /src/mlflow/config/

-

Note: Multi-language model tracking does not work for now. After the upload script reports an error, you have to -manually track the -model here -where you can find the run. Adhere to the naming conventions for the model name and versions, -see here

+

The below image shows the detected redactions with green outlines.

+

Redaction Detection Demonstration

+
+
+

Layout Parsing#

+

The layout parsing utility detects elements such as paragraphs, tables and figures.

+
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
+
+
+

The below image shows the detected layout elements on a page.

+

Layout Parsing Demonstration

+
+
+

Figure Detection#

+

The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.

+
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
+
+
+

The below image shows the detected figure on a page.

+

Figure Detection Demonstration

+
+
+
+

Running as a service#

+
+

Building#

+

Build base image

+
bash setup/docker.sh
+
+
+

Build head image

+
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
+
+
+
+
+

Usage (service)#

+

Shell 1

+
docker run --rm --net=host --rm cv-analysis
+
+
+

Shell 2

+
python scripts/client_mock.py --pdf_path /path/to/a/pdf
+
+
+
+
@@ -565,7 +535,7 @@ see

previous

-

Welcome to Keyword Extraction Service documentation!

+

Welcome to CV Analysis Service documentation!