chore: fix readme problem with docs and modify gitlab ci to build docs

iriley 2024-04-29 15:39:33 +02:00
parent 3a5fc32ec8
commit ab5096dd86
55 changed files with 554 additions and 642 deletions

View File

@@ -1,12 +1,17 @@
include:
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/versioning-build+azure_model-test-release.gitlab-ci.yml"
ref: 0.3.0
file: "/ci-templates/research/dvc-versioning-build-release.gitlab-ci.yml"
variables:
NEXUS_PROJECT_DIR: ff
NEXUS_PROJECT_DIR: red
IMAGENAME: "${CI_PROJECT_NAME}"
#################################
# temp. disable integration tests, b/c they don't cover the CV analysis case yet
trigger integration tests:
rules:
- when: never
######
# DOCS
@@ -15,6 +20,7 @@ pages:
before_script:
- !reference [.activate-venv, script]
script:
- cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/
- sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force
- sphinx-build -b html docs/source/ public/ -E -a
artifacts:
@@ -30,51 +36,18 @@ pages:
# MAKE SURE TO SWITCH OUT ALL YOUR MODEL NAMES + VERSIONS
# name the job after the model it's using in the build, keep the prefix referencing `docker-build::`
docker-build::keyword-extraction-multi:
extends: .docker-build
variables:
MODEL_NAME: ${MODEL_MULTI_NAME}
MODEL_VERSION: ${MODEL_MULTI_VERSION}
docker-build::cv-analysis-service:
extends: .docker-build
release build:
stage: release
needs:
- !reference [.needs-versioning, needs]
- job: docker-build::cv-analysis-service
variables:
MODEL_NAME: ${MODEL_EN_NAME}
MODEL_VERSION: ${MODEL_EN_VERSION}
docker-build::keyword-extraction-de:
extends: .docker-build
needs:
- !reference [.needs-versioning, needs]
- job: docker-build::cv-analysis-service
variables:
MODEL_NAME: ${MODEL_DE_NAME}
MODEL_VERSION: ${MODEL_DE_VERSION}
###################
# INTEGRATION TESTS
trigger-integration-tests:
extends: .integration-tests
needs:
###### UPDATE/EDIT ######
# YOU NEED ONLY TO DEFINE ONE
# reason is that we want to have one built image to use with the integration tests
# this should be the same image you uploaded test data for
- job: docker-build::cv-analysis-service
artifacts: true
rules:
- when: never # temp. disable integration tests
#########
# RELEASE
release:
extends: .release
needs:
- !reference [.release, needs] # LEAVE THIS LINE AS IS
###### UPDATE/EDIT ######
# DEFINE ONE BUILD JOB THAT NEEDS TO EXIST BEFORE RELEASE
- job: docker-build::cv-analysis-service
- job: set custom version
artifacts: true
optional: true
- job: calculate patch version
artifacts: true
optional: true
- job: calculate minor version
artifacts: true
optional: true
- job: build docker nexus
artifacts: true
#################################

View File

@@ -10,7 +10,7 @@ repos:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
args: [--unsafe]
args: [--unsafe] # needed for .gitlab-ci.yml
- id: check-toml
- id: detect-private-key
- id: check-added-large-files

View File

@@ -9,8 +9,8 @@
export DOCKER=docker
export DOCKERFILE=Dockerfile
export IMAGE_NAME=keyword_extraction_service-image
export CONTAINER_NAME=keyword_extraction_service-container
export IMAGE_NAME=cv_analysis_service-image
export CONTAINER_NAME=cv_analysis_service-container
export HOST_PORT=9999
export CONTAINER_PORT=9999
export PYTHON_VERSION=python3.10
@@ -88,4 +88,4 @@ sphinx_html:
poetry run sphinx-build -b html docs/source/ docs/build/html -E -a
sphinx_apidoc:
poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force
cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/ && poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Keyword-Service &#8212; CV Analysis Service 2.5.1 documentation</title>
<title>cv-analysis - Visual (CV-Based) Document Parsing &#8212; CV Analysis Service 2.5.1 documentation</title>
@@ -44,7 +44,7 @@
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="cv_analysis package" href="modules/cv_analysis.html" />
<link rel="prev" title="Welcome to Keyword Extraction Service documentation!" href="index.html" />
<link rel="prev" title="Welcome to CV Analysis Service documentation!" href="index.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
@@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item current active">
<a class="nav-link nav-internal" href="#">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@@ -259,7 +259,7 @@ document.write(`
<li class="nav-item current active">
<a class="nav-link nav-internal" href="#">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@@ -343,7 +343,7 @@ document.write(`
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item active" aria-current="page">Keyword-Service</li>
<li class="breadcrumb-item active" aria-current="page">cv-analysis...</li>
</ul>
</nav>
</div>
@@ -360,193 +360,163 @@ document.write(`
<div id="searchbox"></div>
<article class="bd-article">
<section id="keyword-service">
<h1>Keyword-Service<a class="headerlink" href="#keyword-service" title="Link to this heading">#</a></h1>
<p>Service to get keywords of a paragraph or whole document.</p>
<!-- TOC --><ul class="simple">
<li><p><a class="reference external" href="#keyword-service">Keyword-Service</a></p>
<ul>
<li><p><a class="reference external" href="#api">API</a></p>
<ul>
<li><p><a class="reference external" href="#rest">REST</a></p></li>
<li><p><a class="reference external" href="#rabbitmq">RabbitMQ</a></p></li>
</ul>
</li>
<li><p><a class="reference external" href="#service-configuration">Service Configuration</a></p></li>
<li><p><a class="reference external" href="#language">Language</a></p></li>
<li><p><a class="reference external" href="#usage">Usage</a></p>
<ul>
<li><p><a class="reference external" href="#run-docker-commands">Run Docker Commands</a></p></li>
<li><p><a class="reference external" href="#run-locally">Run locally</a></p></li>
</ul>
</li>
</ul>
</li>
<li><p><a class="reference external" href="#upload-models-to-ml-flow">Upload models to ML Flow</a></p></li>
</ul>
<!-- TOC --><section id="api">
<section id="cv-analysis-visual-cv-based-document-parsing">
<h1>cv-analysis - Visual (CV-Based) Document Parsing<a class="headerlink" href="#cv-analysis-visual-cv-based-document-parsing" title="Link to this heading">#</a></h1>
<p>This repository implements computer-vision-based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.</p>
<section id="api">
<h2>API<a class="headerlink" href="#api" title="Link to this heading">#</a></h2>
<section id="rest">
<h3>REST<a class="headerlink" href="#rest" title="Link to this heading">#</a></h3>
<p>The service provides endpoints to extract keywords from a text and to embed a text. For details, download
<a class="reference external" href="docs/openapi_redoc.html">OpenAPI schema</a> and view it in a browser.</p>
</section>
<section id="rabbitmq">
<h3>RabbitMQ<a class="headerlink" href="#rabbitmq" title="Link to this heading">#</a></h3>
<p>The service listens to a queue and processes the messages. This method is meant to be used for extracting keywords from
whole documents. All RabbitMQ parameters, including the queue names, are set in environment variables; refer to the
service's respective HELM chart for more information.</p>
<p>The input message should be a JSON object with the following structure:</p>
<p>Input message:</p>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">&quot;targetFilePath&quot;</span><span class="p">:</span><span class="w"> </span><span class="err">s</span><span class="kc">tr</span><span class="err">i</span><span class="kc">n</span><span class="err">g</span><span class="p">,</span>
<span class="w"> </span><span class="nt">&quot;responseFilePath&quot;</span><span class="p">:</span><span class="w"> </span><span class="err">s</span><span class="kc">tr</span><span class="err">i</span><span class="kc">n</span><span class="err">g</span>
<span class="w"> </span><span class="nt">&quot;targetFilePath&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">&quot;pdf&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;absolute file path&quot;</span><span class="p">,</span>
<span class="w"> </span><span class="nt">&quot;vlp_output&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;absolute file path&quot;</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="nt">&quot;responseFilePath&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;absolute file path&quot;</span><span class="p">,</span>
<span class="w"> </span><span class="nt">&quot;operation&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;table_image_inference&quot;</span>
<span class="p">}</span>
</pre></div>
</div>
<p>The service downloads the file specified in <code class="docutils literal notranslate"><span class="pre">targetFilePath</span></code>. Supported data structures for the target file are:</p>
<ul class="simple">
<li><p>simplified text data (signifier key: <code class="docutils literal notranslate"><span class="pre">paragraphs</span></code>)</p></li>
<li><p>structure object data (signifier key: <code class="docutils literal notranslate"><span class="pre">structureObjects</span></code>)</p></li>
</ul>
<p>As a response, the service uploads a JSON-structured file (as defined in <code class="docutils literal notranslate"><span class="pre">responseFilePath</span></code>) with the result under the
<code class="docutils literal notranslate"><span class="pre">data</span></code> key. The structure of the response file is as follows:</p>
<div class="highlight-javascript notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="s2">&quot;targetFilePath&quot;</span>
<span class="o">:</span>
<span class="w"> </span><span class="nx">string</span><span class="p">,</span>
<span class="w"> </span><span class="s2">&quot;responseFilePath&quot;</span>
<span class="o">:</span>
<span class="w"> </span><span class="nx">string</span><span class="p">,</span>
<span class="w"> </span><span class="c1">// and eventually further fields if present in the input message </span>
<span class="w"> </span><span class="s2">&quot;data&quot;</span>
<span class="o">:</span>
<span class="w"> </span><span class="p">[</span>
<p>The response is uploaded to the storage location specified in the <code class="docutils literal notranslate"><span class="pre">responseFilePath</span></code> field. The structure is as follows:</p>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="err">...</span><span class="p">,</span>
<span class="w"> </span><span class="nt">&quot;data&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="err">&#39;pageNum&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;bbox&#39;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">55.3407</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">247.0246</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">558.5602</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">598.0585</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="err">&#39;uuid&#39;</span><span class="p">:</span><span class="w"> </span><span class="err">&#39;</span><span class="mi">2</span><span class="err">b</span><span class="mi">10</span><span class="err">c</span><span class="mi">1</span><span class="err">a</span><span class="mi">2-393</span><span class="err">c</span><span class="mi">-4</span><span class="kc">f</span><span class="err">ca</span><span class="mi">-</span><span class="err">b</span><span class="mf">9e3-0</span><span class="err">ad</span><span class="mi">5</span><span class="err">b</span><span class="mi">774</span><span class="err">ac</span><span class="mi">84</span><span class="err">&#39;</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;label&#39;</span><span class="p">:</span><span class="w"> </span><span class="err">&#39;</span><span class="kc">ta</span><span class="err">ble&#39;</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;</span><span class="kc">ta</span><span class="err">bleLi</span><span class="kc">nes</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="s2">&quot;keywords&quot;</span><span class="o">:</span><span class="w"> </span><span class="nb">Array</span><span class="p">[</span><span class="nx">string</span><span class="p">],</span>
<span class="w"> </span><span class="s2">&quot;paragraphId&quot;</span><span class="o">:</span><span class="w"> </span><span class="kr">int</span><span class="p">,</span>
<span class="w"> </span><span class="s2">&quot;embedding&quot;</span><span class="o">:</span><span class="w"> </span><span class="nb">Array</span><span class="p">[</span><span class="kr">float</span><span class="p">]</span><span class="w"> </span><span class="c1">// 384 dimensions</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">]</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">1399</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="err">...</span>
<span class="w"> </span><span class="p">],</span>
<span class="w"> </span><span class="err">&#39;imageI</span><span class="kc">nf</span><span class="err">o&#39;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="err">&#39;heigh</span><span class="kc">t</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">693</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;wid</span><span class="kc">t</span><span class="err">h&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">1414</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="err">...</span>
<span class="w"> </span><span class="p">]</span>
<span class="p">}</span>
</pre></div>
</div>
<p><strong>Note</strong> that</p>
<ul class="simple">
<li><p>the <code class="docutils literal notranslate"><span class="pre">embedding</span></code> key is optional and can be omitted. The service will not calculate the embedding if the environment
variable <code class="docutils literal notranslate"><span class="pre">MODEL__COMPUTE_EMBEDDINGS</span></code> is set to <code class="docutils literal notranslate"><span class="pre">false</span></code>.</p></li>
<li><p>the service also computes the keywords for the whole document. In this case, the <code class="docutils literal notranslate"><span class="pre">paragraphId</span></code> is set to <code class="docutils literal notranslate"><span class="pre">-1</span></code>.</p></li>
</ul>
</section>
</section>
<section id="service-configuration">
<h2>Service Configuration<a class="headerlink" href="#service-configuration" title="Link to this heading">#</a></h2>
<p>The service is configured via environment variables. The following variables are available:</p>
<p>| Variable | Description | Default |
| —————————————— | ———————————————————————————– | ——- |
| LOGGING__LEVEL | Logging level | INFO |
| MODEL__MAX_KEYWORDS_PER_PARAGRAPH | Maximum number of keywords per paragraph | 5 |
| MODLE__MAX_KEYWORDS_PER_DOCUMENT | Maximum number of keywords per document, when set to 0, no keywords are extracted | 0 |
| MODEL__COMPUTE_EMBEDDINGS | Whether to compute keyword embeddings or not | true |
| MODEL__PREPROCESSING__MIN_PARAGRAPH_LENGTH | Minimum number of characters in a paragraph to be considered for keyword extraction | 1 |
| MODEL__POSTPROCESSING__FILTER_SUBWORDS | Whether to filter out subwords from the keywords or not | true |</p>
<p><strong>NOTE</strong> that these variables are subject to change. For the most recent configuration, refer to the service respective
HELM chart.</p>
</section>
<section id="language">
<h2>Language<a class="headerlink" href="#language" title="Link to this heading">#</a></h2>
<p>Currently, there is an English, a German, and a multi-language model for keyword extraction. The models are uploaded to
mlflow and can be set in the Dockerfile when building the container:</p>
<p>example for german model:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ENV</span> <span class="n">AZURE_RESOURCE_GROUP</span><span class="o">=</span><span class="s2">&quot;mriedl&quot;</span>
<span class="n">ENV</span> <span class="n">AZURE_AML_WORKSPACE</span><span class="o">=</span><span class="s2">&quot;azureml-ws&quot;</span>
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_NAME</span><span class="o">=</span><span class="s2">&quot;keyword-extraction-de&quot;</span>
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_VERSION</span><span class="o">=</span><span class="s2">&quot;1&quot;</span>
</pre></div>
</div>
<p>and example for english model:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ENV</span> <span class="n">AZURE_RESOURCE_GROUP</span><span class="o">=</span><span class="s2">&quot;mriedl&quot;</span>
<span class="n">ENV</span> <span class="n">AZURE_AML_WORKSPACE</span><span class="o">=</span><span class="s2">&quot;azureml-ws&quot;</span>
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_NAME</span><span class="o">=</span><span class="s2">&quot;keyword-extraction-de&quot;</span>
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_VERSION</span><span class="o">=</span><span class="s2">&quot;1&quot;</span>
<section id="installation">
<h2>Installation<a class="headerlink" href="#installation" title="Link to this heading">#</a></h2>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>git<span class="w"> </span>clone<span class="w"> </span>ssh://git@git.iqser.com:2222/rr/cv-analysis.git
<span class="nb">cd</span><span class="w"> </span>cv-analysis
python<span class="w"> </span>-m<span class="w"> </span>venv<span class="w"> </span>env
<span class="nb">source</span><span class="w"> </span>env/bin/activate
pip<span class="w"> </span>install<span class="w"> </span>-e<span class="w"> </span>.
pip<span class="w"> </span>install<span class="w"> </span>-r<span class="w"> </span>requirements.txt
dvc<span class="w"> </span>pull
</pre></div>
</div>
</section>
<section id="usage">
<h2>Usage<a class="headerlink" href="#usage" title="Link to this heading">#</a></h2>
<p><strong>Two Options:</strong></p>
<ol class="simple">
<li><p>REST: Send text per request to endpoint, endpoint returns keywords</p></li>
<li><p>Queue: Service gets text from queue, model calculates keywords, save keywords in queue</p></li>
</ol>
<p>To test the REST endpoint you have to set up an environment and run poetry install
(see https://gitlab.knecon.com/knecon/research/template-python-project for details on setting up poetry). Then run</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">python</span> <span class="o">./</span><span class="n">src</span><span class="o">/</span><span class="n">serve</span><span class="o">.</span><span class="n">py</span>
</pre></div>
</div>
<p>You don't need to start a queue for that; just ignore the AMQP error.
Port and host are set in settings.toml.
You can use the FastAPI docs UI at 127.0.0.1:8001/docs to send requests to the endpoint.</p>
<p>You can also test the service with docker:</p>
<section id="run-docker-commands">
<h3>Run Docker Commands<a class="headerlink" href="#run-docker-commands" title="Link to this heading">#</a></h3>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>build<span class="w"> </span>-t<span class="w"> </span><span class="si">${</span><span class="nv">IMAGE_NAME</span><span class="si">}</span><span class="w"> </span>-f<span class="w"> </span>Dockerfile<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">GITLAB_USER</span><span class="o">=</span><span class="si">${</span><span class="nv">GITLAB_USER</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">GITLAB_ACCESS_TOKEN</span><span class="o">=</span><span class="si">${</span><span class="nv">GITLAB_ACCESS_TOKEN</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_TENANT_ID</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_TENANT_ID</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_SUBSCRIPTION_ID</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_SUBSCRIPTION_ID</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_CLIENT_ID</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_CLIENT_ID</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_CLIENT_SECRET</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_CLIENT_SECRET</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_AML_MODEL_VERSION</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_AML_MODEL_VERSION</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_AML_MODEL_NAME</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_AML_MODEL_NAME</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_RESOURCE_GROUP</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_RESOURCE_GROUP</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_AML_WORKSPACE</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_AML_WORKSPACE</span><span class="si">}</span>
</pre></div>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>run<span class="w"> </span>--net<span class="o">=</span>host<span class="w"> </span>-it<span class="w"> </span>--rm<span class="w"> </span>--name<span class="w"> </span><span class="si">${</span><span class="nv">CONTAINER_NAME</span><span class="si">}</span><span class="w"> </span><span class="si">${</span><span class="nv">IMAGE_NAME</span><span class="si">}</span>
<section id="as-an-api">
<h3>As an API<a class="headerlink" href="#as-an-api" title="Link to this heading">#</a></h3>
<p>The module provides functions for the individual tasks; each returns some kind of collection of points, depending on
the specific task.</p>
<section id="redaction-detection-api">
<h4>Redaction Detection (API)<a class="headerlink" href="#redaction-detection-api" title="Link to this heading">#</a></h4>
<p>The below snippet shows how to find the outlines of previous redactions.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">cv_analysis.redaction_detection</span> <span class="kn">import</span> <span class="n">find_redactions</span>
<span class="kn">import</span> <span class="nn">pdf2image</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="n">pdf_path</span> <span class="o">=</span> <span class="o">...</span>
<span class="n">page_index</span> <span class="o">=</span> <span class="o">...</span>
<span class="n">page</span> <span class="o">=</span> <span class="n">pdf2image</span><span class="o">.</span><span class="n">convert_from_path</span><span class="p">(</span><span class="n">pdf_path</span><span class="p">,</span> <span class="n">first_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">,</span> <span class="n">last_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">page</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
<span class="n">redaction_contours</span> <span class="o">=</span> <span class="n">find_redactions</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
</pre></div>
</div>
</section>
<section id="run-locally">
<h3>Run locally<a class="headerlink" href="#run-locally" title="Link to this heading">#</a></h3>
<p>First you need to download the model from mlflow. This can be done with the <em>“src/ml_flow/download_model.py”</em> script.
This script downloads a model and copies config and model data to the specific locations, such that the model can
be loaded.</p>
<p>For running/testing the keyword extraction locally you can use the <em>src/tests/test_process.py</em> script.</p>
<p>The model is stored and loaded via DVC; you need the connection string from
https://portal.azure.com/#&#64;knecon.com/resource/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourceGroups/taas-rg/providers/Microsoft.Storage/storageAccounts/taassaracer/keys</p>
</section>
</section>
</section>
<section id="upload-models-to-ml-flow">
<h1>Upload models to ML Flow<a class="headerlink" href="#upload-models-to-ml-flow" title="Link to this heading">#</a></h1>
<p>To upload the models to mlflow, you can use the following script: src/mlflow/upload_model.py.
For authentication, the following environment variables need to be set:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1">#AZURE_TENANT_ID=&quot;&quot;</span>
<span class="c1">#AZURE_SUBSCRIPTION_ID=&quot;&quot;</span>
<span class="c1">#AZURE_CLIENT_ID=&quot;&quot;</span>
<span class="c1">#AZURE_CLIENT_SECRET=&quot;&quot;</span>
<section id="as-a-cli-tool">
<h2>As a CLI Tool<a class="headerlink" href="#as-a-cli-tool" title="Link to this heading">#</a></h2>
<p>Core API functionalities can be used through a CLI.</p>
<section id="table-parsing">
<h3>Table Parsing<a class="headerlink" href="#table-parsing" title="Link to this heading">#</a></h3>
<p>The table parsing utility detects and segments tables into individual cells.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>table
</pre></div>
</div>
<p>Additional settings (resource group, experiment name, etc.) can be specified in the config (
<em>./src/mlflow/config/azure_config.toml</em>).
The <em>upload_model.py</em> has the following parameters:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">options</span><span class="p">:</span>
<span class="o">-</span><span class="n">h</span><span class="p">,</span> <span class="o">--</span><span class="n">help</span> <span class="n">show</span> <span class="n">this</span> <span class="n">help</span> <span class="n">message</span> <span class="ow">and</span> <span class="n">exit</span>
<span class="o">-</span><span class="n">a</span> <span class="n">AZURE_CONFIG</span><span class="p">,</span> <span class="o">--</span><span class="n">azure_config</span> <span class="n">AZURE_CONFIG</span>
<span class="n">Location</span> <span class="n">of</span> <span class="n">the</span> <span class="n">configuration</span> <span class="n">file</span> <span class="k">for</span> <span class="n">Azure</span> <span class="p">(</span><span class="n">default</span><span class="p">:</span> <span class="n">src</span><span class="o">/</span><span class="n">mlflow</span><span class="o">/</span><span class="n">config</span><span class="o">/</span><span class="n">azure_config</span><span class="o">.</span><span class="n">toml</span><span class="p">)</span>
<span class="o">-</span><span class="n">b</span> <span class="n">BASE_CONFIG</span><span class="p">,</span> <span class="o">--</span><span class="n">base_config</span> <span class="n">BASE_CONFIG</span>
<span class="n">Location</span> <span class="n">of</span> <span class="n">the</span> <span class="n">basic</span> <span class="n">training</span> <span class="n">configuration</span> <span class="p">(</span><span class="n">default</span><span class="p">:</span> <span class="n">src</span><span class="o">/</span><span class="n">mlflow</span><span class="o">/</span><span class="n">config</span><span class="o">/</span><span class="n">settings_de</span><span class="o">.</span><span class="n">toml</span><span class="p">)</span>
<p>The below image shows a parsed table, where each table cell has been detected individually.</p>
<p><img alt="Table Parsing Demonstration" src="_images/table_parsing.png" /></p>
</section>
<section id="redaction-detection-cli">
<h3>Redaction Detection (CLI)<a class="headerlink" href="#redaction-detection-cli" title="Link to this heading">#</a></h3>
<p>The redaction detection utility detects previous redactions in PDFs (filled black rectangles).</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">2</span><span class="w"> </span>--type<span class="w"> </span>redaction
</pre></div>
</div>
<p>The base config contains all information for the models used. Examples for German and
English are placed in <em>/src/mlflow/config/</em>.</p>
<p>Note: Multi-language model tracking does not work for now. After the upload script reports an error, you have to
manually track the
model <a class="reference external" href="https://ml.azure.com/experiments?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&amp;tid=b44be368-e4f2-4ade-a089-cd2825458048">here</a>
where you can find the run. Adhere to the naming conventions for the model name and versions,
see <a class="reference external" href="https://ml.azure.com/model/list?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&amp;tid=b44be368-e4f2-4ade-a089-cd2825458048">here</a></p>
<p>The below image shows the detected redactions with green outlines.</p>
<p><img alt="Redaction Detection Demonstration" src="_images/redaction_detection.png" /></p>
</section>
<section id="layout-parsing">
<h3>Layout Parsing<a class="headerlink" href="#layout-parsing" title="Link to this heading">#</a></h3>
<p>The layout parsing utility detects elements such as paragraphs, tables and figures.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>layout
</pre></div>
</div>
<p>The below image shows the detected layout elements on a page.</p>
<p><img alt="Layout Parsing Demonstration" src="_images/layout_parsing.png" /></p>
</section>
<section id="figure-detection">
<h3>Figure Detection<a class="headerlink" href="#figure-detection" title="Link to this heading">#</a></h3>
<p>The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">3</span><span class="w"> </span>--type<span class="w"> </span>figure
</pre></div>
</div>
<p>The below image shows the detected figure on a page.</p>
<p><img alt="Figure Detection Demonstration" src="_images/figure_detection.png" /></p>
</section>
</section>
<section id="running-as-a-service">
<h2>Running as a service<a class="headerlink" href="#running-as-a-service" title="Link to this heading">#</a></h2>
<section id="building">
<h3>Building<a class="headerlink" href="#building" title="Link to this heading">#</a></h3>
<p>Build base image</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>bash<span class="w"> </span>setup/docker.sh
</pre></div>
</div>
<p>Build head image</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>build<span class="w"> </span>-f<span class="w"> </span>Dockerfile<span class="w"> </span>-t<span class="w"> </span>cv-analysis<span class="w"> </span>.<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">BASE_ROOT</span><span class="o">=</span><span class="s2">&quot;&quot;</span>
</pre></div>
</div>
</section>
<section id="usage-service">
<h3>Usage (service)<a class="headerlink" href="#usage-service" title="Link to this heading">#</a></h3>
<p>Shell 1</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>run<span class="w"> </span>--rm<span class="w"> </span>--net<span class="o">=</span>host<span class="w"> </span>--rm<span class="w"> </span>cv-analysis
</pre></div>
</div>
<p>Shell 2</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/client_mock.py<span class="w"> </span>--pdf_path<span class="w"> </span>/path/to/a/pdf
</pre></div>
</div>
</section>
</section>
</section>
@@ -565,7 +535,7 @@ see <a class="reference external" href="https://ml.azure.com/model/list?wsid=/su
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Welcome to Keyword Extraction Service documentation!</p>
<p class="prev-next-title">Welcome to CV Analysis Service documentation!</p>
</div>
</a>
<a class="right-next"
@@ -595,24 +565,28 @@ see <a class="reference external" href="https://ml.azure.com/model/list?wsid=/su
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h1 nav-item toc-entry"><a class="reference internal nav-link" href="#">Keyword-Service</a><ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#api">API</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#rest">REST</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#rabbitmq">RabbitMQ</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#service-configuration">Service Configuration</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#language">Language</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#api">API</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#installation">Installation</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#usage">Usage</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#run-docker-commands">Run Docker Commands</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#run-locally">Run locally</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#as-an-api">As an API</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-api">Redaction Detection (API)</a></li>
</ul>
</li>
</ul>
</li>
<li class="toc-h1 nav-item toc-entry"><a class="reference internal nav-link" href="#upload-models-to-ml-flow">Upload models to ML Flow</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#as-a-cli-tool">As a CLI Tool</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#table-parsing">Table Parsing</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-cli">Redaction Detection (CLI)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#layout-parsing">Layout Parsing</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#figure-detection">Figure Detection</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#running-as-a-service">Running as a service</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#building">Building</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#usage-service">Usage (service)</a></li>
</ul>
</li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">

Binary file not shown.

After

Width:  |  Height:  |  Size: 707 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 568 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.2 MiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 566 KiB

View File

@@ -1,203 +1,178 @@
# Keyword-Service
# cv-analysis - Visual (CV-Based) Document Parsing
Service to get keywords of a paragraph or whole document.
<!-- TOC -->
- [Keyword-Service](#keyword-service)
- [API](#api)
- [REST](#rest)
- [RabbitMQ](#rabbitmq)
- [Service Configuration](#service-configuration)
- [Language](#language)
- [Usage](#usage)
- [Run Docker Commands](#run-docker-commands)
- [Run locally](#run-locally)
- [Upload models to ML Flow](#upload-models-to-ml-flow)
<!-- TOC -->
This repository implements computer-vision-based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.
## API
### REST
The service provides endpoints to extract keywords from a text and to embed a text. For details, download
[OpenAPI schema](docs/openapi_redoc.html) and view it in a browser.
### RabbitMQ
The service listens to a queue and processes the messages. This method is meant to be used for extracting keywords from
whole documents. All RabbitMQ parameters, including the queue names, are set in environment variables; refer to the
service's respective HELM chart for more information.
The input message should be a JSON object with the following structure:
Input message:
```json
{
"targetFilePath": string,
"responseFilePath": string
"targetFilePath": {
"pdf": "absolute file path",
"vlp_output": "absolute file path"
},
"responseFilePath": "absolute file path",
"operation": "table_image_inference"
}
```
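As an illustration of how a client could submit such a request, the sketch below publishes the message to a queue. This assumes the service consumes requests from RabbitMQ (as the surrounding text describes); the connection settings, queue name, and file paths are purely illustrative.

```python
import json

import pika  # assumption: requests are published to a RabbitMQ queue

# Illustrative connection settings and queue name -- the real values come from
# the service's environment / HELM configuration.
connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()

request = {
    "targetFilePath": {
        "pdf": "/data/input/test_pdf.pdf",          # hypothetical absolute paths
        "vlp_output": "/data/input/test_pdf.json",
    },
    "responseFilePath": "/data/output/test_pdf_result.json",
    "operation": "table_image_inference",
}

# Publish the request as a JSON body to the (illustrative) request queue.
channel.basic_publish(exchange="", routing_key="cv-analysis-requests", body=json.dumps(request))
connection.close()
```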
The service downloads the file specified in `targetFilePath`. Supported data structures for the target file are:
The response is uploaded to the storage location specified in the `responseFilePath` field. The structure is as follows:
- simplified text data (signifier key: `paragraphs`)
- structure object data (signifier key: `structureObjects`)
As a response, the service uploads a JSON-structured file (as defined in `responseFilePath`) with the result under the
`data` key. The structure of the response file is as follows:
```javascript
```json
{
"targetFilePath"
:
string,
"responseFilePath"
:
string,
// and eventually further fields if present in the input message
"data"
:
[
...,
"data": [
{
'pageNum': 0,
'bbox': {
'x1': 55.3407,
'y1': 247.0246,
'x2': 558.5602,
'y2': 598.0585
},
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
'label': 'table',
'tableLines': [
{
"keywords": Array[string],
"paragraphId": int,
"embedding": Array[float] // 384 dimensions
}
]
'x1': 0,
'y1': 16,
'x2': 1399,
'y2': 16
},
...
],
'imageInfo': {
'height': 693,
'width': 1414
}
},
...
]
}
```
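For illustration, here is a minimal sketch of how a consumer might read such a response file once it has been retrieved from storage; the local file path is hypothetical and the field names follow the example above.

```python
import json

# Hypothetical local copy of the uploaded response file.
with open("test_pdf_result.json") as fh:
    response = json.load(fh)

for detection in response["data"]:
    bbox = detection["bbox"]
    print(f"page {detection['pageNum']}: {detection['label']} at "
          f"({bbox['x1']}, {bbox['y1']}) - ({bbox['x2']}, {bbox['y2']})")
    # Table detections additionally carry the detected separator lines.
    for line in detection.get("tableLines", []):
        print(f"  table line: ({line['x1']}, {line['y1']}) - ({line['x2']}, {line['y2']})")
```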
**Note** that
- the `embedding` key is optional and can be omitted. The service will not calculate the embedding if the environment
variable `MODEL__COMPUTE_EMBEDDINGS` is set to `false`.
- the service also computes the keywords for the whole document. In this case, the `paragraphId` is set to `-1`.
## Service Configuration
The service is configured via environment variables. The following variables are available:
| Variable | Description | Default |
| ------------------------------------------ | ----------------------------------------------------------------------------------- | ------- |
| LOGGING__LEVEL | Logging level | INFO |
| MODEL__MAX_KEYWORDS_PER_PARAGRAPH | Maximum number of keywords per paragraph | 5 |
| MODLE__MAX_KEYWORDS_PER_DOCUMENT | Maximum number of keywords per document, when set to 0, no keywords are extracted | 0 |
| MODEL__COMPUTE_EMBEDDINGS | Whether to compute keyword embeddings or not | true |
| MODEL__PREPROCESSING__MIN_PARAGRAPH_LENGTH | Minimum number of characters in a paragraph to be considered for keyword extraction | 1 |
| MODEL__POSTPROCESSING__FILTER_SUBWORDS | Whether to filter out subwords from the keywords or not | true |
**NOTE** that these variables are subject to change. For the most recent configuration, refer to the service respective
HELM chart.
## Language
Currently, there is an English, a German, and a multi-language model for keyword extraction. The models are uploaded to
mlflow and can be set in the Dockerfile when building the container:
example for german model:
```
ENV AZURE_RESOURCE_GROUP="mriedl"
ENV AZURE_AML_WORKSPACE="azureml-ws"
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
ENV AZURE_AML_MODEL_VERSION="1"
```
and example for english model:
## Installation
```
ENV AZURE_RESOURCE_GROUP="mriedl"
ENV AZURE_AML_WORKSPACE="azureml-ws"
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
ENV AZURE_AML_MODEL_VERSION="1"
```bash
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
cd cv-analysis
python -m venv env
source env/bin/activate
pip install -e .
pip install -r requirements.txt
dvc pull
```
## Usage
**Two Options:**
### As an API
1. REST: Send text per request to endpoint, endpoint returns keywords
2. Queue: Service gets text from queue, model calculates keywords, save keywords in queue
The module provides functions for the individual tasks; each returns some kind of collection of points, depending on
the specific task.
To test the REST endpoint you have to set up an environment and run poetry install
(see https://gitlab.knecon.com/knecon/research/template-python-project for details on setting up poetry). Then run
#### Redaction Detection (API)
```
python ./src/serve.py
The below snippet shows how to find the outlines of previous redactions.
```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)
redaction_contours = find_redactions(page)
```
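As a usage example continuing the snippet above, the returned contours can be drawn onto the page image. This sketch assumes `find_redactions` returns OpenCV-style contours (not confirmed here) and that OpenCV is installed.

```python
import cv2  # assumption: redaction_contours are OpenCV-style point arrays

# Draw the detected redaction outlines in green and save the annotated page.
annotated = page.copy()
cv2.drawContours(annotated, redaction_contours, -1, (0, 255, 0), 2)
cv2.imwrite("redactions_page.png", annotated)
```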
You don't need to start a queue for that; just ignore the AMQP error.
Port and host are set in settings.toml.
You can use the FastAPI docs UI at 127.0.0.1:8001/docs to send requests to the endpoint.
## As a CLI Tool
You can also test the service with docker:
Core API functionalities can be used through a CLI.
#### Run Docker Commands
### Table Parsing
The table parsing utility detects and segments tables into individual cells.
```bash
docker build -t ${IMAGE_NAME} -f Dockerfile --build-arg GITLAB_USER=${GITLAB_USER} \
--build-arg GITLAB_ACCESS_TOKEN=${GITLAB_ACCESS_TOKEN} \
--build-arg AZURE_TENANT_ID=${AZURE_TENANT_ID} \
--build-arg AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID} \
--build-arg AZURE_CLIENT_ID=${AZURE_CLIENT_ID} \
--build-arg AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET} \
--build-arg AZURE_AML_MODEL_VERSION=${AZURE_AML_MODEL_VERSION} \
--build-arg AZURE_AML_MODEL_NAME=${AZURE_AML_MODEL_NAME} \
--build-arg AZURE_RESOURCE_GROUP=${AZURE_RESOURCE_GROUP} \
--build-arg AZURE_AML_WORKSPACE=${AZURE_AML_WORKSPACE}
python scripts/annotate.py data/test_pdf.pdf 7 --type table
```
The below image shows a parsed table, where each table cell has been detected individually.
![Table Parsing Demonstration](data/table_parsing.png)
### Redaction Detection (CLI)
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
```bash
docker run --net=host -it --rm --name ${CONTAINER_NAME} ${IMAGE_NAME}
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
```
#### Run locally
The below image shows the detected redactions with green outlines.
First you need to download the model from mlflow. This can be done with the *"src/ml_flow/download_model.py"* script.
This script downloads a model and copies config and model data to the specific locations, such that the model can
be loaded.
![Redaction Detection Demonstration](data/redaction_detection.png)
For running/testing the keyword extraction locally you can use the *src/tests/test_process.py* script.
### Layout Parsing
The model is stored and loaded via DVC; you need the connection string from
https://portal.azure.com/#@knecon.com/resource/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourceGroups/taas-rg/providers/Microsoft.Storage/storageAccounts/taassaracer/keys
The layout parsing utility detects elements such as paragraphs, tables and figures.
# Upload models to ML Flow
To upload the models to mlflow, you can use the following script: src/mlflow/upload_model.py.
For authentication, the following environment variables need to be set:
```
#AZURE_TENANT_ID=""
#AZURE_SUBSCRIPTION_ID=""
#AZURE_CLIENT_ID=""
#AZURE_CLIENT_SECRET=""
```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
```
Additional settings (resource group, experiment name, etc.) can be specified in the config (
*./src/mlflow/config/azure_config.toml*).
The *upload_model.py* has the following parameters:
The below image shows the detected layout elements on a page.
```
options:
-h, --help show this help message and exit
-a AZURE_CONFIG, --azure_config AZURE_CONFIG
Location of the configuration file for Azure (default: src/mlflow/config/azure_config.toml)
-b BASE_CONFIG, --base_config BASE_CONFIG
Location of the basic training configuration (default: src/mlflow/config/settings_de.toml)
![Layout Parsing Demonstration](data/layout_parsing.png)
### Figure Detection
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
```bash
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
```
The base config contains all information for the models used. Examples for German and
English are placed in */src/mlflow/config/*.
The below image shows the detected figure on a page.
Note: Multi-language model tracking does not work for now. After the upload script reports an error, you have to
manually track the
model [here](https://ml.azure.com/experiments?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
where you can find the run. Adhere to the naming conventions for the model name and versions,
see [here](https://ml.azure.com/model/list?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
![Figure Detection Demonstration](data/figure_detection.png)
## Running as a service
### Building
Build base image
```bash
bash setup/docker.sh
```
Build head image
```bash
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
```
### Usage (service)
Shell 1
```bash
docker run --rm --net=host --rm cv-analysis
```
Shell 2
```bash
python scripts/client_mock.py --pdf_path /path/to/a/pdf
```

View File

@@ -3,9 +3,9 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
====================================================
Welcome to Keyword Extraction Service documentation!
====================================================
=============================================
Welcome to CV Analysis Service documentation!
=============================================
.. note::

View File

@@ -154,7 +154,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@@ -254,7 +254,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@@ -8,7 +8,7 @@
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Welcome to Keyword Extraction Service documentation! &#8212; CV Analysis Service 2.5.1 documentation</title>
<title>Welcome to CV Analysis Service documentation! &#8212; CV Analysis Service 2.5.1 documentation</title>
@@ -43,7 +43,7 @@
<script async="async" src="https://assets.readthedocs.org/static/javascript/readthedocs-doc-embed.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Keyword-Service" href="README.html" />
<link rel="next" title="cv-analysis - Visual (CV-Based) Document Parsing" href="README.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
@@ -156,7 +156,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@@ -260,7 +260,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@@ -326,8 +326,8 @@ document.write(`
<div id="searchbox"></div>
<article class="bd-article">
<section id="welcome-to-keyword-extraction-service-documentation">
<h1>Welcome to Keyword Extraction Service documentation!<a class="headerlink" href="#welcome-to-keyword-extraction-service-documentation" title="Link to this heading">#</a></h1>
<section id="welcome-to-cv-analysis-service-documentation">
<h1>Welcome to CV Analysis Service documentation!<a class="headerlink" href="#welcome-to-cv-analysis-service-documentation" title="Link to this heading">#</a></h1>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>If you'd like to change the looks of things 👉 <a class="reference external" href="https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html">https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html</a></p>
@@ -337,22 +337,27 @@ document.write(`
<div class="toctree-wrapper compound">
<p aria-level="2" class="caption" role="heading"><span class="caption-text">README</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="README.html">Keyword-Service</a><ul>
<li class="toctree-l2"><a class="reference internal" href="README.html#api">API</a><ul>
<li class="toctree-l3"><a class="reference internal" href="README.html#rest">REST</a></li>
<li class="toctree-l3"><a class="reference internal" href="README.html#rabbitmq">RabbitMQ</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="README.html#service-configuration">Service Configuration</a></li>
<li class="toctree-l2"><a class="reference internal" href="README.html#language">Language</a></li>
<li class="toctree-l1"><a class="reference internal" href="README.html">cv-analysis - Visual (CV-Based) Document Parsing</a><ul>
<li class="toctree-l2"><a class="reference internal" href="README.html#api">API</a></li>
<li class="toctree-l2"><a class="reference internal" href="README.html#installation">Installation</a></li>
<li class="toctree-l2"><a class="reference internal" href="README.html#usage">Usage</a><ul>
<li class="toctree-l3"><a class="reference internal" href="README.html#run-docker-commands">Run Docker Commands</a></li>
<li class="toctree-l3"><a class="reference internal" href="README.html#run-locally">Run locally</a></li>
<li class="toctree-l3"><a class="reference internal" href="README.html#as-an-api">As an API</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="README.html#as-a-cli-tool">As a CLI Tool</a><ul>
<li class="toctree-l3"><a class="reference internal" href="README.html#table-parsing">Table Parsing</a></li>
<li class="toctree-l3"><a class="reference internal" href="README.html#redaction-detection-cli">Redaction Detection (CLI)</a></li>
<li class="toctree-l3"><a class="reference internal" href="README.html#layout-parsing">Layout Parsing</a></li>
<li class="toctree-l3"><a class="reference internal" href="README.html#figure-detection">Figure Detection</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="README.html#running-as-a-service">Running as a service</a><ul>
<li class="toctree-l3"><a class="reference internal" href="README.html#building">Building</a></li>
<li class="toctree-l3"><a class="reference internal" href="README.html#usage-service">Usage (service)</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="README.html#upload-models-to-ml-flow">Upload models to ML Flow</a></li>
</ul>
</div>
<div class="toctree-wrapper compound">
@ -405,7 +410,7 @@ document.write(`
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Keyword-Service</p>
<p class="prev-next-title">cv-analysis - Visual (CV-Based) Document Parsing</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -44,7 +44,7 @@
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="cv_analysis.figure_detection package" href="cv_analysis.figure_detection.html" />
<link rel="prev" title="Keyword-Service" href="../README.html" />
<link rel="prev" title="cv-analysis - Visual (CV-Based) Document Parsing" href="../README.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -576,7 +576,7 @@ document.write(`
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Keyword-Service</p>
<p class="prev-next-title">cv-analysis - Visual (CV-Based) Document Parsing</p>
</div>
</a>
<a class="right-next"

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -259,7 +259,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -156,7 +156,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -258,7 +258,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="../README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

Binary file not shown.

View File

@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -257,7 +257,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

View File

@ -156,7 +156,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
@ -256,7 +256,7 @@ document.write(`
<li class="nav-item">
<a class="nav-link nav-internal" href="README.html">
Keyword-Service
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,203 +1,178 @@
# Keyword-Service
# cv-analysis - Visual (CV-Based) Document Parsing
Service to get keywords of a paragraph or whole document.
<!-- TOC -->
- [Keyword-Service](#keyword-service)
- [API](#api)
- [REST](#rest)
- [RabbitMQ](#rabbitmq)
- [Service Configuration](#service-configuration)
- [Language](#language)
- [Usage](#usage)
- [Run Docker Commands](#run-docker-commands)
- [Run locally](#run-locally)
- [Upload models to ML Flow](#upload-models-to-ml-flow)
<!-- TOC -->
parse_pdf()
This repository implements computer-vision-based approaches for detecting and parsing visual features, such as tables or
previous redactions, in documents.
## API
### REST
The service provides endpoints to extract keywords from a text and to embed a text. For details, download
[OpenAPI schema](docs/openapi_redoc.html) and view it in a browser.
### RabbitMQ
The service listens to a queue and processes the messages. This method is meant to be used for extracting keywords from
whole documents. All RabbitMQ parameters, including the queue names, are set in environment variables; refer to the
service's respective HELM chart for more information.
The input message should be a JSON object with the following structure:
Input message:
```json
{
"targetFilePath": string,
"responseFilePath": string
"targetFilePath": {
"pdf": "absolute file path",
"vlp_output": "absolute file path"
},
"responseFilePath": "absolute file path",
"operation": "table_image_inference"
}
```
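For illustration only, the sketch below publishes such a message with `pika`; the queue name, broker host and file paths are placeholders rather than values taken from this repository (the real values come from the service's environment configuration).
```python
# Illustrative publisher sketch -- queue name, host and paths are placeholders.
import json

import pika

message = {
    "targetFilePath": {
        "pdf": "/data/input/document.pdf",
        "vlp_output": "/data/input/document_vlp.json",
    },
    "responseFilePath": "/data/output/document_tables.json",
    "operation": "table_image_inference",
}

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="cv-analysis-requests", durable=True)
channel.basic_publish(
    exchange="",
    routing_key="cv-analysis-requests",
    body=json.dumps(message).encode("utf-8"),
)
connection.close()
```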
The service downloads the file specified in `targetFilePath`. Supported data structures for the target file are:
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
- simplified text data (signifier key: `paragraphs`)
- structure object data (signifier key: `structureObjects`)
As a response, the service uploads a JSON-structured file (as defined in `responseFilePath`) with the result under the
`data` key. The structure of the response file is as follows:
```javascript
```json
{
"targetFilePath"
:
string,
"responseFilePath"
:
string,
// and eventually further fields if present in the input message
"data"
:
[
...,
"data": [
{
'pageNum': 0,
'bbox': {
'x1': 55.3407,
'y1': 247.0246,
'x2': 558.5602,
'y2': 598.0585
},
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
'label': 'table',
'tableLines': [
{
"keywords": Array[string],
"paragraphId": int,
"embedding": Array[float] // 384 dimensions
}
]
'x1': 0,
'y1': 16,
'x2': 1399,
'y2': 16
},
...
],
'imageInfo': {
'height': 693,
'width': 1414
}
},
...
]
}
```
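As an illustration (not part of the repository), a consumer of the response file could read it like this; the path is a placeholder and the field names follow the example above.
```python
# Illustrative reader sketch -- the file path is a placeholder.
import json

with open("/data/output/document_tables.json") as fh:
    response = json.load(fh)

for detection in response["data"]:
    box = detection["bbox"]
    print(
        f"page {detection['pageNum']}: {detection['label']} at "
        f"({box['x1']:.1f}, {box['y1']:.1f})-({box['x2']:.1f}, {box['y2']:.1f}), "
        f"{len(detection['tableLines'])} table lines"
    )
```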
**Note** that
- the `embedding` key is optional and can be omitted. The service will not calculate the embedding if the environment
variable `MODEL__COMPUTE_EMBEDDINGS` is set to `false`.
- the service also computes the keywords for the whole document. In this case, the `paragraphId` is set to `-1`.
## Service Configuration
The service is configured via environment variables. The following variables are available:
| Variable | Description | Default |
| ------------------------------------------ | ----------------------------------------------------------------------------------- | ------- |
| LOGGING__LEVEL | Logging level | INFO |
| MODEL__MAX_KEYWORDS_PER_PARAGRAPH | Maximum number of keywords per paragraph | 5 |
| MODLE__MAX_KEYWORDS_PER_DOCUMENT | Maximum number of keywords per document, when set to 0, no keywords are extracted | 0 |
| MODEL__COMPUTE_EMBEDDINGS | Whether to compute keyword embeddings or not | true |
| MODEL__PREPROCESSING__MIN_PARAGRAPH_LENGTH | Minimum number of characters in a paragraph to be considered for keyword extraction | 1 |
| MODEL__POSTPROCESSING__FILTER_SUBWORDS | Whether to filter out subwords from the keywords or not | true |
**NOTE** that these variables are subject to change. For the most recent configuration, refer to the service's respective
HELM chart.
## Language
Currently, there is an English, a German and a multi-language model for keyword extraction. The models are uploaded to
mlflow and can be set in the Dockerfile when building the container.
Example for the German model:
```
ENV AZURE_RESOURCE_GROUP="mriedl"
ENV AZURE_AML_WORKSPACE="azureml-ws"
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
ENV AZURE_AML_MODEL_VERSION="1"
```
And an example for the English model:
## Installation
```
ENV AZURE_RESOURCE_GROUP="mriedl"
ENV AZURE_AML_WORKSPACE="azureml-ws"
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
ENV AZURE_AML_MODEL_VERSION="1"
```bash
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
cd cv-analysis
python -m venv env
source env/bin/activate
pip install -e .
pip install -r requirements.txt
dvc pull
```
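`dvc pull` needs credentials for the DVC remote. Assuming the remote is Azure Blob Storage (the connection string is referenced elsewhere in this repository), one way to provide them locally is sketched below; the remote name is a placeholder.
```bash
# <remote-name> is a placeholder -- list the configured remotes with `dvc remote list`
dvc remote modify --local <remote-name> connection_string "<azure-storage-connection-string>"
dvc pull
```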
## Usage
**Two Options:**
### As an API
1. REST: Send text per request to the endpoint; the endpoint returns keywords
2. Queue: The service gets text from the queue, the model calculates keywords, and the keywords are written back to the queue
The module provides functions for the individual tasks; each returns a collection of points whose exact form depends on
the specific task.
To test the REST endpoint you have to set up an environment and run poetry install
(see https://gitlab.knecon.com/knecon/research/template-python-project for details on setting up poetry).
Then run
#### Redaction Detection (API)
```
python ./src/serve.py
The snippet below shows how to find the outlines of previous redactions.
```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)
redaction_contours = find_redactions(page)
```
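If the returned contours are in the usual OpenCV point format (an assumption, see the note on return types above), they can be drawn directly onto the page image, for example:
```python
# Assumes find_redactions() returns OpenCV-style contours for the page array above.
import cv2

annotated = page.copy()
cv2.drawContours(annotated, redaction_contours, -1, (0, 255, 0), 2)  # green outlines
cv2.imwrite("redactions_page.png", annotated)
```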
You don't need to start a queue for that; just ignore the AMQP error.
Port and host are set in settings.toml.
You can use the FastAPI docs UI at 127.0.0.1:8001/docs to send requests to the endpoint.
## As a CLI Tool
You can also test the service with docker:
Core API functionalities can be used through a CLI.
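The general invocation pattern, inferred from the examples in the sections below, is:
```bash
# general form inferred from the examples below; angle brackets mark placeholders
python scripts/annotate.py <pdf_path> <page_number> --type {table,redaction,layout,figure}
```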
#### Run Docker Commands
### Table Parsing
The table parsing utility detects tables and segments them into individual cells.
```bash
docker build -t ${IMAGE_NAME} -f Dockerfile --build-arg GITLAB_USER=${GITLAB_USER} \
--build-arg GITLAB_ACCESS_TOKEN=${GITLAB_ACCESS_TOKEN} \
--build-arg AZURE_TENANT_ID=${AZURE_TENANT_ID} \
--build-arg AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID} \
--build-arg AZURE_CLIENT_ID=${AZURE_CLIENT_ID} \
--build-arg AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET} \
--build-arg AZURE_AML_MODEL_VERSION=${AZURE_AML_MODEL_VERSION} \
--build-arg AZURE_AML_MODEL_NAME=${AZURE_AML_MODEL_NAME} \
--build-arg AZURE_RESOURCE_GROUP=${AZURE_RESOURCE_GROUP} \
--build-arg AZURE_AML_WORKSPACE=${AZURE_AML_WORKSPACE}
python scripts/annotate.py data/test_pdf.pdf 7 --type table
```
The below image shows a parsed table, where each table cell has been detected individually.
![Table Parsing Demonstration](data/table_parsing.png)
### Redaction Detection (CLI)
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
```bash
docker run --net=host -it --rm --name ${CONTAINER_NAME} ${IMAGE_NAME}
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
```
#### Run locally
The below image shows the detected redactions with green outlines.
First you need to download the model from mlflow. This can be done with the *"src/ml_flow/download_model.py"* script.
This script downloads a model and copies the config and model data to the expected locations so that the model can
be loaded.
![Redaction Detection Demonstration](data/redaction_detection.png)
For running/testing the keyword extraction locally you can use the *src/tests/test_process.py* script.
### Layout Parsing
The model is stored and loaded via DVC; you need the connection string from
https://portal.azure.com/#@knecon.com/resource/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourceGroups/taas-rg/providers/Microsoft.Storage/storageAccounts/taassaracer/keys
The layout parsing utility detects elements such as paragraphs, tables and figures.
# Upload models to ML Flow
To upload the models to mlflow, you can use the following script: src/mlflow/upload_model.py.
For authentication, the following environment variables need to be set:
```
#AZURE_TENANT_ID=""
#AZURE_SUBSCRIPTION_ID=""
#AZURE_CLIENT_ID=""
#AZURE_CLIENT_SECRET=""
```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
```
Additional settings (resource group, experiment name, etc.) can be specified in the config (
*./src/mlflow/config/azure_config.toml*).
The *upload_model.py* script has the following parameters:
The below image shows the detected layout elements on a page.
```
options:
-h, --help show this help message and exit
-a AZURE_CONFIG, --azure_config AZURE_CONFIG
Location of the configuration file for Azure (default: src/mlflow/config/azure_config.toml)
-b BASE_CONFIG, --base_config BASE_CONFIG
Location of the basic training configuration (default: src/mlflow/config/settings_de.toml)
![Layout Parsing Demonstration](data/layout_parsing.png)
### Figure Detection
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
```bash
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
```
The base config contains all the information for the models used. Examples for German and
English are placed in */src/mlflow/config/*.
The below image shows the detected figure on a page.
Note: Multi-language model tracking does not work for now. After the upload script reports an error, you have to
manually track the
model [here](https://ml.azure.com/experiments?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
where you can find the run. Adhere to the naming conventions for the model name and versions,
see [here](https://ml.azure.com/model/list?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
![Figure Detection Demonstration](data/figure_detection.png)
## Running as a service
### Building
Build base image
```bash
bash setup/docker.sh
```
Build head image
```bash
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
```
### Usage (service)
Shell 1
```bash
docker run --rm --net=host cv-analysis
```
Shell 2
```bash
python scripts/client_mock.py --pdf_path /path/to/a/pdf
```

5
docs/source/data/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
/test_pdf.pdf
/figure_detection.png
/layout_parsing.png
/redaction_detection.png
/table_parsing.png

5
docs/source/data/data/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
/test_pdf.pdf
/figure_detection.png
/layout_parsing.png
/redaction_detection.png
/table_parsing.png

View File

@ -3,9 +3,9 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
====================================================
Welcome to Keyword Extraction Service documentation!
====================================================
=============================================
Welcome to CV Analysis Service documentation!
=============================================
.. note::