chore: fix readme problem with docs and modify gitlab ci to build docs
This commit is contained in:
parent
3a5fc32ec8
commit
ab5096dd86
@ -1,12 +1,17 @@
|
||||
include:
|
||||
- project: "Gitlab/gitlab"
|
||||
ref: main
|
||||
file: "/ci-templates/research/versioning-build+azure_model-test-release.gitlab-ci.yml"
|
||||
ref: 0.3.0
|
||||
file: "/ci-templates/research/dvc-versioning-build-release.gitlab-ci.yml"
|
||||
|
||||
variables:
|
||||
NEXUS_PROJECT_DIR: ff
|
||||
NEXUS_PROJECT_DIR: red
|
||||
IMAGENAME: "${CI_PROJECT_NAME}"
|
||||
|
||||
#################################
|
||||
# temp. disable integration tests, b/c they don't cover the CV analysis case yet
|
||||
trigger integration tests:
|
||||
rules:
|
||||
- when: never
|
||||
|
||||
######
|
||||
# DOCS
|
||||
@ -15,6 +20,7 @@ pages:
|
||||
before_script:
|
||||
- !reference [.activate-venv, script]
|
||||
script:
|
||||
- cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/
|
||||
- sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force
|
||||
- sphinx-build -b html docs/source/ public/ -E -a
|
||||
artifacts:
|
||||
@ -30,51 +36,18 @@ pages:
|
||||
# MAKE SURE TO SWITCH OUT ALL YOUR MODEL NAMES + VERSIONS
|
||||
# name the job after the model it's using in the build, keep the prefix referencing `docker-build::`
|
||||
|
||||
docker-build::keyword-extraction-multi:
|
||||
extends: .docker-build
|
||||
variables:
|
||||
MODEL_NAME: ${MODEL_MULTI_NAME}
|
||||
MODEL_VERSION: ${MODEL_MULTI_VERSION}
|
||||
|
||||
docker-build::cv-analysis-service:
|
||||
extends: .docker-build
|
||||
release build:
|
||||
stage: release
|
||||
needs:
|
||||
- !reference [.needs-versioning, needs]
|
||||
- job: docker-build::cv-analysis-service
|
||||
variables:
|
||||
MODEL_NAME: ${MODEL_EN_NAME}
|
||||
MODEL_VERSION: ${MODEL_EN_VERSION}
|
||||
|
||||
docker-build::keyword-extraction-de:
|
||||
extends: .docker-build
|
||||
needs:
|
||||
- !reference [.needs-versioning, needs]
|
||||
- job: docker-build::cv-analysis-service
|
||||
variables:
|
||||
MODEL_NAME: ${MODEL_DE_NAME}
|
||||
MODEL_VERSION: ${MODEL_DE_VERSION}
|
||||
|
||||
|
||||
###################
|
||||
# INTEGRATION TESTS
|
||||
trigger-integration-tests:
|
||||
extends: .integration-tests
|
||||
needs:
|
||||
###### UPDATE/EDIT ######
|
||||
# YOU NEED ONLY TO DEFINE ONE
|
||||
# reason is that we want to have one built image to use with the integration tests
|
||||
# this should be the same image you uploaded test data for
|
||||
- job: docker-build::cv-analysis-service
|
||||
artifacts: true
|
||||
rules:
|
||||
- when: never # temp. disable integration tests
|
||||
|
||||
#########
|
||||
# RELEASE
|
||||
release:
|
||||
extends: .release
|
||||
needs:
|
||||
- !reference [.release, needs] # LEAVE THIS LINE AS IS
|
||||
###### UPDATE/EDIT ######
|
||||
# DEFINE ONE BUILD JOB THAT NEEDS TO EXIST BEFORE RELEASE
|
||||
- job: docker-build::cv-analysis-service
|
||||
- job: set custom version
|
||||
artifacts: true
|
||||
optional: true
|
||||
- job: calculate patch version
|
||||
artifacts: true
|
||||
optional: true
|
||||
- job: calculate minor version
|
||||
artifacts: true
|
||||
optional: true
|
||||
- job: build docker nexus
|
||||
artifacts: true
|
||||
#################################
|
||||
|
||||
@ -10,7 +10,7 @@ repos:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-yaml
|
||||
args: [--unsafe]
|
||||
args: [--unsafe] # needed for .gitlab-ci.yml
|
||||
- id: check-toml
|
||||
- id: detect-private-key
|
||||
- id: check-added-large-files
|
||||
|
||||
6
Makefile
6
Makefile
@ -9,8 +9,8 @@
|
||||
|
||||
export DOCKER=docker
|
||||
export DOCKERFILE=Dockerfile
|
||||
export IMAGE_NAME=keyword_extraction_service-image
|
||||
export CONTAINER_NAME=keyword_extraction_service-container
|
||||
export IMAGE_NAME=cv_analysis_service-image
|
||||
export CONTAINER_NAME=cv_analysis_service-container
|
||||
export HOST_PORT=9999
|
||||
export CONTAINER_PORT=9999
|
||||
export PYTHON_VERSION=python3.10
|
||||
@ -88,4 +88,4 @@ sphinx_html:
|
||||
poetry run sphinx-build -b html docs/source/ docs/build/html -E -a
|
||||
|
||||
sphinx_apidoc:
|
||||
poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force
|
||||
cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/ && poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force
|
||||
|
||||
BIN
docs/build/html/.doctrees/README.doctree
vendored
BIN
docs/build/html/.doctrees/README.doctree
vendored
Binary file not shown.
BIN
docs/build/html/.doctrees/environment.pickle
vendored
BIN
docs/build/html/.doctrees/environment.pickle
vendored
Binary file not shown.
BIN
docs/build/html/.doctrees/index.doctree
vendored
BIN
docs/build/html/.doctrees/index.doctree
vendored
Binary file not shown.
336
docs/build/html/README.html
vendored
336
docs/build/html/README.html
vendored
@ -8,7 +8,7 @@
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
|
||||
<title>Keyword-Service — CV Analysis Service 2.5.1 documentation</title>
|
||||
<title>cv-analysis - Visual (CV-Based) Document Parsing — CV Analysis Service 2.5.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@ -44,7 +44,7 @@
|
||||
<link rel="index" title="Index" href="genindex.html" />
|
||||
<link rel="search" title="Search" href="search.html" />
|
||||
<link rel="next" title="cv_analysis package" href="modules/cv_analysis.html" />
|
||||
<link rel="prev" title="Welcome to Keyword Extraction Service documentation!" href="index.html" />
|
||||
<link rel="prev" title="Welcome to CV Analysis Service documentation!" href="index.html" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item current active">
|
||||
<a class="nav-link nav-internal" href="#">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item current active">
|
||||
<a class="nav-link nav-internal" href="#">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -343,7 +343,7 @@ document.write(`
|
||||
<i class="fa-solid fa-home"></i>
|
||||
</a>
|
||||
</li>
|
||||
<li class="breadcrumb-item active" aria-current="page">Keyword-Service</li>
|
||||
<li class="breadcrumb-item active" aria-current="page">cv-analysis...</li>
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
@ -360,193 +360,163 @@ document.write(`
|
||||
<div id="searchbox"></div>
|
||||
<article class="bd-article">
|
||||
|
||||
<section id="keyword-service">
|
||||
<h1>Keyword-Service<a class="headerlink" href="#keyword-service" title="Link to this heading">#</a></h1>
|
||||
<p>Service to get keywords of a paragraph or whole document.</p>
|
||||
<!-- TOC --><ul class="simple">
|
||||
<li><p><a class="reference external" href="#keyword-service">Keyword-Service</a></p>
|
||||
<ul>
|
||||
<li><p><a class="reference external" href="#api">API</a></p>
|
||||
<ul>
|
||||
<li><p><a class="reference external" href="#rest">REST</a></p></li>
|
||||
<li><p><a class="reference external" href="#rabbitmq">RabbitMQ</a></p></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><p><a class="reference external" href="#service-configuration">Service Configuration</a></p></li>
|
||||
<li><p><a class="reference external" href="#language">Language</a></p></li>
|
||||
<li><p><a class="reference external" href="#usage">Usage</a></p>
|
||||
<ul>
|
||||
<li><p><a class="reference external" href="#run-docker-commands">Run Docker Commands</a></p></li>
|
||||
<li><p><a class="reference external" href="#run-locally">Run locally</a></p></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><p><a class="reference external" href="#upload-models-to-ml-flow">Upload models to ML Flow</a></p></li>
|
||||
</ul>
|
||||
<!-- TOC --><section id="api">
|
||||
<section id="cv-analysis-visual-cv-based-document-parsing">
|
||||
<h1>cv-analysis - Visual (CV-Based) Document Parsing<a class="headerlink" href="#cv-analysis-visual-cv-based-document-parsing" title="Link to this heading">#</a></h1>
|
||||
<p>parse_pdf()
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.</p>
|
||||
<section id="api">
|
||||
<h2>API<a class="headerlink" href="#api" title="Link to this heading">#</a></h2>
|
||||
<section id="rest">
|
||||
<h3>REST<a class="headerlink" href="#rest" title="Link to this heading">#</a></h3>
|
||||
<p>The service provides endpoints to extract keywords from a text and to embed a text. For details, download
|
||||
<a class="reference external" href="docs/openapi_redoc.html">OpenAPI schema</a> and view it in a browser.</p>
|
||||
</section>
|
||||
<section id="rabbitmq">
|
||||
<h3>RabbitMQ<a class="headerlink" href="#rabbitmq" title="Link to this heading">#</a></h3>
|
||||
<p>The service listens to a queue and processes the messages. This method is meant to be used for extracting keywords from
|
||||
whole documents. All RabbitMQ parameters including the queue names are set in environment variables, refer to the
|
||||
service respective HELM chart for more information.</p>
|
||||
<p>The input message should be a JSON object with the following structure:</p>
|
||||
<p>Input message:</p>
|
||||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="nt">"targetFilePath"</span><span class="p">:</span><span class="w"> </span><span class="err">s</span><span class="kc">tr</span><span class="err">i</span><span class="kc">n</span><span class="err">g</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="nt">"responseFilePath"</span><span class="p">:</span><span class="w"> </span><span class="err">s</span><span class="kc">tr</span><span class="err">i</span><span class="kc">n</span><span class="err">g</span>
|
||||
<span class="w"> </span><span class="nt">"targetFilePath"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="nt">"pdf"</span><span class="p">:</span><span class="w"> </span><span class="s2">"absolute file path"</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="nt">"vlp_output"</span><span class="p">:</span><span class="w"> </span><span class="s2">"absolute file path"</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="nt">"responseFilePath"</span><span class="p">:</span><span class="w"> </span><span class="s2">"absolute file path"</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="nt">"operation"</span><span class="p">:</span><span class="w"> </span><span class="s2">"table_image_inference"</span>
|
||||
<span class="p">}</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The service downloads the file specified in <code class="docutils literal notranslate"><span class="pre">targetFilePath</span></code>. Supported data structures for the target file are:</p>
|
||||
<ul class="simple">
|
||||
<li><p>simplified text data (signifier key: <code class="docutils literal notranslate"><span class="pre">paragraphs</span></code>)</p></li>
|
||||
<li><p>structure object data (signifier key: <code class="docutils literal notranslate"><span class="pre">structureObjects</span></code>)</p></li>
|
||||
</ul>
|
||||
<p>As a response, the service uploads a JSON-structured file (as defined in <code class="docutils literal notranslate"><span class="pre">responseFilePath</span></code>) with the result under the
|
||||
<code class="docutils literal notranslate"><span class="pre">data</span></code> key. The structure of the response file is as follows:</p>
|
||||
<div class="highlight-javascript notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="s2">"targetFilePath"</span>
|
||||
<span class="o">:</span>
|
||||
<span class="w"> </span><span class="nx">string</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="s2">"responseFilePath"</span>
|
||||
<span class="o">:</span>
|
||||
<span class="w"> </span><span class="nx">string</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="c1">// and eventually further fields if present in the input message </span>
|
||||
<span class="w"> </span><span class="s2">"data"</span>
|
||||
<span class="o">:</span>
|
||||
<span class="w"> </span><span class="p">[</span>
|
||||
<p>Response is uploaded to the storage as specified in the <code class="docutils literal notranslate"><span class="pre">responseFilePath</span></code> field. The structure is as follows:</p>
|
||||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">...</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="nt">"data"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
|
||||
<span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">'pageNum'</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'bbox'</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">55.3407</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">247.0246</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">558.5602</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">598.0585</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="err">'uuid'</span><span class="p">:</span><span class="w"> </span><span class="err">'</span><span class="mi">2</span><span class="err">b</span><span class="mi">10</span><span class="err">c</span><span class="mi">1</span><span class="err">a</span><span class="mi">2-393</span><span class="err">c</span><span class="mi">-4</span><span class="kc">f</span><span class="err">ca</span><span class="mi">-</span><span class="err">b</span><span class="mf">9e3-0</span><span class="err">ad</span><span class="mi">5</span><span class="err">b</span><span class="mi">774</span><span class="err">ac</span><span class="mi">84</span><span class="err">'</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'label'</span><span class="p">:</span><span class="w"> </span><span class="err">'</span><span class="kc">ta</span><span class="err">ble'</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'</span><span class="kc">ta</span><span class="err">bleLi</span><span class="kc">nes</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
|
||||
<span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="s2">"keywords"</span><span class="o">:</span><span class="w"> </span><span class="nb">Array</span><span class="p">[</span><span class="nx">string</span><span class="p">],</span>
|
||||
<span class="w"> </span><span class="s2">"paragraphId"</span><span class="o">:</span><span class="w"> </span><span class="kr">int</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="s2">"embedding"</span><span class="o">:</span><span class="w"> </span><span class="nb">Array</span><span class="p">[</span><span class="kr">float</span><span class="p">]</span><span class="w"> </span><span class="c1">// 384 dimensions</span>
|
||||
<span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="p">]</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">1399</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="err">...</span>
|
||||
<span class="w"> </span><span class="p">],</span>
|
||||
<span class="w"> </span><span class="err">'imageI</span><span class="kc">nf</span><span class="err">o'</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">'heigh</span><span class="kc">t</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">693</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'wid</span><span class="kc">t</span><span class="err">h'</span><span class="p">:</span><span class="w"> </span><span class="mi">1414</span>
|
||||
<span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="err">...</span>
|
||||
<span class="w"> </span><span class="p">]</span>
|
||||
<span class="p">}</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p><strong>Note</strong> that</p>
|
||||
<ul class="simple">
|
||||
<li><p>the <code class="docutils literal notranslate"><span class="pre">embedding</span></code> key is optional and can be omitted. The service will not calculate the embedding if the environment
|
||||
variable <code class="docutils literal notranslate"><span class="pre">MODEL__COMPUTE_EMBEDDINGS</span></code> is set to <code class="docutils literal notranslate"><span class="pre">false</span></code>.</p></li>
|
||||
<li><p>the service also computes the keywords for the whole document. In this case, the <code class="docutils literal notranslate"><span class="pre">paragraphId</span></code> is set to <code class="docutils literal notranslate"><span class="pre">-1</span></code>.</p></li>
|
||||
</ul>
|
||||
</section>
|
||||
</section>
|
||||
<section id="service-configuration">
|
||||
<h2>Service Configuration<a class="headerlink" href="#service-configuration" title="Link to this heading">#</a></h2>
|
||||
<p>The service is configured via environment variables. The following variables are available:</p>
|
||||
<p>| Variable | Description | Default |
|
||||
| —————————————— | ———————————————————————————– | ——- |
|
||||
| LOGGING__LEVEL | Logging level | INFO |
|
||||
| MODEL__MAX_KEYWORDS_PER_PARAGRAPH | Maximum number of keywords per paragraph | 5 |
|
||||
| MODEL__MAX_KEYWORDS_PER_DOCUMENT | Maximum number of keywords per document, when set to 0, no keywords are extracted | 0 |
|
||||
| MODEL__COMPUTE_EMBEDDINGS | Whether to compute keyword embeddings or not | true |
|
||||
| MODEL__PREPROCESSING__MIN_PARAGRAPH_LENGTH | Minimum number of characters in a paragraph to be considered for keyword extraction | 1 |
|
||||
| MODEL__POSTPROCESSING__FILTER_SUBWORDS | Whether to filter out subwords from the keywords or not | true |</p>
|
||||
<p><strong>NOTE</strong> that these variables are subject to change. For the most recent configuration, refer to the service respective
|
||||
HELM chart.</p>
|
||||
</section>
|
||||
<section id="language">
|
||||
<h2>Language<a class="headerlink" href="#language" title="Link to this heading">#</a></h2>
|
||||
<p>Currently, there is an English, a German, and a multi-language model for keyword extraction. The models are uploaded to
|
||||
mlflow and can
|
||||
be set in the Dockerfile when building the container:</p>
|
||||
<p>example for german model:</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ENV</span> <span class="n">AZURE_RESOURCE_GROUP</span><span class="o">=</span><span class="s2">"mriedl"</span>
|
||||
<span class="n">ENV</span> <span class="n">AZURE_AML_WORKSPACE</span><span class="o">=</span><span class="s2">"azureml-ws"</span>
|
||||
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_NAME</span><span class="o">=</span><span class="s2">"keyword-extraction-de"</span>
|
||||
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_VERSION</span><span class="o">=</span><span class="s2">"1"</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>and example for english model:</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ENV</span> <span class="n">AZURE_RESOURCE_GROUP</span><span class="o">=</span><span class="s2">"mriedl"</span>
|
||||
<span class="n">ENV</span> <span class="n">AZURE_AML_WORKSPACE</span><span class="o">=</span><span class="s2">"azureml-ws"</span>
|
||||
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_NAME</span><span class="o">=</span><span class="s2">"keyword-extraction-de"</span>
|
||||
<span class="n">ENV</span> <span class="n">AZURE_AML_MODEL_VERSION</span><span class="o">=</span><span class="s2">"1"</span>
|
||||
<section id="installation">
|
||||
<h2>Installation<a class="headerlink" href="#installation" title="Link to this heading">#</a></h2>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>git<span class="w"> </span>clone<span class="w"> </span>ssh://git@git.iqser.com:2222/rr/cv-analysis.git
|
||||
<span class="nb">cd</span><span class="w"> </span>cv-analysis
|
||||
|
||||
python<span class="w"> </span>-m<span class="w"> </span>venv<span class="w"> </span>env
|
||||
<span class="nb">source</span><span class="w"> </span>env/bin/activate
|
||||
|
||||
pip<span class="w"> </span>install<span class="w"> </span>-e<span class="w"> </span>.
|
||||
pip<span class="w"> </span>install<span class="w"> </span>-r<span class="w"> </span>requirements.txt
|
||||
|
||||
dvc<span class="w"> </span>pull
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="usage">
|
||||
<h2>Usage<a class="headerlink" href="#usage" title="Link to this heading">#</a></h2>
|
||||
<p><strong>Two Options:</strong></p>
|
||||
<ol class="simple">
|
||||
<li><p>REST: Send text per request to endpoint, endpoint returns keywords</p></li>
|
||||
<li><p>Queue: Service gets text from queue, model calculates keywords, save keywords in queue</p></li>
|
||||
</ol>
|
||||
<p>To test the REST endpoint you have to set up an environment and do poetry install (
|
||||
see https://gitlab.knecon.com/knecon/research/template-python-project for details for setting up poetry)
|
||||
Then run</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">python</span> <span class="o">./</span><span class="n">src</span><span class="o">/</span><span class="n">serve</span><span class="o">.</span><span class="n">py</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>You don’t need to start a queue for that, just ignore the AMQP Error.
|
||||
Port and host are set in settings.toml .
|
||||
You can use the FastAPI under 127.0.0.1:8001/docs to send requests to the endpoint.</p>
|
||||
<p>You can also test the service with docker:</p>
|
||||
<section id="run-docker-commands">
|
||||
<h3>Run Docker Commands<a class="headerlink" href="#run-docker-commands" title="Link to this heading">#</a></h3>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>build<span class="w"> </span>-t<span class="w"> </span><span class="si">${</span><span class="nv">IMAGE_NAME</span><span class="si">}</span><span class="w"> </span>-f<span class="w"> </span>Dockerfile<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">GITLAB_USER</span><span class="o">=</span><span class="si">${</span><span class="nv">GITLAB_USER</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">GITLAB_ACCESS_TOKEN</span><span class="o">=</span><span class="si">${</span><span class="nv">GITLAB_ACCESS_TOKEN</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_TENANT_ID</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_TENANT_ID</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_SUBSCRIPTION_ID</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_SUBSCRIPTION_ID</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_CLIENT_ID</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_CLIENT_ID</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_CLIENT_SECRET</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_CLIENT_SECRET</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_AML_MODEL_VERSION</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_AML_MODEL_VERSION</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_AML_MODEL_NAME</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_AML_MODEL_NAME</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_RESOURCE_GROUP</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_RESOURCE_GROUP</span><span class="si">}</span><span class="w"> </span><span class="se">\</span>
|
||||
<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">AZURE_AML_WORKSPACE</span><span class="o">=</span><span class="si">${</span><span class="nv">AZURE_AML_WORKSPACE</span><span class="si">}</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>run<span class="w"> </span>--net<span class="o">=</span>host<span class="w"> </span>-it<span class="w"> </span>--rm<span class="w"> </span>--name<span class="w"> </span><span class="si">${</span><span class="nv">CONTAINER_NAME</span><span class="si">}</span><span class="w"> </span><span class="si">${</span><span class="nv">IMAGE_NAME</span><span class="si">}</span>
|
||||
<section id="as-an-api">
|
||||
<h3>As an API<a class="headerlink" href="#as-an-api" title="Link to this heading">#</a></h3>
|
||||
<p>The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task.</p>
|
||||
<section id="redaction-detection-api">
|
||||
<h4>Redaction Detection (API)<a class="headerlink" href="#redaction-detection-api" title="Link to this heading">#</a></h4>
|
||||
<p>The below snippet shows how to find the outlines of previous redactions.</p>
|
||||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">cv_analysis.redaction_detection</span> <span class="kn">import</span> <span class="n">find_redactions</span>
|
||||
<span class="kn">import</span> <span class="nn">pdf2image</span>
|
||||
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
|
||||
|
||||
<span class="n">pdf_path</span> <span class="o">=</span> <span class="o">...</span>
|
||||
<span class="n">page_index</span> <span class="o">=</span> <span class="o">...</span>
|
||||
|
||||
<span class="n">page</span> <span class="o">=</span> <span class="n">pdf2image</span><span class="o">.</span><span class="n">convert_from_path</span><span class="p">(</span><span class="n">pdf_path</span><span class="p">,</span> <span class="n">first_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">,</span> <span class="n">last_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
|
||||
<span class="n">page</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
|
||||
|
||||
<span class="n">redaction_contours</span> <span class="o">=</span> <span class="n">find_redactions</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="run-locally">
|
||||
<h3>Run locally<a class="headerlink" href="#run-locally" title="Link to this heading">#</a></h3>
|
||||
<p>First you need to download the model from mlflow. This can be done with the <em>“src/ml_flow/download_model.py”</em> script.
|
||||
This script downloads a model and copies the config and model data to the specific locations, so that the model can
|
||||
be loaded.</p>
|
||||
<p>For running/testing the keyword extraction locally you can use the <em>src/tests/test_process.py</em> script.</p>
|
||||
<p>The model is stored and loaded via DVC; you need the connection string under
|
||||
https://portal.azure.com/#@knecon.com/resource/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourceGroups/taas-rg/providers/Microsoft.Storage/storageAccounts/taassaracer/keys</p>
|
||||
</section>
|
||||
</section>
|
||||
</section>
|
||||
<section id="upload-models-to-ml-flow">
|
||||
<h1>Upload models to ML Flow<a class="headerlink" href="#upload-models-to-ml-flow" title="Link to this heading">#</a></h1>
|
||||
<p>To upload the models to mlflow, you can use the following script: src/mlflow/upload_model.py
|
||||
For authentication, the following environment variables need to be set:</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1">#AZURE_TENANT_ID=""</span>
|
||||
<span class="c1">#AZURE_SUBSCRIPTION_ID=""</span>
|
||||
<span class="c1">#AZURE_CLIENT_ID=""</span>
|
||||
<span class="c1">#AZURE_CLIENT_SECRET=""</span>
|
||||
<section id="as-a-cli-tool">
|
||||
<h2>As a CLI Tool<a class="headerlink" href="#as-a-cli-tool" title="Link to this heading">#</a></h2>
|
||||
<p>Core API functionalities can be used through a CLI.</p>
|
||||
<section id="table-parsing">
|
||||
<h3>Table Parsing<a class="headerlink" href="#table-parsing" title="Link to this heading">#</a></h3>
|
||||
<p>The tables parsing utility detects and segments tables into individual cells.</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>table
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>Additional settings (resource group, experiment name, etc.) can be specified in the config (
|
||||
<em>./src/mlflow/config/azure_config.toml</em>).
|
||||
The <em>upload_model.py</em> has the following parameters:</p>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">options</span><span class="p">:</span>
|
||||
<span class="o">-</span><span class="n">h</span><span class="p">,</span> <span class="o">--</span><span class="n">help</span> <span class="n">show</span> <span class="n">this</span> <span class="n">help</span> <span class="n">message</span> <span class="ow">and</span> <span class="n">exit</span>
|
||||
<span class="o">-</span><span class="n">a</span> <span class="n">AZURE_CONFIG</span><span class="p">,</span> <span class="o">--</span><span class="n">azure_config</span> <span class="n">AZURE_CONFIG</span>
|
||||
<span class="n">Location</span> <span class="n">of</span> <span class="n">the</span> <span class="n">configuration</span> <span class="n">file</span> <span class="k">for</span> <span class="n">Azure</span> <span class="p">(</span><span class="n">default</span><span class="p">:</span> <span class="n">src</span><span class="o">/</span><span class="n">mlflow</span><span class="o">/</span><span class="n">config</span><span class="o">/</span><span class="n">azure_config</span><span class="o">.</span><span class="n">toml</span><span class="p">)</span>
|
||||
<span class="o">-</span><span class="n">b</span> <span class="n">BASE_CONFIG</span><span class="p">,</span> <span class="o">--</span><span class="n">base_config</span> <span class="n">BASE_CONFIG</span>
|
||||
<span class="n">Location</span> <span class="n">of</span> <span class="n">the</span> <span class="n">basic</span> <span class="n">training</span> <span class="n">configuration</span> <span class="p">(</span><span class="n">default</span><span class="p">:</span> <span class="n">src</span><span class="o">/</span><span class="n">mlflow</span><span class="o">/</span><span class="n">config</span><span class="o">/</span><span class="n">settings_de</span><span class="o">.</span><span class="n">toml</span><span class="p">)</span>
|
||||
|
||||
<p>The below image shows a parsed table, where each table cell has been detected individually.</p>
|
||||
<p><img alt="Table Parsing Demonstration" src="_images/table_parsing.png" /></p>
|
||||
</section>
|
||||
<section id="redaction-detection-cli">
|
||||
<h3>Redaction Detection (CLI)<a class="headerlink" href="#redaction-detection-cli" title="Link to this heading">#</a></h3>
|
||||
<p>The redaction detection utility detects previous redactions in PDFs (filled black rectangles).</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">2</span><span class="w"> </span>--type<span class="w"> </span>redaction
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>the base config contains all information for the models used. Examples for German and
|
||||
English are placed in <em>/src/mlflow/config/</em></p>
|
||||
<p>Note: Multi-language model tracking does not work for now. After the upload script reports an error, you have to
|
||||
manually track the
|
||||
model <a class="reference external" href="https://ml.azure.com/experiments?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048">here</a>
|
||||
where you can find the run. Adhere to the naming conventions for the model name and versions,
|
||||
see <a class="reference external" href="https://ml.azure.com/model/list?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048">here</a></p>
|
||||
<p>The below image shows the detected redactions with green outlines.</p>
|
||||
<p><img alt="Redaction Detection Demonstration" src="_images/redaction_detection.png" /></p>
|
||||
</section>
|
||||
<section id="layout-parsing">
|
||||
<h3>Layout Parsing<a class="headerlink" href="#layout-parsing" title="Link to this heading">#</a></h3>
|
||||
<p>The layout parsing utility detects elements such as paragraphs, tables and figures.</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>layout
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The below image shows the detected layout elements on a page.</p>
|
||||
<p><img alt="Layout Parsing Demonstration" src="_images/layout_parsing.png" /></p>
|
||||
</section>
|
||||
<section id="figure-detection">
|
||||
<h3>Figure Detection<a class="headerlink" href="#figure-detection" title="Link to this heading">#</a></h3>
|
||||
<p>The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">3</span><span class="w"> </span>--type<span class="w"> </span>figure
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The below image shows the detected figure on a page.</p>
|
||||
<p><img alt="Figure Detection Demonstration" src="_images/figure_detection.png" /></p>
|
||||
</section>
|
||||
</section>
|
||||
<section id="running-as-a-service">
|
||||
<h2>Running as a service<a class="headerlink" href="#running-as-a-service" title="Link to this heading">#</a></h2>
|
||||
<section id="building">
|
||||
<h3>Building<a class="headerlink" href="#building" title="Link to this heading">#</a></h3>
|
||||
<p>Build base image</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>bash<span class="w"> </span>setup/docker.sh
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>Build head image</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>build<span class="w"> </span>-f<span class="w"> </span>Dockerfile<span class="w"> </span>-t<span class="w"> </span>cv-analysis<span class="w"> </span>.<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">BASE_ROOT</span><span class="o">=</span><span class="s2">""</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="usage-service">
|
||||
<h3>Usage (service)<a class="headerlink" href="#usage-service" title="Link to this heading">#</a></h3>
|
||||
<p>Shell 1</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>run<span class="w"> </span>--rm<span class="w"> </span>--net<span class="o">=</span>host<span class="w"> </span>--rm<span class="w"> </span>cv-analysis
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>Shell 2</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/client_mock.py<span class="w"> </span>--pdf_path<span class="w"> </span>/path/to/a/pdf
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
|
||||
@ -565,7 +535,7 @@ see <a class="reference external" href="https://ml.azure.com/model/list?wsid=/su
|
||||
<i class="fa-solid fa-angle-left"></i>
|
||||
<div class="prev-next-info">
|
||||
<p class="prev-next-subtitle">previous</p>
|
||||
<p class="prev-next-title">Welcome to Keyword Extraction Service documentation!</p>
|
||||
<p class="prev-next-title">Welcome to CV Analysis Service documentation!</p>
|
||||
</div>
|
||||
</a>
|
||||
<a class="right-next"
|
||||
@ -595,24 +565,28 @@ see <a class="reference external" href="https://ml.azure.com/model/list?wsid=/su
|
||||
</div>
|
||||
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
|
||||
<ul class="visible nav section-nav flex-column">
|
||||
<li class="toc-h1 nav-item toc-entry"><a class="reference internal nav-link" href="#">Keyword-Service</a><ul class="visible nav section-nav flex-column">
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#api">API</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#rest">REST</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#rabbitmq">RabbitMQ</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#service-configuration">Service Configuration</a></li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#language">Language</a></li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#api">API</a></li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#installation">Installation</a></li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#usage">Usage</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#run-docker-commands">Run Docker Commands</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#run-locally">Run locally</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#as-an-api">As an API</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-api">Redaction Detection (API)</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toc-h1 nav-item toc-entry"><a class="reference internal nav-link" href="#upload-models-to-ml-flow">Upload models to ML Flow</a></li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#as-a-cli-tool">As a CLI Tool</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#table-parsing">Table Parsing</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-cli">Redaction Detection (CLI)</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#layout-parsing">Layout Parsing</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#figure-detection">Figure Detection</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#running-as-a-service">Running as a service</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#building">Building</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#usage-service">Usage (service)</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
</nav></div>
|
||||
|
||||
<div class="sidebar-secondary-item">
|
||||
|
||||
BIN
docs/build/html/_images/figure_detection.png
vendored
Normal file
BIN
docs/build/html/_images/figure_detection.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 707 KiB |
BIN
docs/build/html/_images/layout_parsing.png
vendored
Normal file
BIN
docs/build/html/_images/layout_parsing.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 568 KiB |
BIN
docs/build/html/_images/redaction_detection.png
vendored
Normal file
BIN
docs/build/html/_images/redaction_detection.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.2 MiB |
BIN
docs/build/html/_images/table_parsing.png
vendored
Normal file
BIN
docs/build/html/_images/table_parsing.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 566 KiB |
285
docs/build/html/_sources/README.md.txt
vendored
285
docs/build/html/_sources/README.md.txt
vendored
@ -1,203 +1,178 @@
|
||||
# Keyword-Service
|
||||
# cv-analysis - Visual (CV-Based) Document Parsing
|
||||
|
||||
Service to get keywords of a paragraph or whole document.
|
||||
|
||||
<!-- TOC -->
|
||||
|
||||
- [Keyword-Service](#keyword-service)
|
||||
- [API](#api)
|
||||
- [REST](#rest)
|
||||
- [RabbitMQ](#rabbitmq)
|
||||
- [Service Configuration](#service-configuration)
|
||||
- [Language](#language)
|
||||
- [Usage](#usage)
|
||||
- [Run Docker Commands](#run-docker-commands)
|
||||
- [Run locally](#run-locally)
|
||||
- [Upload models to ML Flow](#upload-models-to-ml-flow)
|
||||
|
||||
<!-- TOC -->
|
||||
parse_pdf()
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.
|
||||
|
||||
## API
|
||||
|
||||
### REST
|
||||
|
||||
The service provides endpoints to extract keywords from a text and to embed a text. For details, download
|
||||
[OpenAPI schema](docs/openapi_redoc.html) and view it in a browser.
|
||||
|
||||
### RabbitMQ
|
||||
|
||||
The service listens to a queue and processes the messages. This method is meant to be used for extracting keywords from
|
||||
whole documents. All RabbitMQ parameters including the queue names are set in environment variables, refer to the
|
||||
service respective HELM chart for more information.
|
||||
|
||||
The input message should be a JSON object with the following structure:
|
||||
Input message:
|
||||
|
||||
```json
|
||||
{
|
||||
"targetFilePath": string,
|
||||
"responseFilePath": string
|
||||
"targetFilePath": {
|
||||
"pdf": "absolute file path",
|
||||
"vlp_output": "absolute file path"
|
||||
},
|
||||
"responseFilePath": "absolute file path",
|
||||
"operation": "table_image_inference"
|
||||
}
|
||||
```
|
||||
|
||||
The service downloads the file specified in `targetFilePath`. Supported data structures for the target file are:
|
||||
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
|
||||
|
||||
- simplified text data (signifier key: `paragraphs`)
|
||||
- structure object data (signifier key: `structureObjects`)
|
||||
|
||||
As a response, the service uploads a JSON-structured file (as defined in `responseFilePath`) with the result under the
|
||||
`data` key. The structure of the response file is as follows:
|
||||
|
||||
```javascript
|
||||
```json
|
||||
{
|
||||
"targetFilePath"
|
||||
:
|
||||
string,
|
||||
"responseFilePath"
|
||||
:
|
||||
string,
|
||||
// and eventually further fields if present in the input message
|
||||
"data"
|
||||
:
|
||||
[
|
||||
...,
|
||||
"data": [
|
||||
{
|
||||
'pageNum': 0,
|
||||
'bbox': {
|
||||
'x1': 55.3407,
|
||||
'y1': 247.0246,
|
||||
'x2': 558.5602,
|
||||
'y2': 598.0585
|
||||
},
|
||||
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
|
||||
'label': 'table',
|
||||
'tableLines': [
|
||||
{
|
||||
"keywords": Array[string],
|
||||
"paragraphId": int,
|
||||
"embedding": Array[float] // 384 dimensions
|
||||
}
|
||||
]
|
||||
'x1': 0,
|
||||
'y1': 16,
|
||||
'x2': 1399,
|
||||
'y2': 16
|
||||
},
|
||||
...
|
||||
],
|
||||
'imageInfo': {
|
||||
'height': 693,
|
||||
'width': 1414
|
||||
}
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Note** that
|
||||
|
||||
- the `embedding` key is optional and can be omitted. The service will not calculate the embedding if the environment
|
||||
variable `MODEL__COMPUTE_EMBEDDINGS` is set to `false`.
|
||||
- the service also computes the keywords for the whole document. In this case, the `paragraphId` is set to `-1`.
|
||||
|
||||
## Service Configuration
|
||||
|
||||
The service is configured via environment variables. The following variables are available:
|
||||
|
||||
| Variable | Description | Default |
|
||||
| ------------------------------------------ | ----------------------------------------------------------------------------------- | ------- |
|
||||
| LOGGING__LEVEL | Logging level | INFO |
|
||||
| MODEL__MAX_KEYWORDS_PER_PARAGRAPH | Maximum number of keywords per paragraph | 5 |
|
||||
| MODEL__MAX_KEYWORDS_PER_DOCUMENT | Maximum number of keywords per document, when set to 0, no keywords are extracted | 0 |
|
||||
| MODEL__COMPUTE_EMBEDDINGS | Whether to compute keyword embeddings or not | true |
|
||||
| MODEL__PREPROCESSING__MIN_PARAGRAPH_LENGTH | Minimum number of characters in a paragraph to be considered for keyword extraction | 1 |
|
||||
| MODEL__POSTPROCESSING__FILTER_SUBWORDS | Whether to filter out subwords from the keywords or not | true |
|
||||
|
||||
**NOTE** that these variables are subject to change. For the most recent configuration, refer to the service respective
|
||||
HELM chart.
|
||||
|
||||
## Language
|
||||
|
||||
Currently, there is an english, a german and a multi-language model for keyword extraction. The models are uploaded to
|
||||
mlflow and can
|
||||
be set in the Dockerfile when building the container:
|
||||
|
||||
example for german model:
|
||||
|
||||
```
|
||||
ENV AZURE_RESOURCE_GROUP="mriedl"
|
||||
ENV AZURE_AML_WORKSPACE="azureml-ws"
|
||||
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
|
||||
ENV AZURE_AML_MODEL_VERSION="1"
|
||||
```
|
||||
|
||||
and example for english model:
|
||||
## Installation
|
||||
|
||||
```
|
||||
ENV AZURE_RESOURCE_GROUP="mriedl"
|
||||
ENV AZURE_AML_WORKSPACE="azureml-ws"
|
||||
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
|
||||
ENV AZURE_AML_MODEL_VERSION="1"
|
||||
```bash
|
||||
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
|
||||
cd cv-analysis
|
||||
|
||||
python -m venv env
|
||||
source env/bin/activate
|
||||
|
||||
pip install -e .
|
||||
pip install -r requirements.txt
|
||||
|
||||
dvc pull
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
**Two Options:**
|
||||
### As an API
|
||||
|
||||
1. REST: Send text per request to endpoint, endpoint returns keywords
|
||||
2. Queue: Service gets text from queue, model calculates keywords, save keywords in queue
|
||||
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task.
|
||||
|
||||
To test the REST endpoint you have to set up an environment and do poetry install (
|
||||
see https://gitlab.knecon.com/knecon/research/template-python-project for details for setting up poetry)
|
||||
Then run
|
||||
#### Redaction Detection (API)
|
||||
|
||||
```
|
||||
python ./src/serve.py
|
||||
The below snippet shows how to find the outlines of previous redactions.
|
||||
|
||||
```python
|
||||
from cv_analysis.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
|
||||
page = np.array(page)
|
||||
|
||||
redaction_contours = find_redactions(page)
|
||||
```
|
||||
|
||||
You don't need to start a queue for that, just ignore the AMQP Error.
|
||||
Port and host are set in settings.toml .
|
||||
You can use the FastAPI under 127.0.0.1:8001/docs to send request to endpoint.
|
||||
## As a CLI Tool
|
||||
|
||||
You can also test the service with docker:
|
||||
Core API functionalities can be used through a CLI.
|
||||
|
||||
#### Run Docker Commands
|
||||
### Table Parsing
|
||||
|
||||
The tables parsing utility detects and segments tables into individual cells.
|
||||
|
||||
```bash
|
||||
docker build -t ${IMAGE_NAME} -f Dockerfile --build-arg GITLAB_USER=${GITLAB_USER} \
|
||||
--build-arg GITLAB_ACCESS_TOKEN=${GITLAB_ACCESS_TOKEN} \
|
||||
--build-arg AZURE_TENANT_ID=${AZURE_TENANT_ID} \
|
||||
--build-arg AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID} \
|
||||
--build-arg AZURE_CLIENT_ID=${AZURE_CLIENT_ID} \
|
||||
--build-arg AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET} \
|
||||
--build-arg AZURE_AML_MODEL_VERSION=${AZURE_AML_MODEL_VERSION} \
|
||||
--build-arg AZURE_AML_MODEL_NAME=${AZURE_AML_MODEL_NAME} \
|
||||
--build-arg AZURE_RESOURCE_GROUP=${AZURE_RESOURCE_GROUP} \
|
||||
--build-arg AZURE_AML_WORKSPACE=${AZURE_AML_WORKSPACE}
|
||||
python scripts/annotate.py data/test_pdf.pdf 7 --type table
|
||||
```
|
||||
|
||||
The below image shows a parsed table, where each table cell has been detected individually.
|
||||
|
||||

|
||||
|
||||
### Redaction Detection (CLI)
|
||||
|
||||
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
|
||||
|
||||
```bash
|
||||
docker run --net=host -it --rm --name ${CONTAINER_NAME} ${IMAGE_NAME}
|
||||
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
|
||||
```
|
||||
|
||||
#### Run locally
|
||||
The below image shows the detected redactions with green outlines.
|
||||
|
||||
First you need to download the model from mlflow. This can be done with the *"src/ml_flow/download_model.py"* script.
|
||||
This script downloads a model and copies config and model data to the specific locations, such that the model can
|
||||
be loaded.
|
||||

|
||||
|
||||
For running/testing the keyword extraction locally you can use the *src/tests/test_process.py* script.
|
||||
### Layout Parsing
|
||||
|
||||
Model is stored and loaded via DVC, you need the connection string under
|
||||
https://portal.azure.com/#@knecon.com/resource/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourceGroups/taas-rg/providers/Microsoft.Storage/storageAccounts/taassaracer/keys
|
||||
The layout parsing utility detects elements such as paragraphs, tables and figures.
|
||||
|
||||
# Upload models to ML Flow
|
||||
|
||||
To upload the models to mlflow, you can use the following script: src/mlflow/upload_model.py
|
||||
For authentication, the following environment variables need to be set:
|
||||
|
||||
```
|
||||
#AZURE_TENANT_ID=""
|
||||
#AZURE_SUBSCRIPTION_ID=""
|
||||
#AZURE_CLIENT_ID=""
|
||||
#AZURE_CLIENT_SECRET=""
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
|
||||
```
|
||||
|
||||
Additional settings (resource group, experiment name, etc.) can be specified in the config (
|
||||
*./src/mlflow/config/azure_config.toml*).
|
||||
The *upload_model.py* has the following parameters:
|
||||
The below image shows the detected layout elements on a page.
|
||||
|
||||
```
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
-a AZURE_CONFIG, --azure_config AZURE_CONFIG
|
||||
Location of the configuration file for Azure (default: src/mlflow/config/azure_config.toml)
|
||||
-b BASE_CONFIG, --base_config BASE_CONFIG
|
||||
Location of the basic training configuration (default: src/mlflow/config/settings_de.toml)
|
||||
|
||||

|
||||
|
||||
### Figure Detection
|
||||
|
||||
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
|
||||
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
|
||||
```
|
||||
|
||||
the base config contains all information for the models used. Examples for German and
|
||||
English are placed in */src/mlflow/config/*
|
||||
The below image shows the detected figure on a page.
|
||||
|
||||
Note: Multi-language model tracking does not work for now. After the upload script reports an error, you have to
|
||||
manually track the
|
||||
model [here](https://ml.azure.com/experiments?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
|
||||
where you can find the run. Adhere to the naming conventions for the model name and versions,
|
||||
see [here](https://ml.azure.com/model/list?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
|
||||

|
||||
|
||||
## Running as a service
|
||||
|
||||
### Building
|
||||
|
||||
Build base image
|
||||
|
||||
```bash
|
||||
bash setup/docker.sh
|
||||
```
|
||||
|
||||
Build head image
|
||||
|
||||
```bash
|
||||
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
|
||||
```
|
||||
|
||||
### Usage (service)
|
||||
|
||||
Shell 1
|
||||
|
||||
```bash
|
||||
docker run --rm --net=host --rm cv-analysis
|
||||
```
|
||||
|
||||
Shell 2
|
||||
|
||||
```bash
|
||||
python scripts/client_mock.py --pdf_path /path/to/a/pdf
|
||||
```
|
||||
|
||||
6
docs/build/html/_sources/index.rst.txt
vendored
6
docs/build/html/_sources/index.rst.txt
vendored
@ -3,9 +3,9 @@
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
====================================================
|
||||
Welcome to Keyword Extraction Service documentation!
|
||||
====================================================
|
||||
=============================================
|
||||
Welcome to CV Analysis Service documentation!
|
||||
=============================================
|
||||
|
||||
.. note::
|
||||
|
||||
|
||||
4
docs/build/html/genindex.html
vendored
4
docs/build/html/genindex.html
vendored
@ -154,7 +154,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -254,7 +254,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
41
docs/build/html/index.html
vendored
41
docs/build/html/index.html
vendored
@ -8,7 +8,7 @@
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
|
||||
<title>Welcome to Keyword Extraction Service documentation! — CV Analysis Service 2.5.1 documentation</title>
|
||||
<title>Welcome to CV Analysis Service documentation! — CV Analysis Service 2.5.1 documentation</title>
|
||||
|
||||
|
||||
|
||||
@ -43,7 +43,7 @@
|
||||
<script async="async" src="https://assets.readthedocs.org/static/javascript/readthedocs-doc-embed.js"></script>
|
||||
<link rel="index" title="Index" href="genindex.html" />
|
||||
<link rel="search" title="Search" href="search.html" />
|
||||
<link rel="next" title="Keyword-Service" href="README.html" />
|
||||
<link rel="next" title="cv-analysis - Visual (CV-Based) Document Parsing" href="README.html" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
|
||||
@ -156,7 +156,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -260,7 +260,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -326,8 +326,8 @@ document.write(`
|
||||
<div id="searchbox"></div>
|
||||
<article class="bd-article">
|
||||
|
||||
<section id="welcome-to-keyword-extraction-service-documentation">
|
||||
<h1>Welcome to Keyword Extraction Service documentation!<a class="headerlink" href="#welcome-to-keyword-extraction-service-documentation" title="Link to this heading">#</a></h1>
|
||||
<section id="welcome-to-cv-analysis-service-documentation">
|
||||
<h1>Welcome to CV Analysis Service documentation!<a class="headerlink" href="#welcome-to-cv-analysis-service-documentation" title="Link to this heading">#</a></h1>
|
||||
<div class="admonition note">
|
||||
<p class="admonition-title">Note</p>
|
||||
<p>If you’d like to change the looks of things 👉 <a class="reference external" href="https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html">https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html</a></p>
|
||||
@ -337,22 +337,27 @@ document.write(`
|
||||
<div class="toctree-wrapper compound">
|
||||
<p aria-level="2" class="caption" role="heading"><span class="caption-text">README</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="README.html">Keyword-Service</a><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#api">API</a><ul>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#rest">REST</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#rabbitmq">RabbitMQ</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#service-configuration">Service Configuration</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#language">Language</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="README.html">cv-analysis - Visual (CV-Based) Document Parsing</a><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#api">API</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#installation">Installation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#usage">Usage</a><ul>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#run-docker-commands">Run Docker Commands</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#run-locally">Run locally</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#as-an-api">As an API</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#as-a-cli-tool">As a CLI Tool</a><ul>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#table-parsing">Table Parsing</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#redaction-detection-cli">Redaction Detection (CLI)</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#layout-parsing">Layout Parsing</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#figure-detection">Figure Detection</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="README.html#running-as-a-service">Running as a service</a><ul>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#building">Building</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="README.html#usage-service">Usage (service)</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="README.html#upload-models-to-ml-flow">Upload models to ML Flow</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="toctree-wrapper compound">
|
||||
@ -405,7 +410,7 @@ document.write(`
|
||||
title="next page">
|
||||
<div class="prev-next-info">
|
||||
<p class="prev-next-subtitle">next</p>
|
||||
<p class="prev-next-title">Keyword-Service</p>
|
||||
<p class="prev-next-title">cv-analysis - Visual (CV-Based) Document Parsing</p>
|
||||
</div>
|
||||
<i class="fa-solid fa-angle-right"></i>
|
||||
</a>
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
8
docs/build/html/modules/cv_analysis.html
vendored
8
docs/build/html/modules/cv_analysis.html
vendored
@ -44,7 +44,7 @@
|
||||
<link rel="index" title="Index" href="../genindex.html" />
|
||||
<link rel="search" title="Search" href="../search.html" />
|
||||
<link rel="next" title="cv_analysis.figure_detection package" href="cv_analysis.figure_detection.html" />
|
||||
<link rel="prev" title="Keyword-Service" href="../README.html" />
|
||||
<link rel="prev" title="cv-analysis - Visual (CV-Based) Document Parsing" href="../README.html" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -576,7 +576,7 @@ document.write(`
|
||||
<i class="fa-solid fa-angle-left"></i>
|
||||
<div class="prev-next-info">
|
||||
<p class="prev-next-subtitle">previous</p>
|
||||
<p class="prev-next-title">Keyword-Service</p>
|
||||
<p class="prev-next-title">cv-analysis - Visual (CV-Based) Document Parsing</p>
|
||||
</div>
|
||||
</a>
|
||||
<a class="right-next"
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -259,7 +259,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
4
docs/build/html/modules/serve.html
vendored
4
docs/build/html/modules/serve.html
vendored
@ -156,7 +156,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -258,7 +258,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="../README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
BIN
docs/build/html/objects.inv
vendored
BIN
docs/build/html/objects.inv
vendored
Binary file not shown.
4
docs/build/html/py-modindex.html
vendored
4
docs/build/html/py-modindex.html
vendored
@ -157,7 +157,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -257,7 +257,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
4
docs/build/html/search.html
vendored
4
docs/build/html/search.html
vendored
@ -156,7 +156,7 @@ READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerH
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
@ -256,7 +256,7 @@ document.write(`
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="README.html">
|
||||
Keyword-Service
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
2
docs/build/html/searchindex.js
vendored
2
docs/build/html/searchindex.js
vendored
File diff suppressed because one or more lines are too long
6
docs/build/json/README.fjson
vendored
6
docs/build/json/README.fjson
vendored
File diff suppressed because one or more lines are too long
6
docs/build/json/index.fjson
vendored
6
docs/build/json/index.fjson
vendored
File diff suppressed because one or more lines are too long
@ -1,203 +1,178 @@
|
||||
# Keyword-Service
|
||||
# cv-analysis - Visual (CV-Based) Document Parsing
|
||||
|
||||
Service to get keywords of a paragraph or whole document.
|
||||
|
||||
<!-- TOC -->
|
||||
|
||||
- [Keyword-Service](#keyword-service)
|
||||
- [API](#api)
|
||||
- [REST](#rest)
|
||||
- [RabbitMQ](#rabbitmq)
|
||||
- [Service Configuration](#service-configuration)
|
||||
- [Language](#language)
|
||||
- [Usage](#usage)
|
||||
- [Run Docker Commands](#run-docker-commands)
|
||||
- [Run locally](#run-locally)
|
||||
- [Upload models to ML Flow](#upload-models-to-ml-flow)
|
||||
|
||||
<!-- TOC -->
|
||||
parse_pdf()
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.
|
||||
|
||||
## API
|
||||
|
||||
### REST
|
||||
|
||||
The service provides endpoints to extract keywords from a text and to embed a text. For details, download
|
||||
[OpenAPI schema](docs/openapi_redoc.html) and view it in a browser.
|
||||
|
||||
### RabbitMQ
|
||||
|
||||
The service listens to a queue and processes the messages. This method is meant to be used for extracting keywords from
|
||||
whole documents. All RabbitMQ parameters including the queue names are set in environment variables, refer to the
|
||||
service respective HELM chart for more information.
|
||||
|
||||
The input message should be a JSON object with the following structure:
|
||||
Input message:
|
||||
|
||||
```json
|
||||
{
|
||||
"targetFilePath": string,
|
||||
"responseFilePath": string
|
||||
"targetFilePath": {
|
||||
"pdf": "absolute file path",
|
||||
"vlp_output": "absolute file path"
|
||||
},
|
||||
"responseFilePath": "absolute file path",
|
||||
"operation": "table_image_inference"
|
||||
}
|
||||
```
|
||||
|
||||
The service downloads the file specified in `targetFilePath`. Supported data structures for the target file are:
|
||||
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
|
||||
|
||||
- simplified text data (signifier key: `paragraphs`)
|
||||
- structure object data (signifier key: `structureObjects`)
|
||||
|
||||
As a response, the service uploads a JSON-structured file (as defined in `responseFilePath`) with the result under the
|
||||
`data` key. The structure of the response file is as follows:
|
||||
|
||||
```javascript
|
||||
```json
|
||||
{
|
||||
"targetFilePath"
|
||||
:
|
||||
string,
|
||||
"responseFilePath"
|
||||
:
|
||||
string,
|
||||
// and eventually further fields if present in the input message
|
||||
"data"
|
||||
:
|
||||
[
|
||||
...,
|
||||
"data": [
|
||||
{
|
||||
'pageNum': 0,
|
||||
'bbox': {
|
||||
'x1': 55.3407,
|
||||
'y1': 247.0246,
|
||||
'x2': 558.5602,
|
||||
'y2': 598.0585
|
||||
},
|
||||
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
|
||||
'label': 'table',
|
||||
'tableLines': [
|
||||
{
|
||||
"keywords": Array[string],
|
||||
"paragraphId": int,
|
||||
"embedding": Array[float] // 384 dimensions
|
||||
}
|
||||
]
|
||||
'x1': 0,
|
||||
'y1': 16,
|
||||
'x2': 1399,
|
||||
'y2': 16
|
||||
},
|
||||
...
|
||||
],
|
||||
'imageInfo': {
|
||||
'height': 693,
|
||||
'width': 1414
|
||||
}
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
**Note** that
|
||||
|
||||
- the `embedding` key is optional and can be omitted. The service will not calculate the embedding if the environment
|
||||
variable `MODEL__COMPUTE_EMBEDDINGS` is set to `false`.
|
||||
- the service also computes the keywords for the whole document. In this case, the `paragraphId` is set to `-1`.
|
||||
|
||||
## Service Configuration
|
||||
|
||||
The service is configured via environment variables. The following variables are available:
|
||||
|
||||
| Variable | Description | Default |
|
||||
| ------------------------------------------ | ----------------------------------------------------------------------------------- | ------- |
|
||||
| LOGGING__LEVEL | Logging level | INFO |
|
||||
| MODEL__MAX_KEYWORDS_PER_PARAGRAPH | Maximum number of keywords per paragraph | 5 |
|
||||
| MODEL__MAX_KEYWORDS_PER_DOCUMENT | Maximum number of keywords per document, when set to 0, no keywords are extracted | 0 |
|
||||
| MODEL__COMPUTE_EMBEDDINGS | Whether to compute keyword embeddings or not | true |
|
||||
| MODEL__PREPROCESSING__MIN_PARAGRAPH_LENGTH | Minimum number of characters in a paragraph to be considered for keyword extraction | 1 |
|
||||
| MODEL__POSTPROCESSING__FILTER_SUBWORDS | Whether to filter out subwords from the keywords or not | true |
|
||||
|
||||
**NOTE** that these variables are subject to change. For the most recent configuration, refer to the service respective
|
||||
HELM chart.
|
||||
|
||||
## Language
|
||||
|
||||
Currently, there is an english, a german and a multi-language model for keyword extraction. The models are uploaded to
|
||||
mlflow and can
|
||||
be set in the Dockerfile when building the container:
|
||||
|
||||
example for german model:
|
||||
|
||||
```
|
||||
ENV AZURE_RESOURCE_GROUP="mriedl"
|
||||
ENV AZURE_AML_WORKSPACE="azureml-ws"
|
||||
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
|
||||
ENV AZURE_AML_MODEL_VERSION="1"
|
||||
```
|
||||
|
||||
and example for english model:
|
||||
## Installation
|
||||
|
||||
```
|
||||
ENV AZURE_RESOURCE_GROUP="mriedl"
|
||||
ENV AZURE_AML_WORKSPACE="azureml-ws"
|
||||
ENV AZURE_AML_MODEL_NAME="keyword-extraction-de"
|
||||
ENV AZURE_AML_MODEL_VERSION="1"
|
||||
```bash
|
||||
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
|
||||
cd cv-analysis
|
||||
|
||||
python -m venv env
|
||||
source env/bin/activate
|
||||
|
||||
pip install -e .
|
||||
pip install -r requirements.txt
|
||||
|
||||
dvc pull
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
**Two Options:**
|
||||
### As an API
|
||||
|
||||
1. REST: Send text per request to endpoint, endpoint returns keywords
|
||||
2. Queue: Service gets text from queue, model calculates keywords, save keywords in queue
|
||||
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task.
|
||||
|
||||
To test the REST endpoint you have to set up an environment and do poetry install (
|
||||
see https://gitlab.knecon.com/knecon/research/template-python-project for details for setting up poetry)
|
||||
Then run
|
||||
#### Redaction Detection (API)
|
||||
|
||||
```
|
||||
python ./src/serve.py
|
||||
The below snippet shows how to find the outlines of previous redactions.
|
||||
|
||||
```python
|
||||
from cv_analysis.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
|
||||
page = np.array(page)
|
||||
|
||||
redaction_contours = find_redactions(page)
|
||||
```
|
||||
|
||||
You don't need to start a queue for that, just ignore the AMQP Error.
|
||||
Port and host are set in settings.toml .
|
||||
You can use the FastAPI under 127.0.0.1:8001/docs to send requests to the endpoint.
|
||||
## As a CLI Tool
|
||||
|
||||
You can also test the service with docker:
|
||||
Core API functionalities can be used through a CLI.
|
||||
|
||||
#### Run Docker Commands
|
||||
### Table Parsing
|
||||
|
||||
The table parsing utility detects and segments tables into individual cells.
|
||||
|
||||
```bash
|
||||
docker build -t ${IMAGE_NAME} -f Dockerfile --build-arg GITLAB_USER=${GITLAB_USER} \
|
||||
--build-arg GITLAB_ACCESS_TOKEN=${GITLAB_ACCESS_TOKEN} \
|
||||
--build-arg AZURE_TENANT_ID=${AZURE_TENANT_ID} \
|
||||
--build-arg AZURE_SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID} \
|
||||
--build-arg AZURE_CLIENT_ID=${AZURE_CLIENT_ID} \
|
||||
--build-arg AZURE_CLIENT_SECRET=${AZURE_CLIENT_SECRET} \
|
||||
--build-arg AZURE_AML_MODEL_VERSION=${AZURE_AML_MODEL_VERSION} \
|
||||
--build-arg AZURE_AML_MODEL_NAME=${AZURE_AML_MODEL_NAME} \
|
||||
--build-arg AZURE_RESOURCE_GROUP=${AZURE_RESOURCE_GROUP} \
|
||||
--build-arg AZURE_AML_WORKSPACE=${AZURE_AML_WORKSPACE}
|
||||
python scripts/annotate.py data/test_pdf.pdf 7 --type table
|
||||
```
|
||||
|
||||
The below image shows a parsed table, where each table cell has been detected individually.
|
||||
|
||||

|
||||
|
||||
### Redaction Detection (CLI)
|
||||
|
||||
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
|
||||
|
||||
```bash
|
||||
docker run --net=host -it --rm --name ${CONTAINER_NAME} ${IMAGE_NAME}
|
||||
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
|
||||
```
|
||||
|
||||
#### Run locally
|
||||
The below image shows the detected redactions with green outlines.
|
||||
|
||||
First you need to download the model from mlflow. This can be done with the *"src/ml_flow/download_model.py"* script.
|
||||
This script downloads a model and copies config and model data to the specific locations, such that the model can
|
||||
be loaded.
|
||||

|
||||
|
||||
For running/testing the keyword extraction locally you can use the *src/tests/test_process.py* script.
|
||||
### Layout Parsing
|
||||
|
||||
The model is stored and loaded via DVC; you need the connection string under
|
||||
https://portal.azure.com/#@knecon.com/resource/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourceGroups/taas-rg/providers/Microsoft.Storage/storageAccounts/taassaracer/keys
|
||||
The layout parsing utility detects elements such as paragraphs, tables and figures.
|
||||
|
||||
# Upload models to ML Flow
|
||||
|
||||
To upload the models to mlflow, you can use the following script: src/mlflow/upload_model.py
|
||||
For authentication following environment variables need to be set:
|
||||
|
||||
```
|
||||
#AZURE_TENANT_ID=""
|
||||
#AZURE_SUBSCRIPTION_ID=""
|
||||
#AZURE_CLIENT_ID=""
|
||||
#AZURE_CLIENT_SECRET=""
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
|
||||
```
|
||||
|
||||
Additional settings (resource group, experiment name, etc.) can be specified in the config (
|
||||
*./src/mlflow/config/azure_config.toml*).
|
||||
The *upload_model.py* has the following parameters:
|
||||
The below image shows the detected layout elements on a page.
|
||||
|
||||
```
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
-a AZURE_CONFIG, --azure_config AZURE_CONFIG
|
||||
Location of the configuration file for Azure (default: src/mlflow/config/azure_config.toml)
|
||||
-b BASE_CONFIG, --base_config BASE_CONFIG
|
||||
Location of the basic training configuration (default: src/mlflow/config/settings_de.toml)
|
||||
|
||||

|
||||
|
||||
### Figure Detection
|
||||
|
||||
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
|
||||
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
|
||||
```
|
||||
|
||||
the base config contains all information for the models used. Examples for German and
|
||||
English are placed in */src/mlflow/config/*
|
||||
The below image shows the detected figure on a page.
|
||||
|
||||
Note: Multi-language model tracking does not work for now. After the upload script reports an error, you have to
|
||||
manually track the
|
||||
model [here](https://ml.azure.com/experiments?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
|
||||
where you can find the run. Adhere to the naming conventions for the model name and versions,
|
||||
see [here](https://ml.azure.com/model/list?wsid=/subscriptions/4b9531fc-c5e4-4b11-8492-0cc173c1f97d/resourcegroups/fforesight-rg/providers/Microsoft.MachineLearningServices/workspaces/ff-aml-main&tid=b44be368-e4f2-4ade-a089-cd2825458048)
|
||||

|
||||
|
||||
## Running as a service
|
||||
|
||||
### Building
|
||||
|
||||
Build base image
|
||||
|
||||
```bash
|
||||
bash setup/docker.sh
|
||||
```
|
||||
|
||||
Build head image
|
||||
|
||||
```bash
|
||||
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
|
||||
```
|
||||
|
||||
### Usage (service)
|
||||
|
||||
Shell 1
|
||||
|
||||
```bash
|
||||
docker run --rm --net=host --rm cv-analysis
|
||||
```
|
||||
|
||||
Shell 2
|
||||
|
||||
```bash
|
||||
python scripts/client_mock.py --pdf_path /path/to/a/pdf
|
||||
```
|
||||
|
||||
5
docs/source/data/.gitignore
vendored
Normal file
5
docs/source/data/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
/test_pdf.pdf
|
||||
/figure_detection.png
|
||||
/layout_parsing.png
|
||||
/redaction_detection.png
|
||||
/table_parsing.png
|
||||
5
docs/source/data/data/.gitignore
vendored
Normal file
5
docs/source/data/data/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
/test_pdf.pdf
|
||||
/figure_detection.png
|
||||
/layout_parsing.png
|
||||
/redaction_detection.png
|
||||
/table_parsing.png
|
||||
@ -3,9 +3,9 @@
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
====================================================
|
||||
Welcome to Keyword Extraction Service documentation!
|
||||
====================================================
|
||||
=============================================
|
||||
Welcome to CV Analysis Service documentation!
|
||||
=============================================
|
||||
|
||||
.. note::
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user