Compare commits

..

4 Commits

Author SHA1 Message Date
llocarnini
aa3d90a2dc merge master 2022-09-20 17:26:56 +02:00
llocarnini
f4cdc13dcf Merge branch 'master' of ssh://git.iqser.com:2222/rr/cv-analysis into optimize_layout_detection 2022-09-20 17:25:43 +02:00
llocarnini
ba33417166 partial clean up 2022-09-20 17:25:03 +02:00
llocarnini
5a7b756fc1 added two files with two not completed version for labeling layout rects. WIP 2022-09-20 08:28:18 +02:00
302 changed files with 1138 additions and 160994 deletions

View File

@ -10,7 +10,7 @@ omit =
*/build_venv/*
*/incl/*
source =
cv_analysis
cv_analysis
relative_files = True
data_file = .coverage
@ -46,4 +46,4 @@ ignore_errors = True
directory = reports
[xml]
output = reports/coverage.xml
output = reports/coverage.xml

View File

@ -97,4 +97,4 @@ target/
*.swp
*/*.swp
*/*/*.swp
*/*/*/*.swp
*/*/*/*.swp

View File

@ -1,10 +1,7 @@
[core]
remote = azure_remote
remote = vector
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
port = 22
['remote "azure_remote"']
url = azure://cv-sa-dvc/
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=KOuTAQ6Mp00ePTT5ObYmgaHlxwS1qukY4QU4Kuk7gy/vldneA+ZiKjaOpEFtqKA6Mtym2gQz8THy+ASts/Y1Bw==;EndpointSuffix=core.windows.net"
['remote "local"']
url = ../dvc_local_remote

77
.gitignore vendored
View File

@ -1,52 +1,27 @@
# Environments
.env
.venv
env/
venv/
.pytest*
.python-version
.DS_Store
# Project folders
scratch/
*.vscode/
.idea
*_app
*pytest_cache
*joblib
*tmp
*profiling
*logs
*docker
*drivers
*bamboo-specs/target
# Python specific files
__pycache__/
*.py[cod]
*.ipynb
*.ipynb_checkpoints
# file extensions
*.log
*.csv
*.json
*.pkl
*.profile
*.cbm
# temp files
*.swp
*~
*.un~
# keep files
!notebooks/*.ipynb
# keep folders
!secrets
!data/*
!drivers
# unignore files
!bom.*
*.egg-info/
deskew_model/
build_venv/
/pdfs/
/results/
/pdfs/
/env/
/.idea/
/.idea/.gitignore
/.idea/misc.xml
/.idea/inspectionProfiles/profiles_settings.xml
/.idea/table_parsing.iml
/.idea/vcs.xml
/results/
/table_parsing.egg-info
/target/
/tests/
/cv_analysis.egg-info/dependency_links.txt
/cv_analysis.egg-info/PKG-INFO
/cv_analysis.egg-info/SOURCES.txt
/cv_analysis.egg-info/top_level.txt
/.vscode/
/cv_analysis/test/test_data/example_pages.json
/data/metadata_testing_files.csv
.coverage
/data/

View File

@ -1,30 +0,0 @@
include:
- project: "Gitlab/gitlab"
ref: 0.3.0
file: "/ci-templates/research/dvc-versioning-build-release.gitlab-ci.yml"
variables:
NEXUS_PROJECT_DIR: red
IMAGENAME: "${CI_PROJECT_NAME}"
#################################
# temp. disable integration tests, b/c they don't cover the CV analysis case yet
trigger integration tests:
rules:
- when: never
release build:
stage: release
needs:
- job: set custom version
artifacts: true
optional: true
- job: calculate patch version
artifacts: true
optional: true
- job: calculate minor version
artifacts: true
optional: true
- job: build docker nexus
artifacts: true
#################################

View File

@ -1,35 +0,0 @@
# CI for services, check gitlab repo for python package CI
include:
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
- project: "Gitlab/gitlab"
ref: main
file: "/ci-templates/research/docs.gitlab-ci.yml"
# set project variables here
variables:
NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
pages:
only:
- master # KEEP THIS, necessary because `master` branch and not `main` branch
###################
# INTEGRATION TESTS
trigger-integration-tests:
extends: .integration-tests
# ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
# needs:
# - job: docker-build::model_name
# artifacts: true
rules:
- when: never
#########
# RELEASE
release:
extends: .release
needs:
- !reference [.needs-versioning, needs] # leave this line as is

View File

@ -1,61 +0,0 @@
import subprocess
import sys
from pathlib import Path
import semver
from loguru import logger
from semver.version import Version
logger.remove()
logger.add(sys.stdout, level="INFO")
def bashcmd(cmds: list) -> str:
    """Run *cmds* as a subprocess and return its stdout with trailing newlines stripped.

    Logs a warning and re-raises the underlying exception when the command
    fails (non-zero exit, via ``check=True``) or cannot be started.
    """
    try:
        logger.debug(f"running: {' '.join(cmds)}")
        return subprocess.run(cmds, check=True, capture_output=True, text=True).stdout.strip("\n")
    except Exception:
        # Narrowed from a bare `except:`, which also intercepted
        # KeyboardInterrupt/SystemExit; the exception is still re-raised.
        logger.warning(f"Error executing the following bash command: {' '.join(cmds)}.")
        raise
def get_highest_existing_git_version_tag() -> str:
    """Return the highest semver-compatible version tag found in the git repo.

    Raises if git cannot be queried or no semver-compatible tag exists
    (``max`` on an empty sequence), after logging a warning.
    """
    try:
        git_tags = bashcmd(["git", "tag", "-l"]).split()
        # Keep only tags that parse as valid semantic versions.
        semver_compat_tags = list(filter(Version.is_valid, git_tags))
        highest_git_version_tag = max(semver_compat_tags, key=semver.version.Version.parse)
        logger.info(f"Highest git version tag: {highest_git_version_tag}")
        return highest_git_version_tag
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C is not intercepted.
        logger.warning("Error getting git version tags")
        raise
def auto_bump_version() -> bool:
    """Report whether the `.autoversion` marker file enables automated bumping."""
    marker = Path(".autoversion")
    active = marker.is_file()
    logger.debug(f"Automated version bump is set to '{active}'")
    return active
def main() -> None:
    """Verify the poetry project version is greater than the highest git tag.

    On failure, either auto-bump the poetry version (when `.autoversion`
    exists) and exit 0, or exit 1.
    """
    poetry_project_version = bashcmd(["poetry", "version", "-s"])
    logger.info(f"Poetry project version: {poetry_project_version}")
    highest_git_version_tag = get_highest_existing_git_version_tag()
    # semver.compare returns -1/0/1; anything > 0 means poetry is ahead.
    if semver.compare(poetry_project_version, highest_git_version_tag) > 0:
        logger.info(f"All good: {poetry_project_version} > {highest_git_version_tag}")
        return
    logger.warning("Poetry version must be greater than git tag version.")
    if auto_bump_version():
        # Sync poetry to the tag version; a later bump is expected elsewhere.
        logger.info(bashcmd(["poetry", "version", highest_git_version_tag]))
        sys.exit(0)
    sys.exit(1)
if __name__ == "__main__":
    main()

View File

@ -1,72 +0,0 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: ^(docs/|notebooks/|data/|src/configs/|tests/|.hooks/|bom.json)
default_language_version:
python: python3.10
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
args: [--unsafe] # needed for .gitlab-ci.yml
- id: check-toml
- id: detect-private-key
- id: check-added-large-files
args: ['--maxkb=10000']
- id: check-case-conflict
- id: mixed-line-ending
# - repo: https://github.com/pre-commit/mirrors-pylint
# rev: v3.0.0a5
# hooks:
# - id: pylint
# args:
# - --disable=C0111,R0903,E0401
# - --max-line-length=120
- repo: https://github.com/pre-commit/mirrors-isort
rev: v5.10.1
hooks:
- id: isort
args:
- --profile black
- repo: https://github.com/psf/black
rev: 24.10.0
hooks:
- id: black
# exclude: ^(docs/|notebooks/|data/|src/secrets/)
args:
- --line-length=120
- repo: https://github.com/compilerla/conventional-pre-commit
rev: v4.0.0
hooks:
- id: conventional-pre-commit
pass_filenames: false
stages: [commit-msg]
# args: [] # optional: list of Conventional Commits types to allow e.g. [feat, fix, ci, chore, test]
- repo: local
hooks:
- id: version-checker
name: version-checker
entry: python .hooks/poetry_version_check.py
language: python
always_run: true
additional_dependencies:
- "semver"
- "loguru"
# - repo: local
# hooks:
# - id: docker-build-test
# name: testing docker build
# entry: ./scripts/ops/docker-compose-build-run.sh
# language: script
# # always_run: true
# pass_filenames: false
# args: []
# stages: [pre-commit]

View File

@ -1,78 +1,30 @@
###############
# BUILDER IMAGE
FROM python:3.10-slim as builder
FROM python:3.10
ARG GITLAB_USER
ARG GITLAB_ACCESS_TOKEN
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
RUN python -m pip install --upgrade pip
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
WORKDIR /app/service
ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
COPY ./requirements.txt ./requirements.txt
RUN python3 -m pip install -r requirements.txt
ARG VERSION=dev
COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
RUN python -m pip install -r incl/pyinfra/requirements.txt
LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
COPY ./incl/pdf2image/requirements.txt ./incl/pdf2image/requirements.txt
RUN python -m pip install -r incl/pdf2image/requirements.txt
WORKDIR /app
COPY ./incl ./incl
###########
# ENV SETUP
ENV PYTHONDONTWRITEBYTECODE=true
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN python3 -m pip install -e incl/pyinfra
RUN python3 -m pip install -e incl/pdf2image
RUN apt-get update && \
apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN curl -sSL https://install.python-poetry.org | python3 -
RUN poetry --version
COPY pyproject.toml poetry.lock ./
RUN poetry config virtualenvs.create true && \
poetry config virtualenvs.in-project true && \
poetry config installer.max-workers 10 && \
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
poetry install --without=dev,docs,test -vv --no-interaction --no-root
##################
# COPY SOURCE CODE
COPY ./config ./config
COPY ./src ./src
COPY ./cv_analysis ./cv_analysis
COPY ./setup.py ./setup.py
###############
# WORKING IMAGE
FROM python:3.10-slim
RUN python3 -m pip install -e .
# COPY BILL OF MATERIALS (BOM)
COPY bom.json /bom.json
# COPY SOURCE CODE FROM BUILDER IMAGE
COPY --from=builder /app /app
WORKDIR /app
ENV PATH="/app/.venv/bin:$PATH"
############
# NETWORKING
EXPOSE 5000
EXPOSE 8080
################
# LAUNCH COMMAND
CMD [ "python", "src/serve.py"]
CMD ["python3", "-u", "src/serve.py"]

View File

@ -1,94 +0,0 @@
.PHONY: \
	poetry in-project-venv dev-env use-env install install-dev tests \
	update-version sync-version-with-git requirements \
	docker docker-build-run docker-build docker-run \
	docker-rm docker-rm-container docker-rm-image \
	pre-commit get-licenses prep-commit \
	docs sphinx_html sphinx_apidoc bom
.DEFAULT_GOAL := run
export DOCKER=docker
export DOCKERFILE=Dockerfile
export IMAGE_NAME=cv_analysis_service-image
export CONTAINER_NAME=cv_analysis_service-container
export HOST_PORT=9999
export CONTAINER_PORT=9999
export PYTHON_VERSION=python3.10
# all commands should be executed in the root dir or the project,
# specific environments should be deactivated

# --- Poetry environment setup -------------------------------------------
poetry: in-project-venv use-env dev-env
in-project-venv:
	poetry config virtualenvs.in-project true
use-env:
	poetry env use ${PYTHON_VERSION}
dev-env:
	poetry install --with dev && poetry update
install:
	poetry add $(pkg)
install-dev:
	poetry add --dev $(pkg)
requirements:
	poetry export --without-hashes --output requirements.txt

# --- Versioning ----------------------------------------------------------
update-version:
	poetry version prerelease
# FIX: `$$` so the shell (not make) expands the command substitution;
# with single `$` make expanded `$(git ...)` to an empty string.
# NOTE(review): piping rev-list into `git describe` looks unintended
# (describe ignores stdin) -- confirm the desired tag-lookup command.
sync-version-with-git:
	git pull -p && poetry version $$(git rev-list --tags --max-count=1 | git describe --tags --abbrev=0)

# Generate the CycloneDX bill of materials (target was defined twice; kept once).
bom:
	cyclonedx-py poetry -o bom.json

# --- Docker lifecycle ----------------------------------------------------
docker: docker-rm docker-build-run
docker-build-run: docker-build docker-run
docker-build:
	$(DOCKER) build \
		--no-cache --progress=plain \
		-t $(IMAGE_NAME) -f $(DOCKERFILE) \
		--build-arg USERNAME=${USERNAME} \
		--build-arg TOKEN=${GITLAB_TOKEN} \
		.
docker-run:
	$(DOCKER) run -it --rm -p $(HOST_PORT):$(CONTAINER_PORT)/tcp --name $(CONTAINER_NAME) $(IMAGE_NAME)
docker-rm: docker-rm-container docker-rm-image
docker-rm-container:
	-$(DOCKER) rm $(CONTAINER_NAME)
docker-rm-image:
	-$(DOCKER) image rm $(IMAGE_NAME)

# --- Quality gates -------------------------------------------------------
tests:
	poetry run pytest ./tests
# FIX: these are prerequisites, not a shell command (the old recipe line
# `docs get-license ...` would fail as a command), and the licence target
# is named `get-licenses` (plural).
prep-commit: docs get-licenses sync-version-with-git update-version pre-commit
pre-commit:
	pre-commit run --all-files
get-licenses:
	pip-licenses --format=json --order=license --with-urls > pkg-licenses.json

# --- Documentation -------------------------------------------------------
docs: sphinx_apidoc sphinx_html
sphinx_html:
	poetry run sphinx-build -b html docs/source/ docs/build/html -E -a
sphinx_apidoc:
	cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/ && poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force

View File

@ -1,60 +1,8 @@
# cv-analysis - Visual (CV-Based) Document Parsing
# cv-analysis &mdash; Visual (CV-Based) Document Parsing
parse_pdf()
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.
## API
Input message:
```json
{
"targetFilePath": {
"pdf": "absolute file path",
"vlp_output": "absolute file path"
},
"responseFilePath": "absolute file path",
"operation": "table_image_inference"
}
```
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
```json
{
...,
"data": [
{
'pageNum': 0,
'bbox': {
'x1': 55.3407,
'y1': 247.0246,
'x2': 558.5602,
'y2': 598.0585
},
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
'label': 'table',
'tableLines': [
{
'x1': 0,
'y1': 16,
'x2': 1399,
'y2': 16
},
...
],
'imageInfo': {
'height': 693,
'width': 1414
}
},
...
]
}
```
## Installation
```bash
@ -83,9 +31,10 @@ The below snippet shows how to find the outlines of previous redactions.
```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...

40
bamboo-specs/pom.xml Normal file
View File

@ -0,0 +1,40 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>7.1.2</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<sonar.skip>true</sonar.skip>
</properties>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

View File

@ -0,0 +1,178 @@
package buildjob;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
import java.time.LocalTime;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
/**
* Plan configuration for Bamboo.
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
*/
@BambooSpec
public class PlanSpec {

    // Service identity; the Bamboo plan key is the service name upper-cased
    // with '-' and '_' separators stripped (Bamboo keys must be alphanumeric).
    private static final String SERVICE_NAME = "cv-analysis";
    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");

    /**
     * Run main to publish plan on Bamboo.
     *
     * Publishes the Docker build plan and the security-analysis plan,
     * each followed by its permission set.
     */
    public static void main(final String[] args) throws Exception {
        //By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");
        Plan plan = new PlanSpec().createDockerBuildPlan();
        bambooServer.publish(plan);
        PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);
        Plan secPlan = new PlanSpec().createSecBuild();
        bambooServer.publish(secPlan);
        PlanPermissions secPlanPermission = new PlanSpec().createPlanPermission(secPlan.getIdentifier());
        bambooServer.publish(secPlanPermission);
    }

    // Grants edit/view/clone/build rights to the bamboo user and the
    // research, Development and QA groups; everyone else may only view.
    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
            .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .loggedInUserPermissions(PermissionType.VIEW)
            .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }

    // All plans live in the shared "RED" Bamboo project.
    private Project project() {
        return new Project()
            .name("RED")
            .key(new BambooKey("RED"));
    }

    // Build plan: clean checkout, SSH key setup, Docker build, then tag the
    // commit with the version that the build script writes into `git.tag`.
    public Plan createDockerBuildPlan() {
        return new Plan(
            project(),
            SERVICE_NAME, new BambooKey(SERVICE_KEY))
            // .description("Docker build for cv-analysis.")
            // .variables()
            .stages(new Stage("Build Stage")
                .jobs(
                    new Job("Build Job", new BambooKey("BUILD"))
                        .tasks(
                            new CleanWorkingDirectoryTask()
                                .description("Clean working directory.")
                                .enabled(true),
                            new VcsCheckoutTask()
                                .description("Checkout default repository.")
                                .checkoutItems(new CheckoutItem().defaultRepository()),
                            new ScriptTask()
                                .description("Set config and keys.")
                                .location(Location.FILE)
                                .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                            new ScriptTask()
                                .description("Build Docker container.")
                                .location(Location.FILE)
                                .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
                                .argument(SERVICE_NAME),
                            // docker-build.sh writes `gitTag=<version>` into
                            // the `git.tag` file; inject it under namespace "g"
                            // so the tag task below can reference it.
                            new InjectVariablesTask()
                                .description("Inject git tag.")
                                .path("git.tag")
                                .namespace("g")
                                .scope(InjectVariablesScope.LOCAL),
                            new VcsTagTask()
                                .description("${bamboo.g.gitTag}")
                                .tagName("${bamboo.g.gitTag}")
                                .defaultRepository())
                        // Docker-in-docker: mount the host docker socket.
                        .dockerConfiguration(
                            new DockerConfiguration()
                                .image("nexus.iqser.com:5001/infra/release_build:4.5.0")
                                .volume("/var/run/docker.sock", "/var/run/docker.sock")),
                    // Licence job is currently disabled.
                    new Job("Licence Job", new BambooKey("LICENCE"))
                        .enabled(false)
                        .tasks(
                            new VcsCheckoutTask()
                                .description("Checkout default repository.")
                                .checkoutItems(new CheckoutItem().defaultRepository()),
                            new ScriptTask()
                                .description("Build licence.")
                                .location(Location.FILE)
                                .fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
                        .dockerConfiguration(
                            new DockerConfiguration()
                                .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                                .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                                .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            // Build on every push from Bitbucket.
            .triggers(
                new BitbucketServerTrigger())
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranch()
                    .delete(
                        new BranchCleanup()
                            .whenInactiveInRepositoryAfterDays(14))
                    .notificationForCommitters());
    }

    // Nightly security plan: runs the SonarQube scan script for master-like
    // branches and all branches matching "release.*".
    public Plan createSecBuild() {
        return new Plan(project(), SERVICE_NAME + "-Sec", new BambooKey(SERVICE_KEY + "SEC")).description("Security Analysis Plan")
            .stages(new Stage("Default Stage").jobs(
                new Job("Sonar Job", new BambooKey("SONAR"))
                    .tasks(
                        new CleanWorkingDirectoryTask()
                            .description("Clean working directory.")
                            .enabled(true),
                        new VcsCheckoutTask()
                            .description("Checkout default repository.")
                            .checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask()
                            .description("Set config and keys.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                        new ScriptTask()
                            .description("Run Sonarqube scan.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
                            .argument(SERVICE_NAME))
                    .dockerConfiguration(
                        new DockerConfiguration()
                            .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                            .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            // Scheduled once per day at 23:00.
            .triggers(
                new ScheduledTrigger()
                    .scheduleOnceDaily(LocalTime.of(23, 00)))
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranchMatching("release.*")
                    .notificationForCommitters());
    }
}

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Deploy the Maven artifact to Nexus, but only for real (non-dev) version tags.
set -e

# FIX: the original used backslash-escaped quotes (\"...\") inside [[ ]],
# which makes the literal quote characters part of both operands. Plain
# quoting is the intended, robust form.
if [[ "${bamboo_version_tag}" != "dev" ]]
then
# Stamp the POM with the release version before deploying.
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-f ${bamboo_build_working_directory}/pom.xml \
versions:set \
-DnewVersion=${bamboo_version_tag}
# Build and deploy; -DdeployAtEnd avoids partial uploads on multi-module builds.
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
-f ${bamboo_build_working_directory}/pom.xml \
-B clean deploy \
-e -DdeployAtEnd=true \
-Dmaven.wagon.http.ssl.insecure=true \
-Dmaven.wagon.http.ssl.allowall=true \
-Dmaven.wagon.http.ssl.ignore.validity.dates=true \
-DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi

View File

@ -0,0 +1,53 @@
#!/bin/bash
# Build (and usually push) the service Docker image. The tag is derived from
# the branch: master -> next minor, release/* -> next patch, an explicit
# bamboo_version_tag -> that tag, anything else -> dev build (no push).
set -e

SERVICE_NAME=$1

if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
# Minor release: major.minor prefix comes from version.yaml.
branchVersion=$(cat version.yaml | grep -Eo "version: .*" | sed -s 's|version: \(.*\)\..*\..*|\1|g')
latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
newVersion="$(semver $latestVersion -p -i minor)"
echo "new release on master with version $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
# Patch release: major.minor prefix is encoded in the branch name (release/X.Y.x).
branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
newVersion="$(semver $latestVersion -p -i patch)"
echo "new release on $bamboo_planRepository_branchName with version $newVersion"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
# Manually pinned version tag.
newVersion="${bamboo_version_tag}"
# FIX: typo "bild" -> "build" in the log message.
echo "new special version build with $newVersion"
else
# Dev build: tag with branch + build number, build locally but do not push.
newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
echo "gitTag=${newVersion}" > git.tag
echo "dev build with tag ${newVersion}"
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
pip install dvc
pip install 'dvc[ssh]'
dvc pull
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker build -f Dockerfile .
exit 0
fi

# Release path: record the tag, fetch DVC data, then build and push the image.
echo "gitTag=${newVersion}" > git.tag
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
pip install dvc
pip install 'dvc[ssh]'
dvc pull
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Install the build agent's SSH private key and host config for vector.iqser.com.
set -e

mkdir -p ~/.ssh
# FIX: truncate (>) instead of append (>>) so repeated runs do not
# accumulate duplicate key material in id_rsa, which would break SSH auth.
echo "${bamboo_agent_ssh}" | base64 -d > ~/.ssh/id_rsa
echo "host vector.iqser.com" > ~/.ssh/config
echo " user bamboo-agent" >> ~/.ssh/config
# SSH refuses keys/config with loose permissions.
chmod 600 ~/.ssh/config ~/.ssh/id_rsa

View File

@ -0,0 +1,67 @@
#!/bin/bash
# Run unit tests with coverage, an OWASP dependency-check, and a SonarQube
# scan. Scans in branch mode normally, or in pull-request mode when
# bamboo_repository_pr_key is set. $1 = service name.
set -e

# sonar-scanner ships with its own JRE.
export JAVA_HOME=/usr/bin/sonar-scanner/jre

# Isolated environment for the test run.
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

echo "dev setup for unit test and coverage"
pip install -e incl/pyinfra
pip install -r incl/pyinfra/requirements.txt
pip install -e incl/pdf2image
pip install -r incl/pdf2image/requirements.txt
pip install -e .
pip install -r requirements.txt

echo "DVC pull step"
dvc pull

# Produce reports/coverage.xml for Sonar (path configured in .coveragerc).
echo "coverage calculation"
coverage run -m pytest

echo "coverage report generation"
coverage report -m
coverage xml

SERVICE_NAME=$1

echo "dependency-check:aggregate"
mkdir -p reports
# Scan the workspace for vulnerable dependencies; virtualenvs, git metadata
# and bytecode caches are excluded.
dependency-check --enableExperimental -f JSON -f HTML -f XML \
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
--exclude "build_venv/**" --exclude "**/__pycache__/**"

# PR key unset -> ordinary branch scan; set -> pull-request decoration scan.
if [[ -z "${bamboo_repository_pr_key}" ]]
then
echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
/usr/bin/sonar-scanner/bin/sonar-scanner -X\
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.sources=src,cv_analysis \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.branch.name=${bamboo_planRepository_1_branch} \
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
else
echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
/usr/bin/sonar-scanner/bin/sonar-scanner \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.sources=src,cv_analysis \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
-Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
-Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi

View File

@ -0,0 +1,22 @@
package buildjob;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;
// Offline validation of the Bamboo plan specs: EntityPropertiesBuilders.build
// throws PropertiesValidationException when a plan definition is invalid,
// so `mvn test` catches broken specs before publishing.
public class PlanSpecTest {
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        Plan plan = new PlanSpec().createDockerBuildPlan();
        EntityPropertiesBuilders.build(plan);
    }

    @Test
    public void checkYourSecPlanOffline() throws PropertiesValidationException {
        Plan secPlan = new PlanSpec().createSecBuild();
        EntityPropertiesBuilders.build(secPlan);
    }
}

30096
bom.json

File diff suppressed because it is too large Load Diff

View File

@ -1,67 +0,0 @@
[asyncio]
max_concurrent_tasks = 10
[dynamic_tenant_queues]
enabled = true
[metrics.prometheus]
enabled = true
prefix = "redactmanager_cv_analysis_service"
[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"
[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_cv_analysis_service"
exporter = "otlp"
[webserver]
host = "0.0.0.0"
port = 8080
[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "cv_analysis_request_queue"
service_request_exchange_name = "cv_analysis_request_exchange"
service_response_exchange_name = "cv_analysis_response_exchange"
service_dlq_name = "cv_analysis_dlq"
[storage]
backend = "s3"
[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"
[storage.azure]
container = "redaction"
connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
[kubernetes]
pod_name = "test_pod"

View File

@ -1,19 +0,0 @@
[logging]
level = "INFO"
visual_logging_level = "DISABLED"
visual_logging_output_folder = "/tmp/debug"
[table_parsing]
skip_pages_without_images = true
[paths]
root = "@format {env[ROOT_PATH]}"
dvc_data_dir = "${paths.root}/data"
pdf_for_testing = "${paths.dvc_data_dir}/pdfs_for_testing"
png_for_testing = "${paths.dvc_data_dir}/pngs_for_testing"
png_figures_detected = "${paths.png_for_testing}/figures_detected"
png_tables_detected = "${paths.png_for_testing}/tables_detected_by_tp"
hashed_pdfs_for_testing = "${paths.pdf_for_testing}/hashed"
metadata_test_files = "${paths.dvc_data_dir}/metadata_testing_files.csv"
test_dir = "${paths.dvc_data_dir}/test"
test_data_dir = "${paths.dvc_data_dir}/test/test_data"

30
cv_analysis/config.py Normal file
View File

@ -0,0 +1,30 @@
import os
def get_config():
    """Build and return a fresh :class:`Config` instance."""
    cfg = Config()
    return cfg
class Config:
    """Runtime configuration: logging settings plus test/data directory paths."""

    def __init__(self):
        # Root log level can be overridden via the environment.
        self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
        # visual_logging_level: NOTHING > INFO > DEBUG > ALL
        self.visual_logging_level = "DISABLED"
        self.visual_logging_output_folder = "/tmp/debug"
        # locations — the project root is two levels above this module.
        # FIXME: is everything here necessary?
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        join = os.path.join
        self.dvc_data_dir = join(project_root, "data")
        self.pdf_for_testing = join(self.dvc_data_dir, "pdfs_for_testing")
        self.png_for_testing = join(self.dvc_data_dir, "pngs_for_testing")
        self.png_figures_detected = join(self.png_for_testing, "figures_detected")
        self.png_tables_detected = join(self.png_for_testing, "tables_detected_by_tp")
        self.hashed_pdfs_for_testing = join(self.pdf_for_testing, "hashed")
        self.metadata_test_files = join(self.dvc_data_dir, "metadata_testing_files.csv")
        self.test_dir = join(project_root, "test")
        self.test_data_dir = join(self.test_dir, "test_data")

    def __getitem__(self, key):
        # Dict-style access: config["key"] is equivalent to config.key.
        return getattr(self, key)

View File

@ -6,15 +6,15 @@ import numpy as np
from cv_analysis.figure_detection.figures import detect_large_coherent_structures
from cv_analysis.figure_detection.text import remove_primary_text_regions
from cv_analysis.utils.filters import (
has_acceptable_format,
is_large_enough,
has_acceptable_format,
is_not_too_large,
)
from cv_analysis.utils.postprocessing import remove_included
from cv_analysis.utils.structures import Rectangle
def detect_figures(image: np.ndarray):
def detect_figures(image: np.array):
max_area = image.shape[0] * image.shape[1] * 0.99
min_area = 5000
max_width_to_height_ratio = 6
@ -24,10 +24,9 @@ def detect_figures(image: np.ndarray):
cnts = detect_large_coherent_structures(image)
cnts = filter(figure_filter, cnts)
# rects = map(compose(Rectangle.from_xywh, cv2.boundingRect), (cnts))
bounding_rects = map(cv2.boundingRect, cnts)
rects: list[Rectangle] = remove_included(map(Rectangle.from_xywh, rects))
rects = map(cv2.boundingRect, cnts)
rects = map(Rectangle.from_xywh, rects)
rects = remove_included(rects)
return rects

View File

@ -2,7 +2,7 @@ import cv2
import numpy as np
def detect_large_coherent_structures(image: np.ndarray):
def detect_large_coherent_structures(image: np.array):
"""Detects large coherent structures on an image.
Expects an image with binary color space (e.g. threshold applied).

View File

@ -1,21 +1,25 @@
import itertools
from itertools import compress, starmap
from itertools import compress
from itertools import starmap
from operator import __and__
import cv2
import numpy as np
from cv_analysis.utils.connect_rects import connect_related_rects2
from cv_analysis.utils.postprocessing import (
has_no_parent,
remove_included,
remove_overlapping,
)
from cv_analysis.utils.connect_rects import connect_related_rects
from cv_analysis.utils.display import show_image_mpl
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.label_rects import label_rects
from cv_analysis.utils.structures import Rectangle
from cv_analysis.utils.postprocessing import (
remove_overlapping,
remove_included,
has_no_parent,
)
from cv_analysis.utils.visual_logging import vizlogger
# could be dynamic parameter is the scan is noisy
#could be dynamic parameter is the scan is noisy
def is_likely_segment(rect, min_area=100):
return cv2.contourArea(rect, False) > min_area
@ -33,7 +37,7 @@ def find_segments(image):
def dilate_page_components(image):
# if text is detected in words make kernel bigger
#if text is detected in words make kernel bigger
image = cv2.GaussianBlur(image, (7, 7), 0)
thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
@ -48,10 +52,11 @@ def fill_in_component_area(image, rect):
return ~image
def parse_layout(image: np.ndarray):
def parse_layout(image: np.array):
image = image.copy()
image_ = image.copy()
#show_image_mpl(image)
if len(image_.shape) > 2:
image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2GRAY)
@ -77,8 +82,8 @@ def parse_layout(image: np.ndarray):
rects = list(map(Rectangle.from_xywh, rects))
rects = remove_included(rects)
rects = connect_related_rects2(map(lambda r: r.xywh(), rects))
rects = map(lambda r: r.xywh(), rects)
rects = connect_related_rects(rects)
rects = list(map(Rectangle.from_xywh, rects))
rects = remove_included(rects)
# rects = remove_included(rects)
return rects

View File

@ -2,9 +2,9 @@ from functools import partial
import cv2
import numpy as np
from iteration_utilities import first, starfilter # type: ignore
from iteration_utilities import starfilter, first
from cv_analysis.utils.filters import is_boxy, is_filled, is_large_enough
from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy
from cv_analysis.utils.visual_logging import vizlogger
@ -12,7 +12,7 @@ def is_likely_redaction(contour, hierarchy, min_area):
return is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area)
def find_redactions(image: np.ndarray, min_normalized_area=200000):
def find_redactions(image: np.array, min_normalized_area=200000):
vizlogger.debug(image, "redactions01_start.png")
min_normalized_area /= 200 # Assumes 200 DPI PDF -> image conversion resolution
@ -29,14 +29,13 @@ def find_redactions(image: np.ndarray, min_normalized_area=200000):
contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
try:
return list(
map(
first,
starfilter(
partial(is_likely_redaction, min_area=min_normalized_area),
zip(contours, hierarchies[0]),
),
)
contours = map(
first,
starfilter(
partial(is_likely_redaction, min_area=min_normalized_area),
zip(contours, hierarchies[0]),
),
)
return list(contours)
except:
return []

View File

@ -0,0 +1,56 @@
from dataclasses import asdict
from operator import truth
from funcy import lmap, flatten
from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.structures import Rectangle
from pdf2img.conversion import convert_pages_to_images
from pdf2img.default_objects.image import ImagePlus, ImageInfo
from pdf2img.default_objects.rectangle import RectanglePlus
def get_analysis_pipeline(operation):
    """Return the analysis pipeline for *operation*.

    Args:
        operation (str): "table" for table parsing or "figure" for
            figure detection.

    Returns:
        callable: a pipeline function accepting ``(pdf: bytes, index=None)``.

    Raises:
        ValueError: if *operation* is not a known analysis kind.
    """
    if operation == "table":
        return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200)
    if operation == "figure":
        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200)
    # A bare `raise` here would itself fail with "No active exception to
    # re-raise" -- raise an explicit, informative error instead.
    raise ValueError(f"unknown analysis operation: {operation!r}")
def make_analysis_pipeline(analysis_fn, formatter, dpi):
    """Compose a page-wise PDF analysis pipeline.

    Args:
        analysis_fn: callable taking a page image array and returning the
            detected rectangles (falsy result means "nothing found").
        formatter: callable ``(rects, page, dpi)`` converting raw
            rectangles into the output record(s) for that page.
        dpi: resolution used for the PDF -> image conversion.

    Returns:
        A generator function ``analyse_pipeline(pdf, index=None)`` that
        yields one formatted record per detection.
    """

    def analyse_pipeline(pdf: bytes, index=None):
        def parse_page(page: ImagePlus):
            image = page.asarray()
            rects = analysis_fn(image)
            if not rects:
                # No detections on this page; dropped below via truth().
                return
            infos = formatter(rects, page, dpi)
            return infos

        pages = convert_pages_to_images(pdf, index=index, dpi=dpi)
        results = map(parse_page, pages)
        # Drop pages without detections, then flatten per-page result lists.
        yield from flatten(filter(truth, results))

    return analyse_pipeline
def table_parsing_formatter(rects, page, dpi):
    """Format table-cell rectangles for one page as a single record.

    Returns a dict carrying the page info plus the derotated cell bboxes.
    """

    def format_rect(rect: Rectangle):
        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        return rect_plus.asdict(derotate=True)

    bboxes = lmap(format_rect, rects)
    return {"pageInfo": page.asdict(), "tableCells": bboxes}
def figure_detection_formatter(rects, page, dpi):
    """Format detected figure rectangles as a list of ImageInfo dicts."""

    def format_rect(rect: Rectangle):
        rect_plus = RectanglePlus.from_pixels(*rect.xyxy(), page.info, alpha=False, dpi=dpi)
        # NOTE(review): figures keep their original rotation (derotate=False),
        # unlike table cells above -- confirm this asymmetry is intended.
        return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))

    return lmap(format_rect, rects)

View File

@ -0,0 +1,139 @@
from functools import partial
from itertools import chain, starmap
from operator import attrgetter
import cv2
import numpy as np
from funcy import lmap
from cv_analysis.utils.postprocessing import remove_isolated # xywh_to_vecs, xywh_to_vec_rect, adjacent1d
from cv_analysis.utils.structures import Rectangle
from cv_analysis.utils.visual_logging import vizlogger
def add_external_contours(image, image_h_w_lines_only):
    """Draw the bounding box of every external contour found in
    *image_h_w_lines_only* onto *image* (modified in place) and return it.

    Boxes are drawn as 1-px-wide white (255) rectangles.
    """
    contours, _ = cv2.findContours(image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)
    return image
def apply_motion_blur(image: np.array, angle, size=80):
    """Solidifies and slightly extends detected lines.

    Builds a directional (motion-blur) kernel from a single centre row
    rotated to *angle* and filters the image with it.

    Args:
        image (np.array): page image as array
        angle: direction in which to apply blur, 0 or 90
        size (int): kernel size; 80 found empirically to work well

    Returns:
        np.array: the blurred image
    """
    k = np.zeros((size, size), dtype=np.float32)
    vizlogger.debug(k, "tables08_blur_kernel1.png")
    # One horizontal line through the kernel centre ...
    k[(size - 1) // 2, :] = np.ones(size, dtype=np.float32)
    vizlogger.debug(k, "tables09_blur_kernel2.png")
    # ... rotated to the requested blur direction.
    k = cv2.warpAffine(
        k,
        cv2.getRotationMatrix2D((size / 2 - 0.5, size / 2 - 0.5), angle, 1.0),
        (size, size),
    )
    vizlogger.debug(k, "tables10_blur_kernel3.png")
    # Normalise so overall brightness is preserved.
    k = k * (1.0 / np.sum(k))
    vizlogger.debug(k, "tables11_blur_kernel4.png")
    blurred = cv2.filter2D(image, -1, k)
    return blurred
def isolate_vertical_and_horizontal_components(img_bin):
    """Identifies and reinforces horizontal and vertical lines in a binary image.

    Args:
        img_bin (np.array): array corresponding to a single binarized page image

    Returns:
        np.array: binary image containing only the reinforced line grid
    """
    line_min_width = 48
    # Morphological opening keeps only runs at least line_min_width long.
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)
    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
    img_lines_raw = img_bin_v | img_bin_h
    # Thicken the surviving lines, then extend them with a directional blur.
    kernel_h = np.ones((1, 30), np.uint8)
    kernel_v = np.ones((30, 1), np.uint8)
    img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2)
    img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)
    img_bin_h = apply_motion_blur(img_bin_h, 0)
    img_bin_v = apply_motion_blur(img_bin_v, 90)
    img_bin_extended = img_bin_h | img_bin_v
    # Re-binarize after the blur softened the edges.
    th1, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY)
    img_bin_final = cv2.dilate(img_bin_extended, np.ones((1, 1), np.uint8), iterations=1)
    # add contours before lines are extended by blurring
    img_bin_final = add_external_contours(img_bin_final, img_lines_raw)
    return img_bin_final
def preprocess(image: np.array):
    """Convert a page image to an inverted binary image (ink becomes white)."""
    if len(image.shape) > 2:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binarized = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
    return ~binarized
# def turn_connected_components_into_rects(image: np.array):
# def is_large_enough(stat):
# x1, y1, w, h, area = stat
# return area > 2000 and w > 35 and h > 25
#
# _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)
#
# stats = np.vstack(list(filter(is_large_enough, stats)))
# rects = list(map(Rectangle.from_xywh, stats[:, :-1][2:]))
# return remove_isolated(rects)
def turn_connected_components_into_rects(image: np.array):
    """Convert connected components of the inverse of *image* into
    Rectangle objects, dropping tiny components and isolated rectangles.

    Returns an empty list when no component survives the size filter.
    """

    def is_large_enough(stat):
        # stat layout from cv2.connectedComponentsWithStats: x, y, w, h, area.
        x1, y1, w, h, area = stat
        return area > 2000 and w > 35 and h > 25

    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)
    try:
        stats = np.vstack(list(filter(is_large_enough, stats)))
        # NOTE(review): [2:] skips the first two surviving rows (background /
        # page-level components, presumably) -- confirm this offset is intended.
        rects = list(map(Rectangle.from_xywh, stats[:, :-1][2:]))
        return remove_isolated(rects)
    except ValueError:
        # np.vstack raises ValueError when the filter leaves nothing.
        return []
def parse_tables(image: np.array, show=False):
    """Runs the full table parsing process.

    Args:
        image (np.array): single PDF page, converted to a numpy array
        show: unused; kept for backward compatibility with existing callers.

    Returns:
        list: Rectangle objects corresponding to table cells (may be empty)
    """
    image = preprocess(image)
    image = isolate_vertical_and_horizontal_components(image)
    # turn_connected_components_into_rects already converts to Rectangle
    # objects and removes isolated ones; no further post-processing needed.
    return turn_connected_components_into_rects(image)

View File

@ -1,13 +1,13 @@
def make_art():
art = r"""
__
_ |@@|
__
_ |@@|
/ \ \--/ __ .__ .__
) O|----| | __ ___ __ _____ ____ _____ | | ___.__. _____|__| ______
/ / \ }{ /\ )_ / _\\ \/ / ______ \__ \ / \\__ \ | | | | |/ ___/ |/ ___/
)/ /\__/\ \__O (__ \ / /_____/ / __ \| | \/ __ \| |_\___ |\___ \| |\___ \
|/ (--/\--) \__/ \_/ (______/___|__(______/____/\____/_____/|__/_____/
/ _)( )(_
`---''---`
|/ (--/\--) \__/ \_/ (______/___|__(______/____/\____/_____/|__/_____/
/ _)( )(_
`---''---`
"""
return art

View File

@ -0,0 +1,90 @@
from itertools import combinations, starmap, product
from typing import Iterable
def is_near_enough(rect_pair, max_gap=14):
    """True when the rects' opposite edges are within *max_gap* pixels on
    at least one axis (left vs right, or top vs bottom, in either order)."""
    (x1, y1, w1, h1), (x2, y2, w2, h2) = rect_pair
    gaps = (
        abs(x1 - (x2 + w2)),  # rect2's right edge to rect1's left edge
        abs(x2 - (x1 + w1)),  # rect1's right edge to rect2's left edge
        abs(y2 - (y1 + h1)),  # rect1's bottom edge to rect2's top edge
        abs(y1 - (y2 + h2)),  # rect2's bottom edge to rect1's top edge
    )
    return min(gaps) <= max_gap
def is_overlapping(rect_pair):
    """True when the two (x, y, w, h) rects intersect or touch.

    dx/dy are the lengths of the overlaps of the projections onto the x
    and y axes; both non-negative means the rectangles meet (a zero
    overlap on one axis counts as touching).
    """
    x1, y1, w1, h1 = rect_pair[0]
    x2, y2, w2, h2 = rect_pair[1]
    dx = min(x1 + w1, x2 + w2) - max(x1, x2)
    dy = min(y1 + h1, y2 + h2) - max(y1, y2)
    # The condition is already a boolean; no ternary needed.
    return dx >= 0 and dy >= 0
def is_on_same_line(rect_pair):
    """True when the rects share a text line: tops or bottoms within
    10 px of each other, or one vertical span containing the other."""
    (_, y1, _, h1), (_, y2, _, h2) = rect_pair
    bottom1, bottom2 = y1 + h1, y2 + h2
    tops_aligned = abs(y1 - y2) <= 10
    bottoms_aligned = abs(bottom1 - bottom2) <= 10
    one_contains_other = (y2 <= y1 and bottom1 <= bottom2) or (y1 <= y2 and bottom2 <= bottom1)
    return tops_aligned or bottoms_aligned or one_contains_other
def has_correct_position1(rect_pair):
    """True when the rects are roughly aligned on some edge (left/top/
    right/bottom within 10 px) or one's span contains the other's on
    either axis."""
    x1, y1, w1, h1 = rect_pair[0]
    x2, y2, w2, h2 = rect_pair[1]
    right1, right2 = x1 + w1, x2 + w2
    bottom1, bottom2 = y1 + h1, y2 + h2
    edge_aligned = (
        abs(x1 - x2) <= 10
        or abs(y1 - y2) <= 10
        or abs(right1 - right2) <= 10
        or abs(bottom1 - bottom2) <= 10
    )
    span_contained = (
        (y2 <= y1 and bottom1 <= bottom2)
        or (y1 <= y2 and bottom2 <= bottom1)
        or (x2 <= x1 and right1 <= right2)
        or (x1 <= x2 and right2 <= right1)
    )
    return edge_aligned or span_contained
def is_related(rect_pair):
    """Related = overlapping, or near enough AND plausibly positioned."""
    if is_overlapping(rect_pair):
        return True
    return is_near_enough(rect_pair) and has_correct_position1(rect_pair)
def fuse_rects(rect1, rect2):
    """Return the smallest axis-aligned (x, y, w, h) rect covering both inputs."""
    if rect1 == rect2:
        return rect1
    x1, y1, w1, h1 = rect1
    x2, y2, w2, h2 = rect2
    # Component-wise extremes of the two bounding boxes.
    left = min(x1, x2)
    top = min(y1, y2)
    right = max(x1 + w1, x2 + w2)
    bottom = max(y1 + h1, y2 + h2)
    return (left, top, right - left, bottom - top)
def rects_not_the_same(r):
    """True when the pair *r* holds two distinct rectangles."""
    first, second = r[0], r[1]
    return first != second
def connect_related_rects(rects: Iterable[tuple]):
    """Greedily merge related rectangles until no more merges are possible.

    Repeatedly takes the rect at *current_idx*, fuses it with the first
    related rect found, and re-inserts the result at the front; whenever a
    merge happens the scan restarts from the front, so fused rects get a
    chance to absorb earlier neighbours.

    Args:
        rects: iterable of (x, y, w, h) tuples.

    Returns:
        list: merged (x, y, w, h) tuples.
    """
    rects = list(rects)
    current_idx = 0
    while True:
        if current_idx + 1 >= len(rects) or len(rects) <= 1:
            break
        merge_happened = False
        current_rect = rects.pop(current_idx)
        for idx, maybe_related_rect in enumerate(rects):
            if is_related((current_rect, maybe_related_rect)):
                current_rect = fuse_rects(current_rect, maybe_related_rect)
                rects.pop(idx)
                merge_happened = True
                break
        rects.insert(0, current_rect)
        # `elif merge_happened` was redundant -- merge_happened is boolean.
        if merge_happened:
            current_idx = 0
        else:
            current_idx += 1
    return rects

View File

@ -1,13 +1,6 @@
import os
import cv2
from matplotlib import pyplot as plt
# if os.environ.get("USER") == "isaac":
# import matplotlib
# matplotlib.use("module://matplotlib-backend-wezterm")
def show_image_cv2(image, maxdim=700):
h, w, c = image.shape

View File

@ -4,6 +4,7 @@ from cv_analysis.utils import copy_and_normalize_channels
def draw_contours(image, contours, color=None, annotate=False):
image = copy_and_normalize_channels(image)
for cont in contours:

View File

@ -0,0 +1,93 @@
from itertools import starmap
from typing import Iterable
import cv2
import numpy as np
from cv_analysis.figure_detection.text import remove_primary_text_regions, apply_threshold_to_image
from cv_analysis.table_parsing import preprocess, isolate_vertical_and_horizontal_components, \
turn_connected_components_into_rects
from cv_analysis.utils.display import show_image_mpl
def area_is_bigger_than(rect: tuple, maxarea=100000):
    """True when the (x, y, w, h) rect covers at least *maxarea* pixels."""
    _, _, width, height = rect
    area = width * height
    return area >= maxarea
def define_rect(rect_img, original_position):
    """Heuristically label one layout rect.

    Args:
        rect_img: cropped page-image region for the rect.
        original_position: (x, y, w, h) of the rect on the full page.

    Returns:
        str: one of "table", "header component", "footer component",
        "header text", "footer text", "text", "other".
    """
    xo, yo, wo, ho = original_position
    rect_img_inv = preprocess(rect_img)
    grid_inv = isolate_vertical_and_horizontal_components(rect_img_inv)
    cnts, _ = cv2.findContours(image=grid_inv, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)
    if cnts:
        rects = turn_connected_components_into_rects(grid_inv)
        rects = map(lambda r: r.xywh(), rects)
        bbox = list((cv2.boundingRect(c) for c in cnts))
        if len(list(rects)) > 1 and len(bbox) == 1:
            # Several cell components inside one outer contour: grid candidate.
            x, y, w, h = bbox[0]
            # NOTE(review): np.ndarray.shape is (rows, cols), so these names
            # are swapped; harmless here since only the product is used.
            w_img, h_img = rect_img.shape
            if w * h / (w_img * h_img) >= 0.75:
                return "table"
            else:
                # Grid detected but too small relative to the layout rect;
                # cannot be a table -- maybe a figure?
                return "other"
        else:
            if is_header(yo + ho):
                return "header component"
            elif is_footer(yo):
                return "footer component"
            else:
                # Single cell or no connected components -- maybe a figure?
                return "other"
    else:
        # No grid lines at all: classify purely by vertical position.
        if is_header(yo + ho):
            return "header text"
        elif is_footer(yo):
            return "footer text"
        else:
            return "text"
def is_header(y, threshold=200):
    """True when *y* lies in the page's header band (above *threshold* px).

    The default of 200 px was hard-coded; it is now a parameter so other
    page sizes/DPIs can reuse the check. Existing callers are unaffected.
    """
    return y < threshold
def is_footer(y, threshold=2100):
    """True when *y* lies below the footer line at *threshold* px.

    The default of 2100 px was hard-coded (tuned to this project's page
    raster size -- TODO confirm for other DPIs/page sizes); it is now a
    parameter. Existing callers are unaffected.
    """
    return y > threshold
def is_text(img):
    """True when, after removing primary text regions, almost no ink
    remains (non-zero pixel density < 5%) -- i.e. the rect is mostly text.

    The unconditional show_image_mpl() debug displays were removed: a
    predicate must not open GUI windows as a side effect.
    """
    cleaned = remove_primary_text_regions(img)
    return pixel_density(cleaned) < 0.05
def pixel_density(img):
    """Fraction of non-zero pixels in *img* (0.0 .. 1.0)."""
    nonzero = np.count_nonzero(img)
    return nonzero / img.size
def label_rects(image: np.array, rects: Iterable[tuple]):
    """Label every layout rect on *image*.

    Args:
        image: full page image.
        rects: iterable of (x, y, w, h) layout rects.

    Returns:
        Iterator of label strings, one per rect (same order as *rects*).
    """

    def crop_image_rects(rect):
        x, y, w, h = rect
        return image[y:y + h, x:x + w]

    # Materialize first: the original zipped a map over `rects` with
    # `rects` itself, which pulls alternately from the SAME iterator when
    # `rects` is a generator and mispairs crops with positions.
    rects = list(rects)
    rect_images = map(crop_image_rects, rects)
    rect_labels = starmap(define_rect, zip(rect_images, rects))
    return rect_labels

View File

@ -1,11 +1,12 @@
import pdf2image
from numpy import array, ndarray
import pdf2image
from PIL import Image
from cv_analysis.utils.preprocessing import preprocess_page_array
def open_pdf(pdf, first_page=0, last_page=None):
first_page += 1
last_page = None if last_page is None else last_page + 1

View File

@ -1,28 +1,27 @@
from collections import namedtuple
from functools import partial
from itertools import compress, starmap
from typing import Iterable, List
from itertools import starmap, compress
from typing import Iterable
from cv_analysis.utils.structures import Rectangle
def remove_overlapping(rectangles: Iterable[Rectangle]) -> List[Rectangle]:
def remove_overlapping(rectangles: Iterable[Rectangle]) -> list[Rectangle]:
def overlap(a: Rectangle, rect2: Rectangle) -> float:
return a.intersection(rect2) > 0
def does_not_overlap(rect: Rectangle, rectangles: Iterable[Rectangle]) -> bool:
def does_not_overlap(rect: Rectangle, rectangles: Iterable[Rectangle]) -> list:
return not any(overlap(rect, rect2) for rect2 in rectangles if not rect == rect2)
rectangles = list(filter(partial(does_not_overlap, rectangles=rectangles), rectangles))
return rectangles
def remove_included(rectangles: Iterable[Rectangle]) -> List[Rectangle]:
def remove_included(rectangles: Iterable[Rectangle]) -> list[Rectangle]:
keep = [rect for rect in rectangles if not rect.is_included(rectangles)]
return keep
def __remove_isolated_unsorted(rectangles: Iterable[Rectangle]) -> List[Rectangle]:
def __remove_isolated_unsorted(rectangles: Iterable[Rectangle]) -> list[Rectangle]:
def is_connected(rect: Rectangle, rectangles: Iterable[Rectangle]):
return any(rect.adjacent(rect2) for rect2 in rectangles if not rect == rect2)
@ -30,7 +29,7 @@ def __remove_isolated_unsorted(rectangles: Iterable[Rectangle]) -> List[Rectangl
return rectangles
def __remove_isolated_sorted(rectangles: Iterable[Rectangle]) -> List[Rectangle]:
def __remove_isolated_sorted(rectangles: Iterable[Rectangle]) -> list[Rectangle]:
def is_connected(left, center, right):
return any([left.adjacent(center), center.adjacent(right)])
@ -43,7 +42,7 @@ def __remove_isolated_sorted(rectangles: Iterable[Rectangle]) -> List[Rectangle]
return rectangles
def remove_isolated(rectangles: Iterable[Rectangle], input_unsorted=True) -> List[Rectangle]:
def remove_isolated(rectangles: Iterable[Rectangle], input_unsorted=True) -> list[Rectangle]:
return (__remove_isolated_unsorted if input_unsorted else __remove_isolated_sorted)(rectangles)

View File

@ -1,5 +1,5 @@
import cv2
from numpy import frombuffer, ndarray
import cv2
def preprocess_page_array(page):
@ -10,6 +10,7 @@ def preprocess_page_array(page):
def page2image(page):
if type(page) == bytes:
page = frombuffer(page)
elif type(page) == ndarray:

View File

@ -0,0 +1,96 @@
from itertools import starmap
from typing import Iterable
import cv2
import numpy as np
from cv_analysis.figure_detection.text import remove_primary_text_regions, apply_threshold_to_image
from cv_analysis.table_parsing import preprocess, isolate_vertical_and_horizontal_components, \
turn_connected_components_into_rects
from cv_analysis.utils.display import show_image_mpl
def area_is_bigger_than(rect: tuple, maxarea=100000):
    """True when the rect's pixel area (w * h) reaches *maxarea*."""
    width, height = rect[2], rect[3]
    return width * height >= maxarea
def define_rect(rect_img, original_position):
    """Heuristically label one layout rect (WIP variant).

    Returns one of: "header", "footer", "table", "text", "other".
    """
    # NOTE(review): the show_image_mpl/print calls below are debug output
    # left in during development -- remove before production use.
    show_image_mpl(rect_img)
    x, y, w, h = original_position
    if is_header(y + h):
        print(original_position, " is header")
        return "header"
    elif is_footer(y):
        print(original_position, " is footer")
        return "footer"
    elif is_table(rect_img):
        print(original_position, " is table")
        return "table"
    elif is_text(rect_img):
        print(original_position, " is text")
        return "text"
    else:
        return "other"
def is_table(rect_img):
    """Heuristic: does *rect_img* contain a table grid?

    A rect counts as a table when its reinforced line grid yields more
    than one cell component inside exactly one outer contour covering at
    least 75% of the rect's area.
    """
    rect_img_inv = preprocess(rect_img)
    grid_inv = isolate_vertical_and_horizontal_components(rect_img_inv)
    cnts, _ = cv2.findContours(image=grid_inv, mode=cv2.RETR_EXTERNAL, method=cv2.CHAIN_APPROX_SIMPLE)
    if cnts:
        rects = turn_connected_components_into_rects(grid_inv)
        rects = map(lambda r: r.xywh(), rects)
        bbox = list((cv2.boundingRect(c) for c in cnts))
        if len(list(rects)) > 1 and len(bbox) == 1:
            x, y, w, h = bbox[0]
            # NOTE(review): np.ndarray.shape is (rows, cols); these names are
            # swapped, but only the product is used, so the result is the same.
            w_img, h_img = rect_img.shape
            if w * h / (w_img * h_img) >= 0.75:
                return True
            else:
                print(" table detected but to small for layout rect, so cant be table, maybe figure?")
                return False
        else:
            print("single cell or no connected components, maybe figure?")
            return False
    else:
        print("not a table, but text?")
        return False
def is_header(y, threshold=200):
    """True when *y* lies in the page's header band (above *threshold* px).

    Threshold was hard-coded at 200; now parameterized, defaults unchanged.
    """
    return y < threshold
def is_footer(y, threshold=2150):
    """True when *y* lies below the footer line at *threshold* px.

    Threshold was hard-coded at 2150 (tuned to this project's page raster
    size -- TODO confirm for other DPIs); now parameterized, defaults
    unchanged.
    """
    return y > threshold
def is_text(img):
    """True when, after removing primary text regions, almost no ink
    remains (non-zero pixel density < 5%) -- i.e. the rect is mostly text.

    The unconditional show_image_mpl() debug displays were removed: a
    predicate must not open GUI windows as a side effect.
    """
    cleaned = remove_primary_text_regions(img)
    return pixel_density(cleaned) < 0.05
def pixel_density(img):
    """Return the share of non-zero pixels in *img*."""
    return np.count_nonzero(img) / img.size
def annotate_rect(rect, rect_img):
    """WIP stub -- not implemented; intended behavior not yet defined."""
    pass
def label_rects(rects: Iterable[tuple], image: np.array):
    """Label every layout rect on *image* (WIP variant).

    Args:
        rects: iterable of (x, y, w, h) layout rects.
        image: full page image.

    Returns:
        Iterator of label strings, one per rect (same order as *rects*).
    """

    def crop_image_rects(rect):
        x, y, w, h = rect
        return image[y:y + h, x:x + w]

    # Materialize first: zipping a map over `rects` with `rects` itself
    # pulls alternately from the SAME iterator when `rects` is a generator
    # and mispairs crops with positions.
    rects = list(rects)
    rect_images = map(crop_image_rects, rects)
    # Dropped the debug print: it only showed the lazy starmap's repr.
    return starmap(define_rect, zip(rect_images, rects))

View File

@ -1,23 +1,12 @@
from json import dumps
from typing import Iterable
from typing import Iterable
import numpy as np
from funcy import identity # type: ignore
from funcy import identity
class Rectangle:
def __init__(
self,
x1=None,
y1=None,
w=None,
h=None,
x2=None,
y2=None,
indent=4,
format="xywh",
discrete=True,
):
def __init__(self, x1=None, y1=None, w=None, h=None, x2=None, y2=None, indent=4, format="xywh", discrete=True):
make_discrete = int if discrete else identity
try:
@ -122,13 +111,7 @@ class Rectangle:
@classmethod
def from_dict_xywh(cls, xywh_dict, discrete=True):
return cls(
x1=xywh_dict["x"],
y1=xywh_dict["y"],
w=xywh_dict["width"],
h=xywh_dict["height"],
discrete=discrete,
)
return cls(x1=xywh_dict["x"], y1=xywh_dict["y"], w=xywh_dict["width"], h=xywh_dict["height"], discrete=discrete)
def __str__(self):
return dumps(self.json(), indent=self.indent)

View File

@ -1,7 +1,5 @@
from typing import Iterable
import numpy as np
from cv_analysis.utils.structures import Rectangle
@ -28,6 +26,7 @@ def compute_page_iou(results_boxes: Iterable[Rectangle], ground_truth_boxes: Ite
def compute_document_score(results_dict, annotation_dict):
page_weights = np.array([len(page["cells"]) for page in annotation_dict["pages"]])
page_weights = page_weights / sum(page_weights)

View File

@ -1,8 +1,9 @@
import cv2
from numpy import generic
import cv2
def copy_and_normalize_channels(image):
image = image.copy()
try:
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)

View File

@ -1,11 +1,9 @@
import os
from pyinfra.config.loader import load_settings # type: ignore
from cv_analysis.config import get_config
from cv_analysis.utils.display import save_image
settings = get_config()
CV_CONFIG = get_config()
class VisualLogger:
@ -41,4 +39,4 @@ class VisualLogger:
return self.level == "ALL"
vizlogger = VisualLogger(settings.logging.visual_logging_level, settings.logging.visual_logging_output_folder)
vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder)

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,30 +0,0 @@
#!/bin/bash
# Bootstrap a local dev environment: install the requested Python via pyenv,
# configure poetry against the GitLab package registries, then install deps.
#
# Usage: <script> <python_version> <gitlab_user> <gitlab_personal_access_token>
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3
# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created
# cd $latest_dir
pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version
pip install --upgrade pip
pip install poetry
poetry config installer.max-workers 10
# research package registry
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
# redactmanager package registry
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}
# Point poetry at the pyenv interpreter and install everything (incl. dev deps).
poetry env use $(pyenv which python)
poetry install --with=dev
poetry update
source .venv/bin/activate

View File

@ -28,4 +28,4 @@ services:
volumes:
- /opt/bitnami/rabbitmq/.rabbitmq/:/data/bitnami
volumes:
mdata:
mdata:

View File

@ -1,4 +0,0 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 04e9c6c5d3e412413c2949e598da60dc
tags: 645f666f9bcd5a90fca523b33c5a78b7

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,657 +0,0 @@
<!DOCTYPE html>
<html lang="en" data-content_root="./" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>cv-analysis - Visual (CV-Based) Document Parsing &#8212; CV Analysis Service 2.5.2 documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="_static/styles/theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="_static/styles/bootstrap.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="_static/styles/pydata-sphinx-theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link href="_static/vendor/fontawesome/6.5.1/css/all.min.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.1/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.1/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.1/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=a746c00c" />
<link rel="stylesheet" type="text/css" href="https://assets.readthedocs.org/static/css/badge_only.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae" />
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae" />
<script src="_static/vendor/fontawesome/6.5.1/js/all.min.js?digest=8d27b9dea8ad943066ae"></script>
<script src="_static/documentation_options.js?v=afc61bbc"></script>
<script src="_static/doctools.js?v=9a2dae69"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'README';</script>
<script async="async" src="https://assets.readthedocs.org/static/javascript/readthedocs-doc-embed.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="cv_analysis package" href="modules/cv_analysis.html" />
<link rel="prev" title="Welcome to CV Analysis Service documentation!" href="index.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<!-- RTD Extra Head -->
<link rel="stylesheet" href="https://assets.readthedocs.org/static/css/readthedocs-doc-embed.css" type="text/css" />
<script type="application/json" id="READTHEDOCS_DATA">{"ad_free": "", "api_host": "", "builder": "sphinx", "canonical_url": "", "docroot": "", "features": {"docsearch_disabled": false}, "global_analytics_code": null, "language": "", "page": "README", "programming_language": "", "project": "", "source_suffix": ".md", "subprojects": {}, "theme": "", "user_analytics_code": null, "version": ""}</script>
<!--
Using this variable directly instead of using `JSON.parse` is deprecated.
The READTHEDOCS_DATA global variable will be removed in the future.
-->
<script type="text/javascript">
READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerHTML);
</script>
<script type="text/javascript" src="https://assets.readthedocs.org/static/javascript/readthedocs-analytics.js" async="async"></script>
<!-- end RTD <extrahead> -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a id="pst-skip-link" class="skip-link" href="#main-content">Skip to main content</a>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>
Back to top
</button>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="index.html">
<img src="_static/logo.png" class="logo__image only-light" alt="CV Analysis Service 2.5.2 documentation - Home"/>
<script>document.write(`<img src="_static/logo.png" class="logo__image only-dark" alt="CV Analysis Service 2.5.2 documentation - Home"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item current active">
<a class="nav-link nav-internal" href="#">
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="modules/cv_analysis.html">
cv_analysis package
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="modules/serve.html">
serve module
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<div class="navbar-item">
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary" tabindex="0">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<nav class="navbar-nav">
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item current active">
<a class="nav-link nav-internal" href="#">
cv-analysis - Visual (CV-Based) Document Parsing
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="modules/cv_analysis.html">
cv_analysis package
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="modules/serve.html">
serve module
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<script>
document.write(`
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
</button>
`);
</script></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"></div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item active" aria-current="page">cv-analysis...</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="cv-analysis-visual-cv-based-document-parsing">
<h1>cv-analysis - Visual (CV-Based) Document Parsing<a class="headerlink" href="#cv-analysis-visual-cv-based-document-parsing" title="Link to this heading">#</a></h1>
<p>parse_pdf()
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.</p>
<section id="api">
<h2>API<a class="headerlink" href="#api" title="Link to this heading">#</a></h2>
<p>Input message:</p>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="nt">&quot;targetFilePath&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="nt">&quot;pdf&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;absolute file path&quot;</span><span class="p">,</span>
<span class="w"> </span><span class="nt">&quot;vlp_output&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;absolute file path&quot;</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="nt">&quot;responseFilePath&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;absolute file path&quot;</span><span class="p">,</span>
<span class="w"> </span><span class="nt">&quot;operation&quot;</span><span class="p">:</span><span class="w"> </span><span class="s2">&quot;table_image_inference&quot;</span>
<span class="p">}</span>
</pre></div>
</div>
<p>Response is uploaded to the storage as specified in the <code class="docutils literal notranslate"><span class="pre">responseFilePath</span></code> field. The structure is as follows:</p>
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
<span class="w"> </span><span class="err">...</span><span class="p">,</span>
<span class="w"> </span><span class="nt">&quot;data&quot;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="err">&#39;pageNum&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;bbox&#39;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">55.3407</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">247.0246</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">558.5602</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mf">598.0585</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="err">&#39;uuid&#39;</span><span class="p">:</span><span class="w"> </span><span class="err">&#39;</span><span class="mi">2</span><span class="err">b</span><span class="mi">10</span><span class="err">c</span><span class="mi">1</span><span class="err">a</span><span class="mi">2-393</span><span class="err">c</span><span class="mi">-4</span><span class="kc">f</span><span class="err">ca</span><span class="mi">-</span><span class="err">b</span><span class="mf">9e3-0</span><span class="err">ad</span><span class="mi">5</span><span class="err">b</span><span class="mi">774</span><span class="err">ac</span><span class="mi">84</span><span class="err">&#39;</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;label&#39;</span><span class="p">:</span><span class="w"> </span><span class="err">&#39;</span><span class="kc">ta</span><span class="err">ble&#39;</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;</span><span class="kc">ta</span><span class="err">bleLi</span><span class="kc">nes</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
<span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">1</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;x</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">1399</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;y</span><span class="mi">2</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="err">...</span>
<span class="w"> </span><span class="p">],</span>
<span class="w"> </span><span class="err">&#39;imageI</span><span class="kc">nf</span><span class="err">o&#39;</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="err">&#39;heigh</span><span class="kc">t</span><span class="err">&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">693</span><span class="p">,</span>
<span class="w"> </span><span class="err">&#39;wid</span><span class="kc">t</span><span class="err">h&#39;</span><span class="p">:</span><span class="w"> </span><span class="mi">1414</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="p">},</span>
<span class="w"> </span><span class="err">...</span>
<span class="w"> </span><span class="p">]</span>
<span class="p">}</span>
</pre></div>
</div>
</section>
<section id="installation">
<h2>Installation<a class="headerlink" href="#installation" title="Link to this heading">#</a></h2>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>git<span class="w"> </span>clone<span class="w"> </span>ssh://git@git.iqser.com:2222/rr/cv-analysis.git
<span class="nb">cd</span><span class="w"> </span>cv-analysis
python<span class="w"> </span>-m<span class="w"> </span>venv<span class="w"> </span>env
<span class="nb">source</span><span class="w"> </span>env/bin/activate
pip<span class="w"> </span>install<span class="w"> </span>-e<span class="w"> </span>.
pip<span class="w"> </span>install<span class="w"> </span>-r<span class="w"> </span>requirements.txt
dvc<span class="w"> </span>pull
</pre></div>
</div>
</section>
<section id="usage">
<h2>Usage<a class="headerlink" href="#usage" title="Link to this heading">#</a></h2>
<section id="as-an-api">
<h3>As an API<a class="headerlink" href="#as-an-api" title="Link to this heading">#</a></h3>
<p>The module provides functions for the individual tasks that all return some kind of collection of points, depending on
the specific task.</p>
<section id="redaction-detection-api">
<h4>Redaction Detection (API)<a class="headerlink" href="#redaction-detection-api" title="Link to this heading">#</a></h4>
<p>The below snippet shows how to find the outlines of previous redactions.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">cv_analysis.redaction_detection</span> <span class="kn">import</span> <span class="n">find_redactions</span>
<span class="kn">import</span> <span class="nn">pdf2image</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="n">pdf_path</span> <span class="o">=</span> <span class="o">...</span>
<span class="n">page_index</span> <span class="o">=</span> <span class="o">...</span>
<span class="n">page</span> <span class="o">=</span> <span class="n">pdf2image</span><span class="o">.</span><span class="n">convert_from_path</span><span class="p">(</span><span class="n">pdf_path</span><span class="p">,</span> <span class="n">first_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">,</span> <span class="n">last_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">page</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
<span class="n">redaction_contours</span> <span class="o">=</span> <span class="n">find_redactions</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
</section>
<section id="as-a-cli-tool">
<h2>As a CLI Tool<a class="headerlink" href="#as-a-cli-tool" title="Link to this heading">#</a></h2>
<p>Core API functionalities can be used through a CLI.</p>
<section id="table-parsing">
<h3>Table Parsing<a class="headerlink" href="#table-parsing" title="Link to this heading">#</a></h3>
<p>The table parsing utility detects and segments tables into individual cells.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>table
</pre></div>
</div>
<p>The below image shows a parsed table, where each table cell has been detected individually.</p>
<p><img alt="Table Parsing Demonstration" src="_images/table_parsing.png" /></p>
</section>
<section id="redaction-detection-cli">
<h3>Redaction Detection (CLI)<a class="headerlink" href="#redaction-detection-cli" title="Link to this heading">#</a></h3>
<p>The redaction detection utility detects previous redactions in PDFs (filled black rectangles).</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">2</span><span class="w"> </span>--type<span class="w"> </span>redaction
</pre></div>
</div>
<p>The below image shows the detected redactions with green outlines.</p>
<p><img alt="Redaction Detection Demonstration" src="_images/redaction_detection.png" /></p>
</section>
<section id="layout-parsing">
<h3>Layout Parsing<a class="headerlink" href="#layout-parsing" title="Link to this heading">#</a></h3>
<p>The layout parsing utility detects elements such as paragraphs, tables and figures.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>layout
</pre></div>
</div>
<p>The below image shows the detected layout elements on a page.</p>
<p><img alt="Layout Parsing Demonstration" src="_images/layout_parsing.png" /></p>
</section>
<section id="figure-detection">
<h3>Figure Detection<a class="headerlink" href="#figure-detection" title="Link to this heading">#</a></h3>
<p>The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">3</span><span class="w"> </span>--type<span class="w"> </span>figure
</pre></div>
</div>
<p>The below image shows the detected figure on a page.</p>
<p><img alt="Figure Detection Demonstration" src="_images/figure_detection.png" /></p>
</section>
</section>
<section id="running-as-a-service">
<h2>Running as a service<a class="headerlink" href="#running-as-a-service" title="Link to this heading">#</a></h2>
<section id="building">
<h3>Building<a class="headerlink" href="#building" title="Link to this heading">#</a></h3>
<p>Build base image</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>bash<span class="w"> </span>setup/docker.sh
</pre></div>
</div>
<p>Build head image</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>build<span class="w"> </span>-f<span class="w"> </span>Dockerfile<span class="w"> </span>-t<span class="w"> </span>cv-analysis<span class="w"> </span>.<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">BASE_ROOT</span><span class="o">=</span><span class="s2">&quot;&quot;</span>
</pre></div>
</div>
</section>
<section id="usage-service">
<h3>Usage (service)<a class="headerlink" href="#usage-service" title="Link to this heading">#</a></h3>
<p>Shell 1</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>run<span class="w"> </span>--rm<span class="w"> </span>--net<span class="o">=</span>host<span class="w"> </span>--rm<span class="w"> </span>cv-analysis
</pre></div>
</div>
<p>Shell 2</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/client_mock.py<span class="w"> </span>--pdf_path<span class="w"> </span>/path/to/a/pdf
</pre></div>
</div>
</section>
</section>
</section>
</article>
<footer class="prev-next-footer">
<div class="prev-next-area">
<a class="left-prev"
href="index.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Welcome to CV Analysis Service documentation!</p>
</div>
</a>
<a class="right-next"
href="modules/cv_analysis.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">cv_analysis package</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div
id="pst-page-navigation-heading-2"
class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#api">API</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#installation">Installation</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#usage">Usage</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#as-an-api">As an API</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-api">Redaction Detection (API)</a></li>
</ul>
</li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#as-a-cli-tool">As a CLI Tool</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#table-parsing">Table Parsing</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-cli">Redaction Detection (CLI)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#layout-parsing">Layout Parsing</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#figure-detection">Figure Detection</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#running-as-a-service">Running as a service</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#building">Building</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#usage-service">Usage (service)</a></li>
</ul>
</li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div class="tocsection sourcelink">
<a href="_sources/README.md.txt">
<i class="fa-solid fa-file-lines"></i> Show Source
</a>
</div>
</div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae"></script>
<script src="_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<p class="copyright">
© Copyright All rights reserved.
<br/>
</p>
</div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 7.3.7.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item">
<p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.15.2.
</p></div>
</div>
</div>
</footer>
</body>
</html>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 707 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 568 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.2 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 566 KiB

View File

@ -1,178 +0,0 @@
# cv-analysis - Visual (CV-Based) Document Parsing
parse_pdf()
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
previous redactions in documents.
## API
Input message:
```json
{
"targetFilePath": {
"pdf": "absolute file path",
"vlp_output": "absolute file path"
},
"responseFilePath": "absolute file path",
"operation": "table_image_inference"
}
```
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
```json
{
...,
"data": [
{
'pageNum': 0,
'bbox': {
'x1': 55.3407,
'y1': 247.0246,
'x2': 558.5602,
'y2': 598.0585
},
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
'label': 'table',
'tableLines': [
{
'x1': 0,
'y1': 16,
'x2': 1399,
'y2': 16
},
...
],
'imageInfo': {
'height': 693,
'width': 1414
}
},
...
]
}
```
## Installation
```bash
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
cd cv-analysis
python -m venv env
source env/bin/activate
pip install -e .
pip install -r requirements.txt
dvc pull
```
## Usage
### As an API
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
the specific task.
#### Redaction Detection (API)
The below snippet shows how to find the outlines of previous redactions.
```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import numpy as np
pdf_path = ...
page_index = ...
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)
redaction_contours = find_redactions(page)
```
## As a CLI Tool
Core API functionalities can be used through a CLI.
### Table Parsing
The table parsing utility detects and segments tables into individual cells.
```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type table
```
The below image shows a parsed table, where each table cell has been detected individually.
![Table Parsing Demonstration](data/table_parsing.png)
### Redaction Detection (CLI)
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
```bash
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
```
The below image shows the detected redactions with green outlines.
![Redaction Detection Demonstration](data/redaction_detection.png)
### Layout Parsing
The layout parsing utility detects elements such as paragraphs, tables and figures.
```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
```
The below image shows the detected layout elements on a page.
![Layout Parsing Demonstration](data/layout_parsing.png)
### Figure Detection
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
```bash
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
```
The below image shows the detected figure on a page.
![Figure Detection Demonstration](data/figure_detection.png)
## Running as a service
### Building
Build base image
```bash
bash setup/docker.sh
```
Build head image
```bash
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
```
### Usage (service)
Shell 1
```bash
docker run --rm --net=host --rm cv-analysis
```
Shell 2
```bash
python scripts/client_mock.py --pdf_path /path/to/a/pdf
```

View File

@ -1,37 +0,0 @@
.. Keyword Extraction Service documentation master file, created by
sphinx-quickstart on Mon Sep 12 12:04:24 2022.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
=============================================
Welcome to CV Analysis Service documentation!
=============================================
.. note::
If you'd like to change the looks of things 👉 https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
Table of Contents
-----------------
.. toctree::
:maxdepth: 3
:caption: README
README.md
.. toctree::
:maxdepth: 3
:caption: Modules
modules/cv_analysis
modules/serve
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@ -1,7 +0,0 @@
cv\_analysis.config module
==========================
.. automodule:: cv_analysis.config
:members:
:undoc-members:
:show-inheritance:

View File

@ -1,7 +0,0 @@
cv\_analysis.figure\_detection.figure\_detection module
=======================================================
.. automodule:: cv_analysis.figure_detection.figure_detection
:members:
:undoc-members:
:show-inheritance:

View File

@ -1,7 +0,0 @@
cv\_analysis.figure\_detection.figures module
=============================================
.. automodule:: cv_analysis.figure_detection.figures
:members:
:undoc-members:
:show-inheritance:

View File

@ -1,17 +0,0 @@
cv\_analysis.figure\_detection package
======================================
.. automodule:: cv_analysis.figure_detection
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
cv_analysis.figure_detection.figure_detection
cv_analysis.figure_detection.figures
cv_analysis.figure_detection.text

View File

@ -1,7 +0,0 @@
cv\_analysis.figure\_detection.text module
==========================================
.. automodule:: cv_analysis.figure_detection.text
:members:
:undoc-members:
:show-inheritance:

View File

@ -1,7 +0,0 @@
cv\_analysis.layout\_parsing module
===================================
.. automodule:: cv_analysis.layout_parsing
:members:
:undoc-members:
:show-inheritance:

View File

@ -1,7 +0,0 @@
cv\_analysis.locations module
=============================
.. automodule:: cv_analysis.locations
:members:
:undoc-members:
:show-inheritance:

Some files were not shown because too many files have changed in this diff Show More