Compare commits
152 Commits
texture_co
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
799fe331c3 | ||
|
|
dfbfc50556 | ||
|
|
63fbd387a3 | ||
|
|
41dbfc69d9 | ||
|
|
b73e9b2ed9 | ||
|
|
92692281ce | ||
|
|
cb0c58d699 | ||
|
|
eb96403fe2 | ||
|
|
c8daf888c6 | ||
|
|
eb921c365d | ||
|
|
7762f81a4a | ||
|
|
e991cfe1bf | ||
|
|
35c5ee5831 | ||
|
|
e97f34391a | ||
|
|
1fa10721aa | ||
|
|
7f0d0a48db | ||
|
|
333cd498b9 | ||
|
|
9df8c8f936 | ||
|
|
60adf0c381 | ||
|
|
537f605a85 | ||
|
|
66987ab8e9 | ||
|
|
43570142c3 | ||
|
|
d457f49001 | ||
|
|
536928c032 | ||
|
|
dc6183490f | ||
|
|
bbc2d0c8bf | ||
|
|
3462faf8c7 | ||
|
|
b136cc9ff3 | ||
|
|
cf431df1cb | ||
|
|
23406004ed | ||
|
|
b0467f2335 | ||
|
|
e86214f6b7 | ||
|
|
3b8d6eda04 | ||
|
|
3c9ddfcf0f | ||
|
|
b854312b08 | ||
|
|
0f45a25bc8 | ||
|
|
8762363aa9 | ||
|
|
72d26c4712 | ||
|
|
62fb637978 | ||
|
|
802372a504 | ||
|
|
ceb1c00784 | ||
|
|
f1f9e8d2bc | ||
|
|
8fcb6f29fb | ||
|
|
79926b9990 | ||
|
|
6d37622e95 | ||
|
|
6341512250 | ||
|
|
713697b32d | ||
|
|
b6e2540399 | ||
|
|
78b8f18865 | ||
|
|
55795b9e58 | ||
|
|
d2ec32b37c | ||
|
|
3202d95638 | ||
|
|
8c1e30c6df | ||
|
|
127fd7a399 | ||
|
|
560c73a5cb | ||
|
|
d821b93af9 | ||
|
|
2f20ec4ecd | ||
|
|
c2027df1c7 | ||
|
|
a966b49f89 | ||
|
|
8d81551da3 | ||
|
|
626da20afd | ||
|
|
a55b34379a | ||
|
|
2c5c3669a4 | ||
|
|
55b8e209d3 | ||
|
|
ab5096dd86 | ||
|
|
3a5fc32ec8 | ||
|
|
2c6232a1bf | ||
|
|
b43033e6bf | ||
|
|
5d13d8b3d0 | ||
|
|
f213a16cd0 | ||
|
|
9e04693ee1 | ||
|
|
fee357872f | ||
|
|
12bb7ee25f | ||
|
|
f7a0db2651 | ||
|
|
1d3b077ace | ||
|
|
102617fe2f | ||
|
|
0f0fe516d0 | ||
|
|
8de913840f | ||
|
|
aefb73bf28 | ||
|
|
20f8dcd336 | ||
|
|
681e59d24e | ||
|
|
abd350cc42 | ||
|
|
e264c948cf | ||
|
|
ddd680bb4c | ||
|
|
ebdf3cefbf | ||
|
|
ffb10876f5 | ||
|
|
95abb5d5fb | ||
|
|
482673f927 | ||
|
|
a52226d8fe | ||
|
|
fa959332cb | ||
|
|
688217f3cd | ||
|
|
183aad4bf8 | ||
|
|
0a11471191 | ||
|
|
55fb4e06f2 | ||
|
|
306c9b67cf | ||
|
|
60b1c15f82 | ||
|
|
940d7b9277 | ||
|
|
d1c2610bd5 | ||
|
|
50831036f5 | ||
|
|
726aae03a6 | ||
|
|
423842a4c9 | ||
|
|
6426c14fb7 | ||
|
|
6070736df9 | ||
|
|
295a5dea77 | ||
|
|
515cd2309b | ||
|
|
be65ea4ff5 | ||
|
|
85885f929b | ||
|
|
fc7d4ee829 | ||
|
|
83a922deed | ||
|
|
efcd661948 | ||
|
|
4c4ed8ba1e | ||
|
|
415d2b135b | ||
|
|
5538f12d3f | ||
|
|
a08799d7b8 | ||
|
|
db55d4ccf9 | ||
|
|
76940a28ba | ||
|
|
5331cb7c5b | ||
|
|
d44ed1c596 | ||
|
|
384d4b6f73 | ||
|
|
861c3e347e | ||
|
|
9c753fede3 | ||
|
|
fa93255ba1 | ||
|
|
f743bf6171 | ||
|
|
4ee343f6df | ||
|
|
335da13cb5 | ||
|
|
441814f201 | ||
|
|
f9a9a86bc7 | ||
|
|
d98f38607f | ||
|
|
63d2f891e4 | ||
|
|
cb974b19b6 | ||
|
|
019f0da11a | ||
|
|
9adc0e2ced | ||
|
|
11515f6f71 | ||
|
|
ee5f960a3f | ||
|
|
5b991d3a69 | ||
|
|
6033fec952 | ||
|
|
c64f02696d | ||
|
|
79163c33cf | ||
|
|
44dd613715 | ||
|
|
3654ab3c8d | ||
|
|
bb6ba8e0e9 | ||
|
|
6323884683 | ||
|
|
def2d2d108 | ||
|
|
cfbd2e287a | ||
|
|
436824c926 | ||
|
|
1a4ae6735d | ||
|
|
08c0096c07 | ||
|
|
233c6facfd | ||
|
|
4ce6c9bdc9 | ||
|
|
5bb9282da6 | ||
|
|
eef371e2a8 | ||
|
|
ad45e2c1da |
@ -10,7 +10,7 @@ omit =
|
||||
*/build_venv/*
|
||||
*/incl/*
|
||||
source =
|
||||
cv_analysis
|
||||
cv_analysis
|
||||
relative_files = True
|
||||
data_file = .coverage
|
||||
|
||||
@ -46,4 +46,4 @@ ignore_errors = True
|
||||
directory = reports
|
||||
|
||||
[xml]
|
||||
output = reports/coverage.xml
|
||||
output = reports/coverage.xml
|
||||
|
||||
@ -97,4 +97,4 @@ target/
|
||||
*.swp
|
||||
*/*.swp
|
||||
*/*/*.swp
|
||||
*/*/*/*.swp
|
||||
*/*/*/*.swp
|
||||
|
||||
@ -1,7 +1,10 @@
|
||||
[core]
|
||||
remote = vector
|
||||
autostage = true
|
||||
remote = azure_remote
|
||||
['remote "vector"']
|
||||
url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
|
||||
port = 22
|
||||
|
||||
['remote "azure_remote"']
|
||||
url = azure://cv-sa-dvc/
|
||||
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=KOuTAQ6Mp00ePTT5ObYmgaHlxwS1qukY4QU4Kuk7gy/vldneA+ZiKjaOpEFtqKA6Mtym2gQz8THy+ASts/Y1Bw==;EndpointSuffix=core.windows.net"
|
||||
['remote "local"']
|
||||
url = ../dvc_local_remote
|
||||
|
||||
77
.gitignore
vendored
77
.gitignore
vendored
@ -1,27 +1,52 @@
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
.pytest*
|
||||
.python-version
|
||||
.DS_Store
|
||||
|
||||
# Project folders
|
||||
scratch/
|
||||
*.vscode/
|
||||
.idea
|
||||
*_app
|
||||
*pytest_cache
|
||||
*joblib
|
||||
*tmp
|
||||
*profiling
|
||||
*logs
|
||||
*docker
|
||||
*drivers
|
||||
*bamboo-specs/target
|
||||
|
||||
# Python specific files
|
||||
__pycache__/
|
||||
*.egg-info/
|
||||
deskew_model/
|
||||
build_venv/
|
||||
/pdfs/
|
||||
/results/
|
||||
/pdfs/
|
||||
/env/
|
||||
/.idea/
|
||||
/.idea/.gitignore
|
||||
/.idea/misc.xml
|
||||
/.idea/inspectionProfiles/profiles_settings.xml
|
||||
/.idea/table_parsing.iml
|
||||
/.idea/vcs.xml
|
||||
/results/
|
||||
/table_parsing.egg-info
|
||||
/target/
|
||||
/tests/
|
||||
/cv_analysis.egg-info/dependency_links.txt
|
||||
/cv_analysis.egg-info/PKG-INFO
|
||||
/cv_analysis.egg-info/SOURCES.txt
|
||||
/cv_analysis.egg-info/top_level.txt
|
||||
/.vscode/
|
||||
/cv_analysis/test/test_data/example_pages.json
|
||||
/data/metadata_testing_files.csv
|
||||
.coverage
|
||||
/data/
|
||||
*.py[cod]
|
||||
*.ipynb
|
||||
*.ipynb_checkpoints
|
||||
|
||||
# file extensions
|
||||
*.log
|
||||
*.csv
|
||||
*.json
|
||||
*.pkl
|
||||
*.profile
|
||||
*.cbm
|
||||
|
||||
# temp files
|
||||
*.swp
|
||||
*~
|
||||
*.un~
|
||||
|
||||
# keep files
|
||||
!notebooks/*.ipynb
|
||||
|
||||
# keep folders
|
||||
!secrets
|
||||
!data/*
|
||||
!drivers
|
||||
|
||||
# unignore files
|
||||
!bom.*
|
||||
|
||||
30
.gitlab-ci.backup.yml
Normal file
30
.gitlab-ci.backup.yml
Normal file
@ -0,0 +1,30 @@
|
||||
include:
|
||||
- project: "Gitlab/gitlab"
|
||||
ref: 0.3.0
|
||||
file: "/ci-templates/research/dvc-versioning-build-release.gitlab-ci.yml"
|
||||
|
||||
variables:
|
||||
NEXUS_PROJECT_DIR: red
|
||||
IMAGENAME: "${CI_PROJECT_NAME}"
|
||||
|
||||
#################################
|
||||
# temp. disable integration tests, b/c they don't cover the CV analysis case yet
|
||||
trigger integration tests:
|
||||
rules:
|
||||
- when: never
|
||||
|
||||
release build:
|
||||
stage: release
|
||||
needs:
|
||||
- job: set custom version
|
||||
artifacts: true
|
||||
optional: true
|
||||
- job: calculate patch version
|
||||
artifacts: true
|
||||
optional: true
|
||||
- job: calculate minor version
|
||||
artifacts: true
|
||||
optional: true
|
||||
- job: build docker nexus
|
||||
artifacts: true
|
||||
#################################
|
||||
35
.gitlab-ci.yml
Normal file
35
.gitlab-ci.yml
Normal file
@ -0,0 +1,35 @@
|
||||
# CI for services, check gitlab repo for python package CI
|
||||
include:
|
||||
- project: "Gitlab/gitlab"
|
||||
ref: main
|
||||
file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
|
||||
- project: "Gitlab/gitlab"
|
||||
ref: main
|
||||
file: "/ci-templates/research/docs.gitlab-ci.yml"
|
||||
|
||||
# set project variables here
|
||||
variables:
|
||||
NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
|
||||
IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1
|
||||
|
||||
pages:
|
||||
only:
|
||||
- master # KEEP THIS, necessary because `master` branch and not `main` branch
|
||||
|
||||
###################
|
||||
# INTEGRATION TESTS
|
||||
trigger-integration-tests:
|
||||
extends: .integration-tests
|
||||
# ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
|
||||
# needs:
|
||||
# - job: docker-build::model_name
|
||||
# artifacts: true
|
||||
rules:
|
||||
- when: never
|
||||
|
||||
#########
|
||||
# RELEASE
|
||||
release:
|
||||
extends: .release
|
||||
needs:
|
||||
- !reference [.needs-versioning, needs] # leave this line as is
|
||||
61
.hooks/poetry_version_check.py
Normal file
61
.hooks/poetry_version_check.py
Normal file
@ -0,0 +1,61 @@
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import semver
|
||||
from loguru import logger
|
||||
from semver.version import Version
|
||||
|
||||
logger.remove()
|
||||
logger.add(sys.stdout, level="INFO")
|
||||
|
||||
|
||||
def bashcmd(cmds: list) -> str:
    """Run *cmds* as a subprocess and return its stdout with trailing newlines stripped.

    Args:
        cmds: argv-style command list, e.g. ``["git", "tag", "-l"]``.

    Returns:
        The command's captured stdout, stripped of trailing newlines.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero (``check=True``).
        OSError: if the executable cannot be found or started.
    """
    try:
        logger.debug(f"running: {' '.join(cmds)}")
        return subprocess.run(cmds, check=True, capture_output=True, text=True).stdout.strip("\n")
    except (subprocess.CalledProcessError, OSError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
        logger.warning(f"Error executing the following bash command: {' '.join(cmds)}.")
        raise
|
||||
|
||||
|
||||
def get_highest_existing_git_version_tag() -> str:
    """Return the highest semver-compatible git tag of the current repository.

    Tags that are not valid semver strings are ignored.

    Returns:
        The highest semver tag, compared by semantic-version ordering
        (not lexicographically).

    Raises:
        ValueError: if the repository has no semver-compatible tags at all.
            Previously this surfaced as an opaque ``max() arg is an empty
            sequence`` error.
    """
    try:
        git_tags = bashcmd(["git", "tag", "-l"]).split()
        semver_compat_tags = list(filter(Version.is_valid, git_tags))
        if not semver_compat_tags:
            raise ValueError("No semver-compatible git tags found in this repository.")
        highest_git_version_tag = max(semver_compat_tags, key=semver.version.Version.parse)
        logger.info(f"Highest git version tag: {highest_git_version_tag}")
        return highest_git_version_tag
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
        logger.warning("Error getting git version tags")
        raise
|
||||
|
||||
|
||||
def auto_bump_version() -> bool:
    """Report whether automated version bumping is enabled.

    The feature toggle is simply the presence of a ``.autoversion`` marker
    file in the current working directory.
    """
    marker = Path(".autoversion")
    active = marker.is_file()
    logger.debug(f"Automated version bump is set to '{active}'")
    return active
|
||||
|
||||
|
||||
def main() -> None:
    """Pre-commit gate: exit 1 unless the poetry version exceeds the highest git tag.

    Reads the poetry project version and the highest semver git tag, then:
      * version > tag  -> log success and return (hook passes);
      * otherwise      -> warn; if auto-bumping is enabled (``.autoversion``
        file present), set the poetry version to the highest git tag and
        exit 0, else exit 1 to block the commit.

    NOTE(review): the auto-bump path sets the poetry version *equal* to the
    highest tag rather than above it, which would still fail this very check
    on the next run — confirm this is the intended behaviour.
    """
    poetry_project_version = bashcmd(["poetry", "version", "-s"])
    logger.info(f"Poetry project version: {poetry_project_version}")

    highest_git_version_tag = get_highest_existing_git_version_tag()

    comparison_result = semver.compare(poetry_project_version, highest_git_version_tag)

    # Guard clause: strictly greater is the success path.
    if comparison_result > 0:
        logger.info(f"All good: {poetry_project_version} > {highest_git_version_tag}")
        return

    logger.warning("Poetry version must be greater than git tag version.")
    if auto_bump_version():
        logger.info(bashcmd(["poetry", "version", highest_git_version_tag]))
        sys.exit(0)
    sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
72
.pre-commit-config.yaml
Normal file
72
.pre-commit-config.yaml
Normal file
@ -0,0 +1,72 @@
|
||||
# See https://pre-commit.com for more information
|
||||
# See https://pre-commit.com/hooks.html for more hooks
|
||||
exclude: ^(docs/|notebooks/|data/|src/configs/|tests/|.hooks/|bom.json)
|
||||
default_language_version:
|
||||
python: python3.10
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-yaml
|
||||
args: [--unsafe] # needed for .gitlab-ci.yml
|
||||
- id: check-toml
|
||||
- id: detect-private-key
|
||||
- id: check-added-large-files
|
||||
args: ['--maxkb=10000']
|
||||
- id: check-case-conflict
|
||||
- id: mixed-line-ending
|
||||
|
||||
# - repo: https://github.com/pre-commit/mirrors-pylint
|
||||
# rev: v3.0.0a5
|
||||
# hooks:
|
||||
# - id: pylint
|
||||
# args:
|
||||
# - --disable=C0111,R0903,E0401
|
||||
# - --max-line-length=120
|
||||
|
||||
- repo: https://github.com/pre-commit/mirrors-isort
|
||||
rev: v5.10.1
|
||||
hooks:
|
||||
- id: isort
|
||||
args:
|
||||
- --profile black
|
||||
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 24.10.0
|
||||
hooks:
|
||||
- id: black
|
||||
# exclude: ^(docs/|notebooks/|data/|src/secrets/)
|
||||
args:
|
||||
- --line-length=120
|
||||
|
||||
- repo: https://github.com/compilerla/conventional-pre-commit
|
||||
rev: v4.0.0
|
||||
hooks:
|
||||
- id: conventional-pre-commit
|
||||
pass_filenames: false
|
||||
stages: [commit-msg]
|
||||
# args: [] # optional: list of Conventional Commits types to allow e.g. [feat, fix, ci, chore, test]
|
||||
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: version-checker
|
||||
name: version-checker
|
||||
entry: python .hooks/poetry_version_check.py
|
||||
language: python
|
||||
always_run: true
|
||||
additional_dependencies:
|
||||
- "semver"
|
||||
- "loguru"
|
||||
|
||||
# - repo: local
|
||||
# hooks:
|
||||
# - id: docker-build-test
|
||||
# name: testing docker build
|
||||
# entry: ./scripts/ops/docker-compose-build-run.sh
|
||||
# language: script
|
||||
# # always_run: true
|
||||
# pass_filenames: false
|
||||
# args: []
|
||||
# stages: [pre-commit]
|
||||
84
Dockerfile
84
Dockerfile
@ -1,30 +1,78 @@
|
||||
FROM python:3.10
|
||||
###############
|
||||
# BUILDER IMAGE
|
||||
FROM python:3.10-slim as builder
|
||||
|
||||
RUN python -m venv /app/venv
|
||||
ENV PATH="/app/venv/bin:$PATH"
|
||||
ARG GITLAB_USER
|
||||
ARG GITLAB_ACCESS_TOKEN
|
||||
|
||||
RUN python -m pip install --upgrade pip
|
||||
ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
|
||||
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
|
||||
|
||||
WORKDIR /app/service
|
||||
ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
|
||||
ARG POETRY_SOURCE_REF_RED=gitlab-red
|
||||
|
||||
COPY ./requirements.txt ./requirements.txt
|
||||
RUN python3 -m pip install -r requirements.txt
|
||||
ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
|
||||
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
|
||||
|
||||
COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
|
||||
RUN python -m pip install -r incl/pyinfra/requirements.txt
|
||||
ARG VERSION=dev
|
||||
|
||||
COPY ./incl/pdf2image/requirements.txt ./incl/pdf2image/requirements.txt
|
||||
RUN python -m pip install -r incl/pdf2image/requirements.txt
|
||||
LABEL maintainer="Research <research@knecon.com>"
|
||||
LABEL version="${VERSION}"
|
||||
|
||||
COPY ./incl ./incl
|
||||
WORKDIR /app
|
||||
|
||||
RUN python3 -m pip install -e incl/pyinfra
|
||||
RUN python3 -m pip install -e incl/pdf2image
|
||||
###########
|
||||
# ENV SETUP
|
||||
ENV PYTHONDONTWRITEBYTECODE=true
|
||||
ENV PYTHONUNBUFFERED=true
|
||||
ENV POETRY_HOME=/opt/poetry
|
||||
ENV PATH="$POETRY_HOME/bin:$PATH"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN curl -sSL https://install.python-poetry.org | python3 -
|
||||
RUN poetry --version
|
||||
|
||||
COPY pyproject.toml poetry.lock ./
|
||||
|
||||
RUN poetry config virtualenvs.create true && \
|
||||
poetry config virtualenvs.in-project true && \
|
||||
poetry config installer.max-workers 10 && \
|
||||
poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
|
||||
poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||
poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
|
||||
poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||
poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
|
||||
poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
|
||||
poetry install --without=dev,docs,test -vv --no-interaction --no-root
|
||||
|
||||
##################
|
||||
# COPY SOURCE CODE
|
||||
COPY ./config ./config
|
||||
COPY ./src ./src
|
||||
COPY ./cv_analysis ./cv_analysis
|
||||
COPY ./setup.py ./setup.py
|
||||
|
||||
RUN python3 -m pip install -e .
|
||||
###############
|
||||
# WORKING IMAGE
|
||||
FROM python:3.10-slim
|
||||
|
||||
CMD ["python3", "-u", "src/serve.py"]
|
||||
# COPY BILL OF MATERIALS (BOM)
|
||||
COPY bom.json /bom.json
|
||||
|
||||
# COPY SOURCE CODE FROM BUILDER IMAGE
|
||||
COPY --from=builder /app /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENV PATH="/app/.venv/bin:$PATH"
|
||||
|
||||
############
|
||||
# NETWORKING
|
||||
EXPOSE 5000
|
||||
EXPOSE 8080
|
||||
|
||||
################
|
||||
# LAUNCH COMMAND
|
||||
CMD [ "python", "src/serve.py"]
|
||||
|
||||
94
Makefile
Normal file
94
Makefile
Normal file
@ -0,0 +1,94 @@
|
||||
# Developer task runner for the cv-analysis service.
# All commands should be executed in the root dir of the project,
# specific environments should be deactivated.

.PHONY: \
	poetry in-project-venv dev-env use-env install install-dev tests \
	update-version sync-version-with-git \
	docker docker-build-run docker-build docker-run \
	docker-rm docker-rm-container docker-rm-image \
	pre-commit get-licenses prep-commit \
	docs sphinx_html sphinx_apidoc bom
# NOTE(review): no `run` target is defined in this Makefile — confirm the
# intended default goal (running `make` with no target will currently fail).
.DEFAULT_GOAL := run

export DOCKER=docker
export DOCKERFILE=Dockerfile
export IMAGE_NAME=cv_analysis_service-image
export CONTAINER_NAME=cv_analysis_service-container
export HOST_PORT=9999
export CONTAINER_PORT=9999
export PYTHON_VERSION=python3.10

# Bootstrap a poetry-managed in-project virtualenv with dev dependencies.
poetry: in-project-venv use-env dev-env

in-project-venv:
	poetry config virtualenvs.in-project true

use-env:
	poetry env use ${PYTHON_VERSION}

dev-env:
	poetry install --with dev && poetry update

# Usage: make install pkg=<package>
install:
	poetry add $(pkg)

# Usage: make install-dev pkg=<package>
install-dev:
	poetry add --dev $(pkg)

requirements:
	poetry export --without-hashes --output requirements.txt

update-version:
	poetry version prerelease

sync-version-with-git:
	git pull -p && poetry version $(git rev-list --tags --max-count=1 | git describe --tags --abbrev=0)

# Generate the CycloneDX bill of materials.
# (A duplicate `bom:` target was removed — make warned about an overridden recipe.)
bom:
	cyclonedx-py poetry -o bom.json

docker: docker-rm docker-build-run

docker-build-run: docker-build docker-run

docker-build:
	$(DOCKER) build \
		--no-cache --progress=plain \
		-t $(IMAGE_NAME) -f $(DOCKERFILE) \
		--build-arg USERNAME=${USERNAME} \
		--build-arg TOKEN=${GITLAB_TOKEN} \
		.

docker-run:
	$(DOCKER) run -it --rm -p $(HOST_PORT):$(CONTAINER_PORT)/tcp --name $(CONTAINER_NAME) $(IMAGE_NAME)

docker-rm: docker-rm-container docker-rm-image

docker-rm-container:
	-$(DOCKER) rm $(CONTAINER_NAME)

docker-rm-image:
	-$(DOCKER) image rm $(IMAGE_NAME)

tests:
	poetry run pytest ./tests

# Everything to run before committing.
# FIX: these were previously written as a recipe line, so `make prep-commit`
# tried to execute a nonexistent shell command `docs ...`; they are
# prerequisites. Also fixed `get-license` -> `get-licenses` (defined target).
prep-commit: docs get-licenses sync-version-with-git update-version pre-commit

pre-commit:
	pre-commit run --all-files

get-licenses:
	pip-licenses --format=json --order=license --with-urls > pkg-licenses.json

docs: sphinx_apidoc sphinx_html

sphinx_html:
	poetry run sphinx-build -b html docs/source/ docs/build/html -E -a

sphinx_apidoc:
	cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/ && poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force
|
||||
57
README.md
57
README.md
@ -1,8 +1,60 @@
|
||||
# cv-analysis — Visual (CV-Based) Document Parsing
|
||||
# cv-analysis - Visual (CV-Based) Document Parsing
|
||||
|
||||
parse_pdf()
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.
|
||||
|
||||
## API
|
||||
|
||||
Input message:
|
||||
|
||||
```json
|
||||
{
|
||||
"targetFilePath": {
|
||||
"pdf": "absolute file path",
|
||||
"vlp_output": "absolute file path"
|
||||
},
|
||||
"responseFilePath": "absolute file path",
|
||||
"operation": "table_image_inference"
|
||||
}
|
||||
```
|
||||
|
||||
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
...,
|
||||
"data": [
|
||||
{
|
||||
'pageNum': 0,
|
||||
'bbox': {
|
||||
'x1': 55.3407,
|
||||
'y1': 247.0246,
|
||||
'x2': 558.5602,
|
||||
'y2': 598.0585
|
||||
},
|
||||
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
|
||||
'label': 'table',
|
||||
'tableLines': [
|
||||
{
|
||||
'x1': 0,
|
||||
'y1': 16,
|
||||
'x2': 1399,
|
||||
'y2': 16
|
||||
},
|
||||
...
|
||||
],
|
||||
'imageInfo': {
|
||||
'height': 693,
|
||||
'width': 1414
|
||||
}
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
@ -31,10 +83,9 @@ The below snippet shows how to find the outlines of previous redactions.
|
||||
|
||||
```python
|
||||
from cv_analysis.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
|
||||
@ -1,40 +0,0 @@
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.atlassian.bamboo</groupId>
|
||||
<artifactId>bamboo-specs-parent</artifactId>
|
||||
<version>7.1.2</version>
|
||||
<relativePath/>
|
||||
</parent>
|
||||
|
||||
<artifactId>bamboo-specs</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<properties>
|
||||
<sonar.skip>true</sonar.skip>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>com.atlassian.bamboo</groupId>
|
||||
<artifactId>bamboo-specs-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.atlassian.bamboo</groupId>
|
||||
<artifactId>bamboo-specs</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Test dependencies -->
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<!-- run 'mvn test' to perform offline validation of the plan -->
|
||||
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
|
||||
</project>
|
||||
@ -1,178 +0,0 @@
|
||||
package buildjob;
|
||||
|
||||
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
|
||||
|
||||
import java.time.LocalTime;
|
||||
|
||||
import com.atlassian.bamboo.specs.api.BambooSpec;
|
||||
import com.atlassian.bamboo.specs.api.builders.BambooKey;
|
||||
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
|
||||
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
|
||||
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
|
||||
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
|
||||
import com.atlassian.bamboo.specs.api.builders.plan.Job;
|
||||
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
|
||||
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
|
||||
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
|
||||
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
|
||||
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
|
||||
import com.atlassian.bamboo.specs.api.builders.project.Project;
|
||||
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
|
||||
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
|
||||
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
|
||||
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
|
||||
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
|
||||
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
|
||||
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
|
||||
import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger;
|
||||
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
|
||||
import com.atlassian.bamboo.specs.api.builders.Variable;
|
||||
import com.atlassian.bamboo.specs.util.BambooServer;
|
||||
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
|
||||
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
|
||||
|
||||
/**
|
||||
* Plan configuration for Bamboo.
|
||||
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
|
||||
*/
|
||||
@BambooSpec
|
||||
public class PlanSpec {
|
||||
|
||||
private static final String SERVICE_NAME = "cv-analysis";
|
||||
|
||||
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
|
||||
|
||||
/**
|
||||
* Run main to publish plan on Bamboo
|
||||
*/
|
||||
public static void main(final String[] args) throws Exception {
|
||||
//By default credentials are read from the '.credentials' file.
|
||||
BambooServer bambooServer = new BambooServer("http://localhost:8085");
|
||||
|
||||
Plan plan = new PlanSpec().createDockerBuildPlan();
|
||||
bambooServer.publish(plan);
|
||||
PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
|
||||
bambooServer.publish(planPermission);
|
||||
|
||||
Plan secPlan = new PlanSpec().createSecBuild();
|
||||
bambooServer.publish(secPlan);
|
||||
PlanPermissions secPlanPermission = new PlanSpec().createPlanPermission(secPlan.getIdentifier());
|
||||
bambooServer.publish(secPlanPermission);
|
||||
}
|
||||
|
||||
private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
|
||||
Permissions permission = new Permissions()
|
||||
.userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
|
||||
.groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
|
||||
.groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
|
||||
.groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
|
||||
.loggedInUserPermissions(PermissionType.VIEW)
|
||||
.anonymousUserPermissionView();
|
||||
return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
|
||||
}
|
||||
|
||||
private Project project() {
|
||||
return new Project()
|
||||
.name("RED")
|
||||
.key(new BambooKey("RED"));
|
||||
}
|
||||
|
||||
public Plan createDockerBuildPlan() {
|
||||
return new Plan(
|
||||
project(),
|
||||
SERVICE_NAME, new BambooKey(SERVICE_KEY))
|
||||
// .description("Docker build for cv-analysis.")
|
||||
// .variables()
|
||||
.stages(new Stage("Build Stage")
|
||||
.jobs(
|
||||
new Job("Build Job", new BambooKey("BUILD"))
|
||||
.tasks(
|
||||
new CleanWorkingDirectoryTask()
|
||||
.description("Clean working directory.")
|
||||
.enabled(true),
|
||||
new VcsCheckoutTask()
|
||||
.description("Checkout default repository.")
|
||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||
new ScriptTask()
|
||||
.description("Set config and keys.")
|
||||
.location(Location.FILE)
|
||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
|
||||
new ScriptTask()
|
||||
.description("Build Docker container.")
|
||||
.location(Location.FILE)
|
||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
|
||||
.argument(SERVICE_NAME),
|
||||
new InjectVariablesTask()
|
||||
.description("Inject git tag.")
|
||||
.path("git.tag")
|
||||
.namespace("g")
|
||||
.scope(InjectVariablesScope.LOCAL),
|
||||
new VcsTagTask()
|
||||
.description("${bamboo.g.gitTag}")
|
||||
.tagName("${bamboo.g.gitTag}")
|
||||
.defaultRepository())
|
||||
.dockerConfiguration(
|
||||
new DockerConfiguration()
|
||||
.image("nexus.iqser.com:5001/infra/release_build:4.5.0")
|
||||
.volume("/var/run/docker.sock", "/var/run/docker.sock")),
|
||||
new Job("Licence Job", new BambooKey("LICENCE"))
|
||||
.enabled(false)
|
||||
.tasks(
|
||||
new VcsCheckoutTask()
|
||||
.description("Checkout default repository.")
|
||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||
new ScriptTask()
|
||||
.description("Build licence.")
|
||||
.location(Location.FILE)
|
||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
|
||||
.dockerConfiguration(
|
||||
new DockerConfiguration()
|
||||
.image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
|
||||
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
|
||||
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
|
||||
.linkedRepositories("RR / " + SERVICE_NAME)
|
||||
.triggers(
|
||||
new BitbucketServerTrigger())
|
||||
.planBranchManagement(
|
||||
new PlanBranchManagement()
|
||||
.createForVcsBranch()
|
||||
.delete(
|
||||
new BranchCleanup()
|
||||
.whenInactiveInRepositoryAfterDays(14))
|
||||
.notificationForCommitters());
|
||||
}
|
||||
|
||||
public Plan createSecBuild() {
|
||||
return new Plan(project(), SERVICE_NAME + "-Sec", new BambooKey(SERVICE_KEY + "SEC")).description("Security Analysis Plan")
|
||||
.stages(new Stage("Default Stage").jobs(
|
||||
new Job("Sonar Job", new BambooKey("SONAR"))
|
||||
.tasks(
|
||||
new CleanWorkingDirectoryTask()
|
||||
.description("Clean working directory.")
|
||||
.enabled(true),
|
||||
new VcsCheckoutTask()
|
||||
.description("Checkout default repository.")
|
||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||
new ScriptTask()
|
||||
.description("Set config and keys.")
|
||||
.location(Location.FILE)
|
||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
|
||||
new ScriptTask()
|
||||
.description("Run Sonarqube scan.")
|
||||
.location(Location.FILE)
|
||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
|
||||
.argument(SERVICE_NAME))
|
||||
.dockerConfiguration(
|
||||
new DockerConfiguration()
|
||||
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
|
||||
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
|
||||
.linkedRepositories("RR / " + SERVICE_NAME)
|
||||
.triggers(
|
||||
new ScheduledTrigger()
|
||||
.scheduleOnceDaily(LocalTime.of(23, 00)))
|
||||
.planBranchManagement(
|
||||
new PlanBranchManagement()
|
||||
.createForVcsBranchMatching("release.*")
|
||||
.notificationForCommitters());
|
||||
}
|
||||
}
|
||||
@ -1,19 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
if [[ \"${bamboo_version_tag}\" != \"dev\" ]]
|
||||
then
|
||||
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
|
||||
-f ${bamboo_build_working_directory}/pom.xml \
|
||||
versions:set \
|
||||
-DnewVersion=${bamboo_version_tag}
|
||||
|
||||
${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
|
||||
-f ${bamboo_build_working_directory}/pom.xml \
|
||||
-B clean deploy \
|
||||
-e -DdeployAtEnd=true \
|
||||
-Dmaven.wagon.http.ssl.insecure=true \
|
||||
-Dmaven.wagon.http.ssl.allowall=true \
|
||||
-Dmaven.wagon.http.ssl.ignore.validity.dates=true \
|
||||
-DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
|
||||
fi
|
||||
@ -1,53 +0,0 @@
|
||||
#!/bin/bash
# Determines the next version for the current branch, builds the service's
# Docker image and, for non-dev builds, pushes it to the internal registry.
set -e

SERVICE_NAME=$1

# Creates the build virtualenv, pulls DVC-tracked data, and writes the pip
# index configuration consumed by the Docker build.
# (Previously duplicated verbatim in the dev and release paths.)
prepare_build_env() {
    python3 -m venv build_venv
    source build_venv/bin/activate
    python3 -m pip install --upgrade pip

    pip install dvc
    pip install 'dvc[ssh]'
    dvc pull

    echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
}

if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
    # master: bump the minor version of the major.minor line from version.yaml.
    branchVersion=$(cat version.yaml | grep -Eo "version: .*" | sed -s 's|version: \(.*\)\..*\..*|\1|g')
    latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
    newVersion="$(semver $latestVersion -p -i minor)"
    echo "new release on master with version $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
    # release/X.Y.x: bump the patch version of that release line.
    branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
    latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
    newVersion="$(semver $latestVersion -p -i patch)"
    echo "new release on $bamboo_planRepository_branchName with version $newVersion"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
    # Explicitly requested version tag (special builds).
    newVersion="${bamboo_version_tag}"
    # BUGFIX: message typo "bild" -> "build".
    echo "new special version build with $newVersion"
else
    # dev build: image is built for verification only and never pushed.
    newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
    echo "gitTag=${newVersion}" > git.tag
    echo "dev build with tag ${newVersion}"
    prepare_build_env
    echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
    docker build -f Dockerfile .
    exit 0
fi

echo "gitTag=${newVersion}" > git.tag

prepare_build_env
docker build -f Dockerfile -t "nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}" .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push "nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}"
|
||||
@ -1,8 +0,0 @@
|
||||
#!/bin/bash
# Installs the Bamboo agent's SSH private key and the host configuration for
# vector.iqser.com.
set -e

mkdir -p ~/.ssh
# BUGFIX: write (>) instead of append (>>) — appending on a re-run produced a
# corrupt id_rsa containing the key twice.
echo "${bamboo_agent_ssh}" | base64 -d > ~/.ssh/id_rsa
echo "host vector.iqser.com" > ~/.ssh/config
echo " user bamboo-agent" >> ~/.ssh/config
# Private key and config must not be group/world readable for ssh to use them.
chmod 600 ~/.ssh/config ~/.ssh/id_rsa
|
||||
@ -1,67 +0,0 @@
|
||||
#!/bin/bash
# Runs unit tests with coverage, an OWASP dependency-check scan, and a
# SonarQube analysis — in branch mode or pull-request mode depending on
# whether Bamboo provides a PR key.
set -e

export JAVA_HOME=/usr/bin/sonar-scanner/jre

# Hoisted to the top (was buried after the coverage run) so all uses share
# one assignment.
SERVICE_NAME=$1

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

echo "dev setup for unit test and coverage"

pip install -e incl/pyinfra
pip install -r incl/pyinfra/requirements.txt

pip install -e incl/pdf2image
pip install -r incl/pdf2image/requirements.txt

pip install -e .
pip install -r requirements.txt

echo "DVC pull step"
dvc pull

echo "coverage calculation"
coverage run -m pytest
echo "coverage report generation"
coverage report -m
coverage xml

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
    --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
    --exclude "build_venv/**" --exclude "**/__pycache__/**"

# Arguments shared by both scan modes (previously duplicated verbatim).
COMMON_SONAR_ARGS=(
    -Dsonar.projectKey=RED_$SERVICE_NAME
    -Dsonar.sources=src,cv_analysis
    -Dsonar.host.url=https://sonarqube.iqser.com
    -Dsonar.login=${bamboo_sonarqube_api_token_secret}
    -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json
    -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml
    -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html
    -Dsonar.python.coverage.reportPaths=reports/coverage.xml
)

if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    /usr/bin/sonar-scanner/bin/sonar-scanner -X \
        "${COMMON_SONAR_ARGS[@]}" \
        -Dsonar.branch.name=${bamboo_planRepository_1_branch}
else
    echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
    /usr/bin/sonar-scanner/bin/sonar-scanner \
        "${COMMON_SONAR_ARGS[@]}" \
        -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
        -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
        -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch}
fi
|
||||
@ -1,22 +0,0 @@
|
||||
package buildjob;
|
||||
|
||||
|
||||
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
|
||||
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
|
||||
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
|
||||
import org.junit.Test;
|
||||
|
||||
public class PlanSpecTest {
|
||||
@Test
|
||||
public void checkYourPlanOffline() throws PropertiesValidationException {
|
||||
Plan plan = new PlanSpec().createDockerBuildPlan();
|
||||
|
||||
EntityPropertiesBuilders.build(plan);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void checkYourSecPlanOffline() throws PropertiesValidationException {
|
||||
Plan secPlan = new PlanSpec().createSecBuild();
|
||||
EntityPropertiesBuilders.build(secPlan);
|
||||
}
|
||||
}
|
||||
67
config/pyinfra.toml
Normal file
67
config/pyinfra.toml
Normal file
@ -0,0 +1,67 @@
|
||||
|
||||
[asyncio]
|
||||
max_concurrent_tasks = 10
|
||||
|
||||
[dynamic_tenant_queues]
|
||||
enabled = true
|
||||
|
||||
[metrics.prometheus]
|
||||
enabled = true
|
||||
prefix = "redactmanager_cv_analysis_service"
|
||||
|
||||
[tracing]
|
||||
enabled = true
|
||||
# Possible values: "opentelemetry" | "azure_monitor" (expects the APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
|
||||
type = "azure_monitor"
|
||||
|
||||
[tracing.opentelemetry]
|
||||
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
|
||||
service_name = "redactmanager_cv_analysis_service"
|
||||
exporter = "otlp"
|
||||
|
||||
[webserver]
|
||||
host = "0.0.0.0"
|
||||
port = 8080
|
||||
|
||||
[rabbitmq]
|
||||
host = "localhost"
|
||||
port = 5672
|
||||
username = ""
|
||||
password = ""
|
||||
heartbeat = 60
|
||||
# Has to be a divider of heartbeat, and shouldn't be too big, since only in these intervals queue interactions happen (like receiving new messages)
|
||||
# This is also the minimum time the service needs to process a message
|
||||
connection_sleep = 5
|
||||
input_queue = "request_queue"
|
||||
output_queue = "response_queue"
|
||||
dead_letter_queue = "dead_letter_queue"
|
||||
|
||||
tenant_event_queue_suffix = "_tenant_event_queue"
|
||||
tenant_event_dlq_suffix = "_tenant_events_dlq"
|
||||
tenant_exchange_name = "tenants-exchange"
|
||||
queue_expiration_time = 300000 # 5 minutes in milliseconds
|
||||
service_request_queue_prefix = "cv_analysis_request_queue"
|
||||
service_request_exchange_name = "cv_analysis_request_exchange"
|
||||
service_response_exchange_name = "cv_analysis_response_exchange"
|
||||
service_dlq_name = "cv_analysis_dlq"
|
||||
|
||||
[storage]
|
||||
backend = "s3"
|
||||
|
||||
[storage.s3]
|
||||
bucket = "redaction"
|
||||
endpoint = "http://127.0.0.1:9000"
|
||||
key = ""
|
||||
secret = ""
|
||||
region = "eu-central-1"
|
||||
|
||||
[storage.azure]
|
||||
container = "redaction"
|
||||
connection_string = ""
|
||||
|
||||
[storage.tenant_server]
|
||||
public_key = ""
|
||||
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
|
||||
|
||||
[kubernetes]
|
||||
pod_name = "test_pod"
|
||||
19
config/settings.toml
Normal file
19
config/settings.toml
Normal file
@ -0,0 +1,19 @@
|
||||
[logging]
|
||||
level = "INFO"
|
||||
visual_logging_level = "DISABLED"
|
||||
visual_logging_output_folder = "/tmp/debug"
|
||||
|
||||
[table_parsing]
|
||||
skip_pages_without_images = true
|
||||
|
||||
[paths]
|
||||
root = "@format {env[ROOT_PATH]}"
|
||||
dvc_data_dir = "${paths.root}/data"
|
||||
pdf_for_testing = "${paths.dvc_data_dir}/pdfs_for_testing"
|
||||
png_for_testing = "${paths.dvc_data_dir}/pngs_for_testing"
|
||||
png_figures_detected = "${paths.png_for_testing}/figures_detected"
|
||||
png_tables_detected = "${paths.png_for_testing}/tables_detected_by_tp"
|
||||
hashed_pdfs_for_testing = "${paths.pdf_for_testing}/hashed"
|
||||
metadata_test_files = "${paths.dvc_data_dir}/metadata_testing_files.csv"
|
||||
test_dir = "${paths.dvc_data_dir}/test"
|
||||
test_data_dir = "${paths.dvc_data_dir}/test/test_data"
|
||||
@ -1,31 +0,0 @@
|
||||
import os
|
||||
|
||||
|
||||
def get_config():
    """Return a freshly constructed :class:`Config` instance."""
    return Config()
|
||||
|
||||
|
||||
class Config:
    """Runtime configuration assembled from environment variables and
    repository-relative data paths."""

    # Strings that count as "true" when parsing boolean environment variables.
    _TRUTHY = {"1", "true", "yes", "on"}

    def __init__(self):
        self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
        # BUGFIX: os.environ.get returns a *string* when the variable is set,
        # so a value of "false" used to be truthy. Parse it explicitly.
        self.table_parsing_skip_pages_without_images = self._env_bool(
            "TABLE_PARSING_SKIP_PAGES_WITHOUT_IMAGES", True
        )

        # visual_logging_level: NOTHING > INFO > DEBUG > ALL
        # NOTE(review): default "DISABLED" is not in the documented set — confirm.
        self.visual_logging_level = "DISABLED"
        self.visual_logging_output_folder = "/tmp/debug"

        # locations, all anchored at the repository root (two levels above this file)
        # FIXME: is everything here necessary?
        root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.dvc_data_dir = os.path.join(root, "data")
        self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing")
        self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing")
        self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected")
        self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp")
        self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed")
        self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv")
        self.test_dir = os.path.join(root, "test")
        self.test_data_dir = os.path.join(self.test_dir, "test_data")

    @staticmethod
    def _env_bool(name, default):
        """Read environment variable *name* as a boolean; *default* when unset."""
        value = os.environ.get(name)
        if value is None:
            return default
        return value.strip().lower() in Config._TRUTHY

    def __getitem__(self, key):
        """Allow dict-style access: config["key"] == config.key."""
        return self.__getattribute__(key)
|
||||
@ -1,80 +0,0 @@
|
||||
from functools import partial
|
||||
from typing import Iterable, List
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from funcy import compose, rcompose, lkeep
|
||||
|
||||
from cv_analysis.utils import lstarkeep
|
||||
from cv_analysis.utils.common import (
|
||||
find_contours_and_hierarchies,
|
||||
dilate_page_components,
|
||||
normalize_to_gray_scale,
|
||||
threshold_image,
|
||||
invert_image,
|
||||
fill_rectangles,
|
||||
)
|
||||
from cv_analysis.utils.conversion import contour_to_rectangle
|
||||
from cv_analysis.utils.merging import merge_related_rectangles
|
||||
from cv_analysis.utils.postprocessing import remove_included, has_no_parent
|
||||
from cv_analysis.utils.rectangle import Rectangle
|
||||
|
||||
|
||||
def parse_layout(image: np.array) -> List[Rectangle]:
    """Parse the layout of a page.

    Args:
        image: Image of the page.

    Returns:
        List of rectangles representing the layout of the page as identified page elements.
    """
    # Detect segments, then alternate merge/cleanup passes until only
    # top-level, non-overlapping rectangles remain.
    pipeline = rcompose(
        find_segments,
        remove_included,
        merge_related_rectangles,
        remove_included,
    )
    return pipeline(image)
|
||||
|
||||
|
||||
def find_segments(image: np.ndarray) -> List[Rectangle]:
    """Find segments in a page. Segments are structural elements of a page, such as text blocks, tables, etc."""
    # Two detection passes: an initial one on the dilated grayscale page, and
    # a "meta" pass on a copy with the first-pass rectangles masked out.
    pipeline = rcompose(
        prepare_for_initial_detection,
        __find_segments,
        partial(prepare_for_meta_detection, image.copy()),
        __find_segments,
    )
    return pipeline(image)
|
||||
|
||||
|
||||
def prepare_for_initial_detection(image: np.ndarray) -> np.ndarray:
    """Convert the page to grayscale, then dilate its components for detection."""
    grayscale = normalize_to_gray_scale(image)
    return dilate_page_components(grayscale)
|
||||
|
||||
|
||||
def __find_segments(image: np.ndarray) -> List[Rectangle]:
    """Turn top-level, sufficiently large contours of *image* into rectangles."""

    def to_rectangle_if_valid(contour, hierarchy):
        # Keep only parentless contours that look like segments; None entries
        # are dropped by lstarkeep.
        if is_likely_segment(contour) and has_no_parent(hierarchy):
            return contour_to_rectangle(contour)
        return None

    contours_with_hierarchies = zip(*find_contours_and_hierarchies(image))
    return lstarkeep(to_rectangle_if_valid, contours_with_hierarchies)
|
||||
|
||||
|
||||
def prepare_for_meta_detection(image: np.ndarray, rectangles: Iterable[Rectangle]) -> np.ndarray:
    """Mask the already-detected rectangles and re-binarize for a second pass."""
    masked = fill_rectangles(image, rectangles)
    thresholded = threshold_image(masked)
    inverted = invert_image(thresholded)
    return normalize_to_gray_scale(inverted)
|
||||
|
||||
|
||||
def is_likely_segment(contour, min_area: float = 100) -> bool:
    """Return True when *contour* encloses enough area to be a page segment.

    BUGFIX (annotation): the parameter was annotated as ``Rectangle`` but the
    caller (``__find_segments``) passes an OpenCV *contour*, which is what
    ``cv2.contourArea`` expects.
    """
    # FIXME: Parameterize via factory
    return cv2.contourArea(contour, False) > min_area
|
||||
@ -1,12 +0,0 @@
|
||||
"""Defines constant paths relative to a root path."""

from pathlib import Path

# This module's directory, its package, and the repository root (identical
# here because the package sits directly under the repository root).
MODULE_PATH = Path(__file__).resolve().parent
PACKAGE_ROOT_PATH = MODULE_PATH.parent
REPO_ROOT_PATH = PACKAGE_ROOT_PATH

TEST_DIR_PATH = REPO_ROOT_PATH / "test"
# TODO: remove once new tests are in place
TEST_DATA_DVC = TEST_DIR_PATH / "test_data.dvc"
TEST_DATA_DIR = TEST_DIR_PATH / "data"
TEST_PAGE_TEXTURES_DIR = TEST_DATA_DIR / "paper"
|
||||
@ -1,60 +0,0 @@
|
||||
from dataclasses import asdict
|
||||
from operator import truth
|
||||
|
||||
from funcy import lmap, flatten
|
||||
|
||||
from cv_analysis.figure_detection.figure_detection import detect_figures
|
||||
from cv_analysis.table_parsing import parse_tables
|
||||
from cv_analysis.utils.rectangle import Rectangle
|
||||
from pdf2img.conversion import convert_pages_to_images
|
||||
from pdf2img.default_objects.image import ImagePlus, ImageInfo
|
||||
from pdf2img.default_objects.rectangle import RectanglePlus
|
||||
|
||||
|
||||
def make_analysis_pipeline_for_element_type(segment_type, **kwargs):
    """Build the analysis pipeline matching *segment_type* ("table" or "figure")."""
    builders = {
        "table": (parse_tables, table_parsing_formatter),
        "figure": (detect_figures, figure_detection_formatter),
    }
    if segment_type not in builders:
        raise ValueError(f"Unknown segment type {segment_type}.")
    analysis_fn, formatter = builders[segment_type]
    return make_analysis_pipeline(analysis_fn, formatter, dpi=200, **kwargs)
|
||||
|
||||
|
||||
def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
    """Create a generator function that runs *analysis_fn* on every page of a
    PDF and formats the detections with *formatter*."""

    def analysis_pipeline(pdf: bytes, index=None):
        def parse_page(page: ImagePlus):
            detections = analysis_fn(page.asarray())
            # Pages with no detections contribute nothing to the output.
            return formatter(detections, page, dpi) if detections else None

        pages = convert_pages_to_images(pdf, index=index, dpi=dpi, skip_pages_without_images=skip_pages_without_images)
        page_results = map(parse_page, pages)
        yield from flatten(filter(truth, page_results))

    return analysis_pipeline
|
||||
|
||||
|
||||
def table_parsing_formatter(rectangles, page: ImagePlus, dpi):
    """Format detected table-cell rectangles for one page as a result dict."""

    def to_bbox_dict(rectangle: Rectangle):
        plus = RectanglePlus.from_pixels(*rectangle_to_xyxy(rectangle), page.info, alpha=False, dpi=dpi)
        return plus.asdict(derotate=True)

    return {
        "pageInfo": page.asdict(natural_index=True),
        "tableCells": lmap(to_bbox_dict, rectangles),
    }
|
||||
|
||||
|
||||
def figure_detection_formatter(rectangles, page, dpi):
    """Format detected figure rectangles for one page as ImageInfo dicts."""

    def to_image_info(rectangle: Rectangle):
        plus = RectanglePlus.from_pixels(*rectangle_to_xyxy(rectangle), page.info, alpha=False, dpi=dpi)
        return asdict(ImageInfo(page.info, plus.asbbox(derotate=False), plus.alpha))

    return lmap(to_image_info, rectangles)
|
||||
|
||||
|
||||
def rectangle_to_xyxy(rectangle: Rectangle):
    """Return the rectangle's corner coordinates as an (x1, y1, x2, y2) tuple."""
    return (rectangle.x1, rectangle.y1, rectangle.x2, rectangle.y2)
|
||||
@ -1,129 +0,0 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from funcy import lmap, lfilter
|
||||
|
||||
from cv_analysis.layout_parsing import parse_layout
|
||||
from cv_analysis.utils.conversion import box_to_rectangle
|
||||
from cv_analysis.utils.postprocessing import remove_isolated
|
||||
from cv_analysis.utils.visual_logger import vizlogger
|
||||
|
||||
|
||||
def add_external_contours(image, image_h_w_lines_only):
    """Draw the bounding box of every external contour found in
    *image_h_w_lines_only* onto *image* (in place) and return it."""
    contours, _ = cv2.findContours(image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for contour in contours:
        x, y, width, height = cv2.boundingRect(contour)
        cv2.rectangle(image, (x, y), (x + width, y + height), 255, 1)
    return image
|
||||
|
||||
|
||||
def apply_motion_blur(image: np.array, angle, size=80):
    """Solidifies and slightly extends detected lines.

    Args:
        image (np.array): page image as array
        angle: direction in which to apply blur, 0 or 90
        size (int): kernel size; 80 found empirically to work well

    Returns:
        np.ndarray
    """
    kernel = np.zeros((size, size), dtype=np.float32)
    vizlogger.debug(kernel, "tables08_blur_kernel1.png")
    # A single row of ones through the kernel's center: a horizontal blur line.
    kernel[(size - 1) // 2, :] = np.ones(size, dtype=np.float32)
    vizlogger.debug(kernel, "tables09_blur_kernel2.png")
    # Rotate the line kernel to the requested blur direction.
    rotation = cv2.getRotationMatrix2D((size / 2 - 0.5, size / 2 - 0.5), angle, 1.0)
    kernel = cv2.warpAffine(kernel, rotation, (size, size))
    vizlogger.debug(kernel, "tables10_blur_kernel3.png")
    # Normalize so the kernel sums to one (keeps overall brightness).
    kernel = kernel * (1.0 / np.sum(kernel))
    vizlogger.debug(kernel, "tables11_blur_kernel4.png")
    return cv2.filter2D(image, -1, kernel)
|
||||
|
||||
|
||||
def isolate_vertical_and_horizontal_components(img_bin):
    """Identifies and reinforces horizontal and vertical lines in a binary image.

    Args:
        img_bin (np.array): array corresponding to single binarized page image

    Returns:
        np.ndarray
    """
    line_min_width = 48
    # Morphological opening keeps only runs of at least line_min_width pixels.
    horizontal_kernel = np.ones((1, line_min_width), np.uint8)
    vertical_kernel = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, horizontal_kernel)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, vertical_kernel)
    img_lines_raw = img_bin_v | img_bin_h

    # Extend the surviving lines slightly along their own direction.
    img_bin_h = cv2.dilate(img_bin_h, np.ones((1, 30), np.uint8), iterations=2)
    img_bin_v = cv2.dilate(img_bin_v, np.ones((30, 1), np.uint8), iterations=2)

    img_bin_h = apply_motion_blur(img_bin_h, 0)
    img_bin_v = apply_motion_blur(img_bin_v, 90)

    img_bin_extended = img_bin_h | img_bin_v

    _, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY)
    img_bin_final = cv2.dilate(img_bin_extended, np.ones((1, 1), np.uint8), iterations=1)
    # add contours before lines are extended by blurring
    img_bin_final = add_external_contours(img_bin_final, img_lines_raw)

    return img_bin_final
|
||||
|
||||
|
||||
def find_table_layout_boxes(image: np.array):
    """Find layout rectangles large enough to plausibly be tables.

    Args:
        image (np.array): single page image.

    Returns:
        list: rectangles whose area is at least 100000 px².
    """

    def is_large_enough(box):
        # Rectangle iterates as (x, y, width, height).
        _, _, width, height = box
        return width * height >= 100000

    layout_boxes = parse_layout(image)
    # BUGFIX: the original lmap kept a None entry for every rejected box;
    # filter first and convert only the survivors.
    return [box_to_rectangle(box) for box in layout_boxes if is_large_enough(box)]
|
||||
|
||||
|
||||
def preprocess(image: np.array):
    """Binarize *image* at a fixed threshold of 195 and return it inverted."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
    _, binarized = cv2.threshold(grayscale, 195, 255, cv2.THRESH_BINARY)
    return ~binarized
|
||||
|
||||
|
||||
def turn_connected_components_into_rectangles(image: np.array):
    """Extract (x, y, w, h) boxes of large connected components of *image*."""

    def is_large_enough(stat):
        _, _, width, height, area = stat
        return area > 2000 and width > 35 and height > 25

    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)

    large_stats = lfilter(is_large_enough, stats)
    if not large_stats:
        return []
    stacked = np.vstack(large_stats)
    # NOTE(review): drops the area column and then the first two surviving
    # rows — presumably background/page frame; confirm the [2:] offset is
    # intended after filtering rather than before.
    return stacked[:, :-1][2:]
|
||||
|
||||
|
||||
def parse_tables(image: np.array):
    """Runs the full table parsing process.

    Args:
        image (np.array): single PDF page, converted to a numpy array

    Returns:
        list: list of rectangles corresponding to table cells
    """
    binarized = preprocess(image)
    line_image = isolate_vertical_and_horizontal_components(binarized)
    boxes = turn_connected_components_into_rectangles(line_image)
    cell_rectangles = lmap(box_to_rectangle, boxes)
    return remove_isolated(cell_rectangles)
|
||||
@ -1,51 +0,0 @@
|
||||
from functools import reduce
|
||||
from typing import Iterable
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from funcy import first
|
||||
|
||||
from cv_analysis.utils.rectangle import Rectangle
|
||||
|
||||
|
||||
def find_contours_and_hierarchies(image):
    """Find external contours and the corresponding hierarchy row (or None)."""
    contours, hierarchies = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    hierarchy = first(hierarchies) if hierarchies is not None else None
    return contours, hierarchy
|
||||
|
||||
|
||||
def dilate_page_components(image: np.ndarray) -> np.ndarray:
    """Blur, Otsu-threshold and dilate the page so nearby components merge."""
    # FIXME: Parameterize via factory (blur size, kernel size, iteration count)
    blurred = cv2.GaussianBlur(image, (7, 7), 0)
    _, thresholded = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    return cv2.dilate(thresholded, kernel, iterations=4)
|
||||
|
||||
|
||||
def normalize_to_gray_scale(image: np.ndarray) -> np.ndarray:
    """Return *image* as single-channel grayscale (no-op when already 2-D)."""
    if len(image.shape) > 2:
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return image
|
||||
|
||||
|
||||
def threshold_image(image: np.ndarray) -> np.ndarray:
    """Binarize *image*: only fully white (255) pixels survive as 255."""
    # FIXME: Parameterize via factory
    _, thresholded = cv2.threshold(image, 254, 255, cv2.THRESH_BINARY)
    return thresholded
|
||||
|
||||
|
||||
def invert_image(image: np.ndarray):
    """Return the bitwise inversion of *image*."""
    inverted = ~image
    return inverted
|
||||
|
||||
|
||||
def fill_rectangles(image: np.ndarray, rectangles: Iterable[Rectangle]) -> np.ndarray:
    """Paint every rectangle onto *image* (black fill plus white outline)."""
    for rectangle in rectangles:
        image = fill_in_component_area(image, rectangle)
    return image
|
||||
|
||||
|
||||
def fill_in_component_area(image: np.ndarray, rectangle: Rectangle) -> np.ndarray:
    """Black out the rectangle's area, then draw a 7 px white outline over it."""
    top_left = (rectangle.x1, rectangle.y1)
    bottom_right = (rectangle.x2, rectangle.y2)
    cv2.rectangle(image, top_left, bottom_right, (0, 0, 0), -1)
    cv2.rectangle(image, top_left, bottom_right, (255, 255, 255), 7)
    return image
|
||||
@ -1,47 +0,0 @@
|
||||
import json
|
||||
from typing import Sequence, Union
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from cv_analysis.utils.rectangle import Rectangle
|
||||
|
||||
Image_t = Union[Image.Image, np.ndarray]
|
||||
|
||||
|
||||
def contour_to_rectangle(contour):
    """Return the axis-aligned bounding Rectangle of an OpenCV contour."""
    bounding_box = cv2.boundingRect(contour)
    return box_to_rectangle(bounding_box)
|
||||
|
||||
|
||||
def box_to_rectangle(box: Sequence[int]) -> Rectangle:
    """Convert an (x, y, width, height) box into a corner-based Rectangle."""
    x, y, width, height = box
    return Rectangle(x, y, x + width, y + height)
|
||||
|
||||
|
||||
def rectangle_to_box(rectangle: Rectangle) -> Sequence[int]:
    """Convert a corner-based Rectangle into an [x, y, width, height] box."""
    x, y = rectangle.x1, rectangle.y1
    return [x, y, rectangle.width, rectangle.height]
|
||||
|
||||
|
||||
class RectangleJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes Rectangle objects as corner dicts."""

    def __init__(self, *args, **kwargs):
        json.JSONEncoder.__init__(self, *args, **kwargs)
        # NOTE(review): never read anywhere in this module — candidate for removal.
        self._replacement_map = {}

    def default(self, o):
        """Encode a Rectangle as {"x1", "x2", "y1", "y2"}; defer otherwise."""
        if isinstance(o, Rectangle):
            return {"x1": o.x1, "x2": o.x2, "y1": o.y1, "y2": o.y2}
        return json.JSONEncoder.default(self, o)

    # BUGFIX (cleanup): the previous `encode` override only delegated to the
    # parent implementation and has been removed; behavior is unchanged.
|
||||
|
||||
|
||||
def normalize_image_format_to_array(image: Image_t):
    """Return *image* as a uint8 numpy array (PIL images are converted)."""
    if isinstance(image, Image.Image):
        return np.array(image).astype(np.uint8)
    return image
|
||||
|
||||
|
||||
def normalize_image_format_to_pil(image: Image_t):
    """Return *image* as a PIL Image (numpy arrays are converted via uint8)."""
    if isinstance(image, np.ndarray):
        return Image.fromarray(image.astype(np.uint8))
    return image
|
||||
@ -1,51 +0,0 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from PIL.Image import Image as Image_t
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
from cv_analysis.utils.conversion import normalize_image_format_to_array
|
||||
|
||||
|
||||
def show_image(image, backend="mpl", **kwargs):
    """Display *image* with the chosen backend ("mpl", "cv2" or "pil")."""
    image = normalize_image_format_to_array(image)
    handlers = {
        "mpl": lambda: show_image_mpl(image, **kwargs),
        "cv2": lambda: show_image_cv2(image, **kwargs),
        "pil": lambda: Image.fromarray(image).show(),
    }
    if backend not in handlers:
        raise ValueError(f"Unknown backend: {backend}")
    handlers[backend]()
|
||||
|
||||
|
||||
def show_image_cv2(image, maxdim=700, **kwargs):
    """Show *image* in an OpenCV window, downscaled so max(h, w) <= maxdim.

    Blocks until a key is pressed, then closes the window.
    """
    h, w, c = image.shape
    maxhw = max(h, w)
    if maxhw > maxdim:
        ratio = maxdim / maxhw
        h = int(h * ratio)
        w = int(w * ratio)

    # BUGFIX: cv2.resize takes dsize as (width, height); the original passed
    # (h, w), which transposed the displayed dimensions for non-square pages.
    img = cv2.resize(image, (w, h))
    cv2.imshow("", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
|
||||
|
||||
|
||||
def show_image_mpl(image, **kwargs):
    """Render *image* with matplotlib in grayscale; optional "title" kwarg."""
    if isinstance(image, Image_t):
        # noinspection PyTypeChecker
        image = np.array(image)
    # noinspection PyArgumentList
    assert image.max() <= 255
    figure, axis = plt.subplots(1, 1)
    figure.set_size_inches(20, 20)
    assert image.dtype == np.uint8
    axis.imshow(image, cmap="gray")
    axis.title.set_text(kwargs.get("title", ""))
    plt.show()
|
||||
|
||||
|
||||
def save_image(image, path):
    """Write *image* to *path* using OpenCV (format chosen by extension)."""
    cv2.imwrite(path, image)
|
||||
@ -1,29 +0,0 @@
|
||||
from numpy import array, ndarray
|
||||
import pdf2image
|
||||
from PIL import Image
|
||||
|
||||
from cv_analysis.utils.preprocessing import preprocess_page_array
|
||||
|
||||
|
||||
def open_analysis_input_file(path_or_bytes, first_page=1, last_page=None):
    """Load preprocessed page arrays from a path, raw PDF bytes, or pass
    through data that is already converted.

    Args:
        path_or_bytes: image/PDF file path (str), PDF bytes, or an already
            converted list/ndarray of page arrays (returned unchanged).
        first_page: 1-based index of the first page to load.
        last_page: 1-based index of the last page; defaults to first_page.

    Returns:
        List of preprocessed page arrays.

    Raises:
        IOError: for a path with an unsupported extension.
        TypeError: for any other input type.
    """
    assert first_page > 0, "Page numbers are 1-based."
    assert last_page is None or last_page >= first_page, "last_page must be greater than or equal to first_page."

    last_page = last_page or first_page

    # Already converted by the caller; nothing to do.
    if isinstance(path_or_bytes, (list, ndarray)):
        return path_or_bytes

    # isinstance replaces the previous `type(x) ==` comparisons.
    if isinstance(path_or_bytes, str):
        if path_or_bytes.lower().endswith((".png", ".jpg", ".jpeg")):
            pages = [Image.open(path_or_bytes)]
        elif path_or_bytes.lower().endswith(".pdf"):
            pages = pdf2image.convert_from_path(path_or_bytes, first_page=first_page, last_page=last_page)
        else:
            raise IOError("Invalid file extension. Accepted filetypes: .png, .jpg, .jpeg, .pdf")
    elif isinstance(path_or_bytes, bytes):
        pages = pdf2image.convert_from_bytes(path_or_bytes, first_page=first_page, last_page=last_page)
    else:
        # BUGFIX: previously fell through to an UnboundLocalError on `pages`.
        raise TypeError(f"Unsupported input type: {type(path_or_bytes)!r}")

    return [preprocess_page_array(array(p)) for p in pages]
|
||||
@ -1,54 +0,0 @@
|
||||
from functools import reduce
|
||||
from itertools import combinations
|
||||
from typing import List, Tuple, Set
|
||||
|
||||
from funcy import all
|
||||
|
||||
from cv_analysis.utils import until, make_merger_sentinel
|
||||
from cv_analysis.utils.rectangle import Rectangle
|
||||
from cv_analysis.utils.spacial import related
|
||||
|
||||
|
||||
def merge_related_rectangles(rectangles: List[Rectangle]) -> List[Rectangle]:
    """Merges rectangles that are related to each other, iterating on partial
    merge results until a fixed point is reached (no more mergers possible)."""
    assert isinstance(rectangles, list)
    sentinel = make_merger_sentinel()
    return until(sentinel, merge_rectangles_once, rectangles)
|
||||
|
||||
|
||||
def merge_rectangles_once(rectangles: List[Rectangle]) -> List[Rectangle]:
    """Merges rectangles that are related to each other, but does not iterate on the results."""
    unique_rectangles = set(rectangles)
    pairs = combinations(unique_rectangles, 2)
    merged, used = reduce(merge_if_related, pairs, (set(), set()))

    # Merged results plus every rectangle that took part in no merger.
    return list(merged | (unique_rectangles - used))
|
||||
|
||||
|
||||
# Accumulator: (rectangles merged so far, rectangles already consumed).
T = Tuple[Set[Rectangle], Set[Rectangle]]
# A candidate pair of rectangles.
V = Tuple[Rectangle, Rectangle]


def merge_if_related(merged_and_used_so_far: T, rectangle_pair: V) -> T:
    """Fold step: merge the pair when both rectangles are unused and related;
    otherwise return the accumulator unchanged."""
    alpha, beta = rectangle_pair
    merged, used = merged_and_used_so_far

    def unused(*args) -> bool:
        return not used & {*args}

    if all(unused, (alpha, beta)) and related(alpha, beta):
        merged = merged | {bounding_rect(alpha, beta)}
        used = used | {alpha, beta}
    return merged, used
|
||||
|
||||
|
||||
def bounding_rect(alpha: Rectangle, beta: Rectangle) -> Rectangle:
    """Returns the smallest rectangle that contains both rectangles."""
    left = min(alpha.x1, beta.x1)
    top = min(alpha.y1, beta.y1)
    right = max(alpha.x2, beta.x2)
    bottom = max(alpha.y2, beta.y2)
    return Rectangle(left, top, right, bottom)
|
||||
@ -1,56 +0,0 @@
|
||||
from functools import reduce
|
||||
from operator import itemgetter
|
||||
from typing import Iterable
|
||||
|
||||
import numpy as np
|
||||
from funcy import lmap, lpluck, first
|
||||
|
||||
from cv_analysis.utils import lift
|
||||
from cv_analysis.utils.rectangle import Rectangle
|
||||
|
||||
|
||||
def compute_document_score(result_dict, ground_truth_dicts):
    """Weighted average of per-page IoU scores over the whole document.

    Pages are weighted by their share of ground-truth cells.
    """

    # def instead of a lambda assignment (PEP 8 E731); same behavior.
    def extract_cells(document):
        return lpluck("cells", document["pages"])

    cells_per_ground_truth_page = extract_cells(ground_truth_dicts)
    cells_per_result_page = extract_cells(result_dict)

    # Lift twice: dict -> Rectangle over cells, then over pages of cells.
    cells_on_pages_to_rectangles = lift(lift(rectangle_from_dict))
    rectangles_per_ground_truth_page = cells_on_pages_to_rectangles(cells_per_ground_truth_page)
    rectangles_per_result_page = cells_on_pages_to_rectangles(cells_per_result_page)

    scores = lmap(compute_page_iou, rectangles_per_result_page, rectangles_per_ground_truth_page)

    n_cells_per_page = np.array(lmap(len, cells_per_ground_truth_page))
    weights = n_cells_per_page / n_cells_per_page.sum()
    return np.average(scores, weights=weights)
|
||||
|
||||
|
||||
def rectangle_from_dict(d):
    """Build a corner-based Rectangle from a {"x", "y", "width", "height"} dict."""
    x, y, width, height = itemgetter("x", "y", "width", "height")(d)
    return Rectangle(x, y, x + width, y + height)
|
||||
|
||||
|
||||
def compute_page_iou(predicted_rectangles: Iterable[Rectangle], true_rectangles: Iterable[Rectangle]):
    """Greedy one-to-one IoU matching score between predictions and ground truth.

    Each ground-truth rectangle claims its best-overlapping unclaimed
    prediction; the IoU sum is normalized by the larger of the two set sizes.
    """
    predicted_rectangles = set(predicted_rectangles)
    true_rectangles = set(true_rectangles)

    remaining_candidates = set(predicted_rectangles)
    iou_sum = 0
    for true_rectangle in true_rectangles:
        best_match, best_iou = find_max_overlap(true_rectangle, remaining_candidates)
        iou_sum += best_iou
        remaining_candidates = remaining_candidates - {best_match}

    normalizing_factor = 1 / max(len(predicted_rectangles), len(true_rectangles))
    return normalizing_factor * iou_sum
|
||||
|
||||
|
||||
def find_max_overlap(rectangle: Rectangle, candidate_rectangles: Iterable[Rectangle]):
    """Find the candidate with the highest IoU against ``rectangle``.

    Args:
        rectangle: The rectangle to match.
        candidate_rectangles: Candidates to compare against.

    Returns:
        A ``(best_candidate, best_iou)`` pair. When there are no candidates,
        returns ``(None, 0.0)`` instead of raising ``ValueError`` from an
        empty ``max()`` (the previous behavior).
    """
    candidate_rectangles = list(candidate_rectangles)
    if not candidate_rectangles:
        return None, 0.0
    best_candidate_rectangle = max(candidate_rectangles, key=rectangle.iou)
    return best_candidate_rectangle, rectangle.iou(best_candidate_rectangle)
@ -1,85 +0,0 @@
|
||||
# See https://stackoverflow.com/a/33533514
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, Union
|
||||
|
||||
from funcy import identity
|
||||
|
||||
from cv_analysis.utils.spacial import adjacent, contains, intersection, iou, area, is_contained
|
||||
|
||||
Coord = Union[int, float]
|
||||
|
||||
|
||||
class Rectangle:
    """An axis-aligned rectangle spanned by the corners (x1, y1) and (x2, y2)."""

    def __init__(self, x1, y1, x2, y2, discrete=True):
        """Creates a rectangle from two points.

        Args:
            x1, y1: First corner.
            x2, y2: Second corner.
            discrete: When True (default), coordinates are truncated to int.
        """
        snap = int if discrete else identity
        self.__x1 = snap(x1)
        self.__y1 = snap(y1)
        self.__x2 = snap(x2)
        self.__y2 = snap(y2)

    def __repr__(self):
        return f"Rectangle({self.x1}, {self.y1}, {self.x2}, {self.y2})"

    # Read-only coordinate accessors.
    @property
    def x1(self):
        return self.__x1

    @property
    def y1(self):
        return self.__y1

    @property
    def x2(self):
        return self.__x2

    @property
    def y2(self):
        return self.__y2

    @property
    def width(self):
        # Absolute value: corner order is not guaranteed.
        return abs(self.__x2 - self.__x1)

    @property
    def height(self):
        return abs(self.__y2 - self.__y1)

    @property
    def coords(self):
        # Corner representation: [x1, y1, x2, y2].
        return [self.__x1, self.__y1, self.__x2, self.__y2]

    def __hash__(self):
        return hash(tuple(self.coords))

    def __iter__(self):
        # Iterates as (x, y, width, height), matching the cell-dict layout,
        # NOT the corner layout used by `coords`.
        return iter((self.x1, self.y1, self.width, self.height))

    def area(self):
        """Calculates the area of this rectangle."""
        return area(self)

    def intersection(self, other):
        """Calculates the intersection of this and the given other rectangle."""
        return intersection(self, other)

    def iou(self, other: Rectangle):
        """Calculates the intersection over union of this and the given other rectangle."""
        return iou(self, other)

    def includes(self, other: Rectangle, tol=3):
        """Checks if this rectangle contains the given other."""
        return contains(self, other, tol)

    def is_included(self, rectangles: Iterable[Rectangle]):
        """Checks if this rectangle is contained by any of the given rectangles."""
        return is_contained(self, rectangles)

    def adjacent(self, other: Rectangle, tolerance=7):
        """Checks if this rectangle is adjacent to the given other."""
        return adjacent(self, other, tolerance)
@ -1,286 +0,0 @@
|
||||
# See https://stackoverflow.com/a/39757388
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from operator import attrgetter
|
||||
from typing import TYPE_CHECKING, Iterable
|
||||
|
||||
from funcy import juxt, rpartial, compose, lflatten, first, second
|
||||
|
||||
from cv_analysis.utils import lift
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from cv_analysis.utils.rectangle import Rectangle
|
||||
|
||||
|
||||
def adjacent(alpha: Rectangle, beta: Rectangle, tolerance=7, strict=False):
    """Checks if the two rectangles are adjacent to each other.

    Args:
        alpha: The first rectangle.
        beta: The second rectangle.
        tolerance: The maximum distance between the two rectangles.
        strict: If True, the rectangles must be adjacent along one axis and
            contained within the other axis. Else, the rectangles must be
            adjacent along one axis and overlapping the other axis.

    Returns:
        True if the two rectangles are adjacent to each other, False otherwise.
    """
    # Each entry pairs the strict ("contained") and loose ("overlapping")
    # variant of one of the four possible relative positions:
    #
    #   alpha left of beta / alpha right of beta /
    #   alpha above beta  / alpha below beta
    strict_and_loose_variants = [
        (right_left_aligned_and_vertically_contained, right_left_aligned_and_vertically_overlapping),
        (left_right_aligned_and_vertically_contained, left_right_aligned_and_vertically_overlapping),
        (bottom_top_aligned_and_horizontally_contained, bottom_top_aligned_and_horizontally_overlapping),
        (top_bottom_aligned_and_horizontally_contained, top_bottom_aligned_and_horizontally_overlapping),
    ]

    variant_index = 0 if strict else 1
    return any(variants[variant_index](alpha, beta, tolerance) for variants in strict_and_loose_variants)
def right_left_aligned_and_vertically_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is left of the other within a tolerance and also overlaps the other's y range."""
    alpha_right_edge = alpha.x2
    beta_left_edge = beta.x1
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha_right_edge, beta_left_edge, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )
def left_right_aligned_and_vertically_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is right of the other within a tolerance and also overlaps the other's y range."""
    alpha_left_edge = alpha.x1
    beta_right_edge = beta.x2
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha_left_edge, beta_right_edge, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )
def bottom_top_aligned_and_horizontally_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is above the other within a tolerance and also overlaps the other's x range."""
    alpha_bottom_edge = alpha.y2
    beta_top_edge = beta.y1
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha_bottom_edge, beta_top_edge, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )
def top_bottom_aligned_and_horizontally_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is below the other within a tolerance and also overlaps the other's x range."""
    alpha_top_edge = alpha.y1
    beta_bottom_edge = beta.y2
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha_top_edge, beta_bottom_edge, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )
def right_left_aligned_and_vertically_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is left of the other within a tolerance and also contains the other's y range."""
    alpha_right_edge = alpha.x2
    beta_left_edge = beta.x1
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha_right_edge, beta_left_edge, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )
def left_right_aligned_and_vertically_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is right of the other within a tolerance and also contains the other's y range."""
    alpha_left_edge = alpha.x1
    beta_right_edge = beta.x2
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha_left_edge, beta_right_edge, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )
def bottom_top_aligned_and_horizontally_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is above the other within a tolerance and also contains the other's x range."""
    alpha_bottom_edge = alpha.y2
    beta_top_edge = beta.y1
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha_bottom_edge, beta_top_edge, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )
def top_bottom_aligned_and_horizontally_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is below the other within a tolerance and also contains the other's x range."""
    alpha_top_edge = alpha.y1
    beta_bottom_edge = beta.y2
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha_top_edge, beta_bottom_edge, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )
def adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
    axis_0_point_1,
    axis_1_point_2,
    axis_1_contained_point_1,
    axis_1_contained_point_2,
    axis_1_lower_bound,
    axis_1_upper_bound,
    tolerance,
):
    """Checks if two points are adjacent along one axis and two other points overlap a range along the perpendicular
    axis.

    Thin wrapper that fixes ``mode="overlapping"`` on the combined predicate.
    """
    return adjacent_along_one_axis_and_overlapping_or_contained_along_perpendicular_axis(
        axis_0_point_1=axis_0_point_1,
        axis_1_point_2=axis_1_point_2,
        axis_1_contained_point_1=axis_1_contained_point_1,
        axis_1_contained_point_2=axis_1_contained_point_2,
        axis_1_lower_bound=axis_1_lower_bound,
        axis_1_upper_bound=axis_1_upper_bound,
        tolerance=tolerance,
        mode="overlapping",
    )
def adjacent_along_one_axis_and_contained_within_perpendicular_axis(
    axis_0_point_1,
    axis_1_point_2,
    axis_1_contained_point_1,
    axis_1_contained_point_2,
    axis_1_lower_bound,
    axis_1_upper_bound,
    tolerance,
):
    """Checks if two points are adjacent along one axis and two other points are contained in a range along the
    perpendicular axis.

    Thin wrapper that fixes ``mode="contained"`` on the combined predicate.
    """
    return adjacent_along_one_axis_and_overlapping_or_contained_along_perpendicular_axis(
        axis_0_point_1=axis_0_point_1,
        axis_1_point_2=axis_1_point_2,
        axis_1_contained_point_1=axis_1_contained_point_1,
        axis_1_contained_point_2=axis_1_contained_point_2,
        axis_1_lower_bound=axis_1_lower_bound,
        axis_1_upper_bound=axis_1_upper_bound,
        tolerance=tolerance,
        mode="contained",
    )
def adjacent_along_one_axis_and_overlapping_or_contained_along_perpendicular_axis(
    axis_0_point_1,
    axis_1_point_2,
    axis_1_contained_point_1,
    axis_1_contained_point_2,
    axis_1_lower_bound,
    axis_1_upper_bound,
    tolerance,
    mode,
):
    """Checks if two points are adjacent along one axis and two other points overlap a range along the perpendicular
    axis or are contained in that range, depending on the mode specified.

    Args:
        axis_0_point_1: First coordinate along the adjacency axis.
        axis_1_point_2: Second coordinate along the adjacency axis.
        axis_1_contained_point_1: First coordinate tested against the range.
        axis_1_contained_point_2: Second coordinate tested against the range.
        axis_1_lower_bound: Lower bound of the range on the perpendicular axis.
        axis_1_upper_bound: Upper bound of the range on the perpendicular axis.
        tolerance: Maximum allowed distance between the two adjacency points.
        mode: Either "overlapping" (at least one point in range) or
            "contained" (both points in range).

    Returns:
        True if both the adjacency and the range condition hold.

    Raises:
        ValueError: If mode is not "overlapping" or "contained". (Previously
            enforced with ``assert``, which is stripped under ``python -O``.)
    """
    if mode not in ("overlapping", "contained"):
        raise ValueError(f"mode must be 'overlapping' or 'contained', got {mode!r}")

    quantifier = any if mode == "overlapping" else all

    points_are_aligned = abs(axis_0_point_1 - axis_1_point_2) <= tolerance
    points_in_range = quantifier(
        axis_1_lower_bound <= point <= axis_1_upper_bound
        for point in (axis_1_contained_point_1, axis_1_contained_point_2)
    )
    return points_are_aligned and points_in_range
def contains(alpha: Rectangle, beta: Rectangle, tol=3):
    """Checks if the first rectangle contains the second rectangle, allowing tol pixels of slack per edge."""
    fits_horizontally = beta.x1 + tol >= alpha.x1 and beta.x2 - tol <= alpha.x2
    fits_vertically = beta.y1 + tol >= alpha.y1 and beta.y2 - tol <= alpha.y2
    return fits_horizontally and fits_vertically
def is_contained(rectangle: Rectangle, rectangles: Iterable[Rectangle]):
    """Checks if the rectangle is contained within any of the other rectangles (itself excluded)."""
    return any(contains(other, rectangle) for other in rectangles if other != rectangle)
def iou(alpha: Rectangle, beta: Rectangle):
    """Calculates the intersection area over the union area of two rectangles."""
    intersection_area = intersection(alpha, beta)
    union_area = union(alpha, beta)
    return intersection_area / union_area
def area(rectangle: Rectangle):
    """Calculates the area of a rectangle, independent of corner order."""
    width = abs(rectangle.x2 - rectangle.x1)
    height = abs(rectangle.y2 - rectangle.y1)
    return width * height
def union(alpha: Rectangle, beta: Rectangle):
    """Calculates the union area of two rectangles via inclusion-exclusion."""
    combined_area = area(alpha) + area(beta)
    return combined_area - intersection(alpha, beta)
@lru_cache(maxsize=1000)
def intersection(alpha, beta):
    """Calculates the intersection area of two rectangles.

    Results are memoized; arguments must therefore be hashable (Rectangle is).
    """
    overlap_x = intersection_along_x_axis(alpha, beta)
    overlap_y = intersection_along_y_axis(alpha, beta)
    return overlap_x * overlap_y
def intersection_along_x_axis(alpha, beta):
    """Calculates the overlap of the two rectangles along the x-axis."""
    return intersection_along_axis(alpha, beta, axis="x")
def intersection_along_y_axis(alpha, beta):
    """Calculates the overlap of the two rectangles along the y-axis."""
    return intersection_along_axis(alpha, beta, axis="y")
def intersection_along_axis(alpha, beta, axis):
    """Calculates the overlap length of two rectangles along the given axis.

    The overlap is the distance between the larger of the two lower
    coordinates and the smaller of the two upper coordinates, clamped at 0:

          a          b
          [-----]    (---)   ==> max(0, a2 - b1) = 0
          [--(----]----)     ==> max(0, a2 - b1) = a2 - b1
          (-[---]----)       ==> max(0, a2 - a1) = a2 - a1

    Args:
        alpha: First rectangle.
        beta: Second rectangle.
        axis: Either "x" or "y".

    Returns:
        The non-negative overlap length along the axis.

    Raises:
        ValueError: If axis is not "x" or "y" (previously an ``assert``,
            which is stripped under ``python -O``).
    """
    if axis not in ("x", "y"):
        raise ValueError(f"axis must be 'x' or 'y', got {axis!r}")

    # Replaces the former funcy compose/lift/lflatten/juxt pipeline with
    # direct stdlib calls: sort the lower and the upper coordinates of both
    # rectangles, then measure the gap between the innermost pair.
    lower_coordinates = sorted(getattr(r, f"{axis}1") for r in (alpha, beta))
    upper_coordinates = sorted(getattr(r, f"{axis}2") for r in (alpha, beta))

    max_lower = lower_coordinates[-1]
    min_upper = upper_coordinates[0]
    return max(0, min_upper - max_lower)
def related(alpha: Rectangle, beta: Rectangle):
    """True when the rectangles are close to, or overlapping, each other."""
    if close(alpha, beta):
        return True
    return overlap(alpha, beta)
def close(alpha: Rectangle, beta: Rectangle, max_gap=14):
    """True when the rectangles are strictly adjacent within max_gap pixels."""
    # FIXME: Parameterize via factory
    return adjacent(alpha, beta, tolerance=max_gap, strict=True)
def overlap(alpha: Rectangle, beta: Rectangle):
    """True when the rectangles share a positive intersection area."""
    shared_area = intersection(alpha, beta)
    return shared_area > 0
@ -1,79 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from funcy import first, iterate, keep
|
||||
from numpy import generic
|
||||
|
||||
|
||||
def copy_and_normalize_channels(image):
    """Return a copied image with three channels; accepts PIL images or numpy arrays.

    Grayscale inputs are expanded to BGR; inputs that already have multiple
    channels make cv2 raise, which is deliberately swallowed so they pass
    through as a plain copy.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)

    copied = image.copy()
    try:
        return cv2.cvtColor(copied, cv2.COLOR_GRAY2BGR)
    except cv2.error:
        # Already multi-channel: keep the copy unchanged.
        return copied
def npconvert(ob):
    """Convert a numpy scalar to its native Python equivalent.

    Suitable as a ``default=`` hook for ``json.dump``.

    Raises:
        TypeError: If ob is not a numpy scalar.
    """
    if not isinstance(ob, generic):
        raise TypeError
    return ob.item()
def lift(fn):
    """Lift fn to operate lazily over a collection (a curried, lazy map)."""

    def lifted(coll):
        for item in coll:
            yield fn(item)

    return lifted
def star(fn):
    """Adapt fn so its positional arguments arrive packed in a single iterable."""

    def starred(args):
        # Unpack the packed arguments into the wrapped function.
        return fn(*args)

    return starred
def lstarkeep(fn, coll):
    """Eager variant of starkeep: returns a list instead of a generator."""
    return [*starkeep(fn, coll)]
def starkeep(fn, coll):
    """Apply fn to each unpacked item of coll, yielding only truthy results.

    Equivalent to the former ``keep(star(fn), coll)``, but implemented with a
    plain stdlib generator instead of the funcy helper and the sibling
    ``star`` wrapper.

    Args:
        fn: Callable taking the unpacked elements of each item.
        coll: Iterable of argument tuples (or other unpackable items).

    Yields:
        Each truthy result of ``fn(*item)``; falsy results are dropped.
    """
    for args in coll:
        result = fn(*args)
        if result:
            yield result
def until(cond, func, *args, **kwargs):
    """Repeatedly apply func starting from the given seed and return the first iterate satisfying cond."""
    iterates = iterate(func, *args, **kwargs)
    return first(value for value in iterates if cond(value))
def conj(x, xs):
    """Return a new list with x prepended to the elements of xs."""
    result = [x]
    result.extend(xs)
    return result
def rconj(xs, x):
    """Return a new list with x appended to the elements of xs."""
    result = list(xs)
    result.append(x)
    return result
def make_merger_sentinel():
    """Create a predicate that reports when a record collection stops growing.

    Each call compares ``len(records)`` against the count seen on the
    previous call and returns True once the count is unchanged between two
    consecutive calls, False otherwise.
    """
    last_count = [-1]  # boxed so the closure can update it

    def no_new_mergers(records):
        current_count = len(records)
        if current_count == last_count[0]:
            return True
        last_count[0] = current_count
        return False

    return no_new_mergers
||||
BIN
data/2017-1078223.pdf
Normal file
BIN
data/2017-1078223.pdf
Normal file
Binary file not shown.
BIN
data/2017-1078223.vlp_output.annotated.pdf
Normal file
BIN
data/2017-1078223.vlp_output.annotated.pdf
Normal file
Binary file not shown.
98825
data/2017-1078223.vlp_output.json
Normal file
98825
data/2017-1078223.vlp_output.json
Normal file
File diff suppressed because it is too large
Load Diff
BIN
data/table_inference_test_files.zip
Normal file
BIN
data/table_inference_test_files.zip
Normal file
Binary file not shown.
30
devenvsetup.sh
Normal file
30
devenvsetup.sh
Normal file
@ -0,0 +1,30 @@
|
||||
#!/bin/bash
# Bootstrap a local development environment: installs the requested Python
# via pyenv, installs poetry, wires up the private GitLab package registries,
# and creates the project virtualenv.
#
# Usage: devenvsetup.sh <python_version> <gitlab_user> <gitlab_personal_access_token>
#
# NOTE(review): `pyenv shell` and the final `source` only persist in the
# caller's shell if this script is sourced rather than executed — confirm
# the intended invocation.

# Positional arguments.
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3

# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created

# cd $latest_dir

# Install and select the requested interpreter for this directory/session.
pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version

pip install --upgrade pip
pip install poetry

# Allow parallel dependency installs.
poetry config installer.max-workers 10
# research package registry
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
# redactmanager package registry
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}

# Create the virtualenv with the pyenv interpreter and install all deps
# (including the dev group), then refresh the lock.
poetry env use $(pyenv which python)
poetry install --with=dev
poetry update

source .venv/bin/activate
||||
@ -28,4 +28,4 @@ services:
|
||||
volumes:
|
||||
- /opt/bitnami/rabbitmq/.rabbitmq/:/data/bitnami
|
||||
volumes:
|
||||
mdata:
|
||||
mdata:
|
||||
|
||||
4
docs/build/html/.buildinfo
vendored
Normal file
4
docs/build/html/.buildinfo
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
# Sphinx build info version 1
|
||||
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
|
||||
config: 04e9c6c5d3e412413c2949e598da60dc
|
||||
tags: 645f666f9bcd5a90fca523b33c5a78b7
|
||||
BIN
docs/build/html/.doctrees/README.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/README.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/environment.pickle
vendored
Normal file
BIN
docs/build/html/.doctrees/environment.pickle
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/index.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/index.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.config.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.config.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.figure_detection.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.figure_detection.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.figures.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.figures.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.text.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.figure_detection.text.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.layout_parsing.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.layout_parsing.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.locations.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.locations.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.redaction_detection.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.redaction_detection.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.server.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.server.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.server.pipeline.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.server.pipeline.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.table_inference.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.table_inference.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.table_parsing.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.table_parsing.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.annotate.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.annotate.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.banner.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.banner.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.connect_rects.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.connect_rects.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.display.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.display.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.draw.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.draw.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.filters.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.filters.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.image_extraction.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.image_extraction.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.open_pdf.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.open_pdf.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.postprocessing.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.postprocessing.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.preprocessing.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.preprocessing.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.structures.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.structures.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.test_metrics.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.test_metrics.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.utils.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.utils.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.visual_logging.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/cv_analysis.utils.visual_logging.doctree
vendored
Normal file
Binary file not shown.
BIN
docs/build/html/.doctrees/modules/serve.doctree
vendored
Normal file
BIN
docs/build/html/.doctrees/modules/serve.doctree
vendored
Normal file
Binary file not shown.
657
docs/build/html/README.html
vendored
Normal file
657
docs/build/html/README.html
vendored
Normal file
@ -0,0 +1,657 @@
|
||||
|
||||
<!DOCTYPE html>
|
||||
|
||||
|
||||
<html lang="en" data-content_root="./" >
|
||||
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
|
||||
<title>cv-analysis - Visual (CV-Based) Document Parsing — CV Analysis Service 2.5.2 documentation</title>
|
||||
|
||||
|
||||
|
||||
<script data-cfasync="false">
|
||||
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
|
||||
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
|
||||
</script>
|
||||
|
||||
<!-- Loaded before other Sphinx assets -->
|
||||
<link href="_static/styles/theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
|
||||
<link href="_static/styles/bootstrap.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
|
||||
<link href="_static/styles/pydata-sphinx-theme.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
|
||||
|
||||
|
||||
<link href="_static/vendor/fontawesome/6.5.1/css/all.min.css?digest=8d27b9dea8ad943066ae" rel="stylesheet" />
|
||||
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.1/webfonts/fa-solid-900.woff2" />
|
||||
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.1/webfonts/fa-brands-400.woff2" />
|
||||
<link rel="preload" as="font" type="font/woff2" crossorigin href="_static/vendor/fontawesome/6.5.1/webfonts/fa-regular-400.woff2" />
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=a746c00c" />
|
||||
<link rel="stylesheet" type="text/css" href="https://assets.readthedocs.org/static/css/badge_only.css" />
|
||||
|
||||
<!-- Pre-loaded scripts that we'll load fully later -->
|
||||
<link rel="preload" as="script" href="_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae" />
|
||||
<link rel="preload" as="script" href="_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae" />
|
||||
<script src="_static/vendor/fontawesome/6.5.1/js/all.min.js?digest=8d27b9dea8ad943066ae"></script>
|
||||
|
||||
<script src="_static/documentation_options.js?v=afc61bbc"></script>
|
||||
<script src="_static/doctools.js?v=9a2dae69"></script>
|
||||
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
|
||||
<script>DOCUMENTATION_OPTIONS.pagename = 'README';</script>
|
||||
<script async="async" src="https://assets.readthedocs.org/static/javascript/readthedocs-doc-embed.js"></script>
|
||||
<link rel="index" title="Index" href="genindex.html" />
|
||||
<link rel="search" title="Search" href="search.html" />
|
||||
<link rel="next" title="cv_analysis package" href="modules/cv_analysis.html" />
|
||||
<link rel="prev" title="Welcome to CV Analysis Service documentation!" href="index.html" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
|
||||
<!-- RTD Extra Head -->
|
||||
|
||||
<link rel="stylesheet" href="https://assets.readthedocs.org/static/css/readthedocs-doc-embed.css" type="text/css" />
|
||||
|
||||
<script type="application/json" id="READTHEDOCS_DATA">{"ad_free": "", "api_host": "", "builder": "sphinx", "canonical_url": "", "docroot": "", "features": {"docsearch_disabled": false}, "global_analytics_code": null, "language": "", "page": "README", "programming_language": "", "project": "", "source_suffix": ".md", "subprojects": {}, "theme": "", "user_analytics_code": null, "version": ""}</script>
|
||||
|
||||
<!--
|
||||
Using this variable directly instead of using `JSON.parse` is deprecated.
|
||||
The READTHEDOCS_DATA global variable will be removed in the future.
|
||||
-->
|
||||
<script type="text/javascript">
|
||||
READTHEDOCS_DATA = JSON.parse(document.getElementById('READTHEDOCS_DATA').innerHTML);
|
||||
</script>
|
||||
|
||||
<script type="text/javascript" src="https://assets.readthedocs.org/static/javascript/readthedocs-analytics.js" async="async"></script>
|
||||
|
||||
<!-- end RTD <extrahead> -->
|
||||
</head>
|
||||
|
||||
|
||||
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
|
||||
|
||||
|
||||
|
||||
<a id="pst-skip-link" class="skip-link" href="#main-content">Skip to main content</a>
|
||||
|
||||
<div id="pst-scroll-pixel-helper"></div>
|
||||
|
||||
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
|
||||
<i class="fa-solid fa-arrow-up"></i>
|
||||
Back to top
|
||||
</button>
|
||||
|
||||
|
||||
<input type="checkbox"
|
||||
class="sidebar-toggle"
|
||||
name="__primary"
|
||||
id="__primary"/>
|
||||
<label class="overlay overlay-primary" for="__primary"></label>
|
||||
|
||||
<input type="checkbox"
|
||||
class="sidebar-toggle"
|
||||
name="__secondary"
|
||||
id="__secondary"/>
|
||||
<label class="overlay overlay-secondary" for="__secondary"></label>
|
||||
|
||||
<div class="search-button__wrapper">
|
||||
<div class="search-button__overlay"></div>
|
||||
<div class="search-button__search-container">
|
||||
<form class="bd-search d-flex align-items-center"
|
||||
action="search.html"
|
||||
method="get">
|
||||
<i class="fa-solid fa-magnifying-glass"></i>
|
||||
<input type="search"
|
||||
class="form-control"
|
||||
name="q"
|
||||
id="search-input"
|
||||
placeholder="Search the docs ..."
|
||||
aria-label="Search the docs ..."
|
||||
autocomplete="off"
|
||||
autocorrect="off"
|
||||
autocapitalize="off"
|
||||
spellcheck="false"/>
|
||||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
|
||||
</form></div>
|
||||
</div>
|
||||
|
||||
<header class="bd-header navbar navbar-expand-lg bd-navbar">
|
||||
<div class="bd-header__inner bd-page-width">
|
||||
<label class="sidebar-toggle primary-toggle" for="__primary">
|
||||
<span class="fa-solid fa-bars"></span>
|
||||
</label>
|
||||
|
||||
|
||||
<div class="col-lg-3 navbar-header-items__start">
|
||||
|
||||
<div class="navbar-item">
|
||||
|
||||
|
||||
|
||||
<a class="navbar-brand logo" href="index.html">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<img src="_static/logo.png" class="logo__image only-light" alt="CV Analysis Service 2.5.2 documentation - Home"/>
|
||||
<script>document.write(`<img src="_static/logo.png" class="logo__image only-dark" alt="CV Analysis Service 2.5.2 documentation - Home"/>`);</script>
|
||||
|
||||
|
||||
</a></div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="col-lg-9 navbar-header-items">
|
||||
|
||||
<div class="me-auto navbar-header-items__center">
|
||||
|
||||
<div class="navbar-item">
|
||||
<nav class="navbar-nav">
|
||||
<ul class="bd-navbar-elements navbar-nav">
|
||||
|
||||
<li class="nav-item current active">
|
||||
<a class="nav-link nav-internal" href="#">
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="modules/cv_analysis.html">
|
||||
cv_analysis package
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="modules/serve.html">
|
||||
serve module
|
||||
</a>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<div class="navbar-header-items__end">
|
||||
|
||||
<div class="navbar-item navbar-persistent--container">
|
||||
|
||||
|
||||
<script>
|
||||
document.write(`
|
||||
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||||
<i class="fa-solid fa-magnifying-glass"></i>
|
||||
<span class="search-button__default-text">Search</span>
|
||||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
|
||||
</button>
|
||||
`);
|
||||
</script>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="navbar-item">
|
||||
|
||||
<script>
|
||||
document.write(`
|
||||
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||||
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
|
||||
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
|
||||
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
|
||||
</button>
|
||||
`);
|
||||
</script></div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
<div class="navbar-persistent--mobile">
|
||||
|
||||
<script>
|
||||
document.write(`
|
||||
<button class="btn navbar-btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||||
<i class="fa-solid fa-magnifying-glass"></i>
|
||||
<span class="search-button__default-text">Search</span>
|
||||
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
|
||||
</button>
|
||||
`);
|
||||
</script>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<label class="sidebar-toggle secondary-toggle" for="__secondary" tabindex="0">
|
||||
<span class="fa-solid fa-outdent"></span>
|
||||
</label>
|
||||
|
||||
</div>
|
||||
|
||||
</header>
|
||||
|
||||
|
||||
<div class="bd-container">
|
||||
<div class="bd-container__inner bd-page-width">
|
||||
|
||||
|
||||
|
||||
<div class="bd-sidebar-primary bd-sidebar">
|
||||
|
||||
|
||||
|
||||
<div class="sidebar-header-items sidebar-primary__section">
|
||||
|
||||
|
||||
<div class="sidebar-header-items__center">
|
||||
|
||||
<div class="navbar-item">
|
||||
<nav class="navbar-nav">
|
||||
<ul class="bd-navbar-elements navbar-nav">
|
||||
|
||||
<li class="nav-item current active">
|
||||
<a class="nav-link nav-internal" href="#">
|
||||
cv-analysis - Visual (CV-Based) Document Parsing
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="modules/cv_analysis.html">
|
||||
cv_analysis package
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
<li class="nav-item">
|
||||
<a class="nav-link nav-internal" href="modules/serve.html">
|
||||
serve module
|
||||
</a>
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="sidebar-header-items__end">
|
||||
|
||||
<div class="navbar-item">
|
||||
|
||||
<script>
|
||||
document.write(`
|
||||
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||||
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
|
||||
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
|
||||
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
|
||||
</button>
|
||||
`);
|
||||
</script></div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="sidebar-primary-items__start sidebar-primary__section">
|
||||
<div class="sidebar-primary-item">
|
||||
<nav class="bd-docs-nav bd-links"
|
||||
aria-label="Section Navigation">
|
||||
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
|
||||
<div class="bd-toc-item navbar-nav"></div>
|
||||
</nav></div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="sidebar-primary-items__end sidebar-primary__section">
|
||||
</div>
|
||||
|
||||
<div id="rtd-footer-container"></div>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
<main id="main-content" class="bd-main">
|
||||
|
||||
|
||||
<div class="bd-content">
|
||||
<div class="bd-article-container">
|
||||
|
||||
<div class="bd-header-article">
|
||||
<div class="header-article-items header-article__inner">
|
||||
|
||||
<div class="header-article-items__start">
|
||||
|
||||
<div class="header-article-item">
|
||||
|
||||
|
||||
|
||||
<nav aria-label="Breadcrumb">
|
||||
<ul class="bd-breadcrumbs">
|
||||
|
||||
<li class="breadcrumb-item breadcrumb-home">
|
||||
<a href="index.html" class="nav-link" aria-label="Home">
|
||||
<i class="fa-solid fa-home"></i>
|
||||
</a>
|
||||
</li>
|
||||
<li class="breadcrumb-item active" aria-current="page">cv-analysis...</li>
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
<div id="searchbox"></div>
|
||||
<article class="bd-article">
|
||||
|
||||
<section id="cv-analysis-visual-cv-based-document-parsing">
|
||||
<h1>cv-analysis - Visual (CV-Based) Document Parsing<a class="headerlink" href="#cv-analysis-visual-cv-based-document-parsing" title="Link to this heading">#</a></h1>
|
||||
<p>parse_pdf()
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.</p>
|
||||
<section id="api">
|
||||
<h2>API<a class="headerlink" href="#api" title="Link to this heading">#</a></h2>
|
||||
<p>Input message:</p>
|
||||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="nt">"targetFilePath"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="nt">"pdf"</span><span class="p">:</span><span class="w"> </span><span class="s2">"absolute file path"</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="nt">"vlp_output"</span><span class="p">:</span><span class="w"> </span><span class="s2">"absolute file path"</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="nt">"responseFilePath"</span><span class="p">:</span><span class="w"> </span><span class="s2">"absolute file path"</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="nt">"operation"</span><span class="p">:</span><span class="w"> </span><span class="s2">"table_image_inference"</span>
|
||||
<span class="p">}</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>Response is uploaded to the storage as specified in the <code class="docutils literal notranslate"><span class="pre">responseFilePath</span></code> field. The structure is as follows:</p>
|
||||
<div class="highlight-json notranslate"><div class="highlight"><pre><span></span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">...</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="nt">"data"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
|
||||
<span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">'pageNum'</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'bbox'</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">55.3407</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">247.0246</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">558.5602</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mf">598.0585</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="err">'uuid'</span><span class="p">:</span><span class="w"> </span><span class="err">'</span><span class="mi">2</span><span class="err">b</span><span class="mi">10</span><span class="err">c</span><span class="mi">1</span><span class="err">a</span><span class="mi">2-393</span><span class="err">c</span><span class="mi">-4</span><span class="kc">f</span><span class="err">ca</span><span class="mi">-</span><span class="err">b</span><span class="mf">9e3-0</span><span class="err">ad</span><span class="mi">5</span><span class="err">b</span><span class="mi">774</span><span class="err">ac</span><span class="mi">84</span><span class="err">'</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'label'</span><span class="p">:</span><span class="w"> </span><span class="err">'</span><span class="kc">ta</span><span class="err">ble'</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'</span><span class="kc">ta</span><span class="err">bleLi</span><span class="kc">nes</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="p">[</span>
|
||||
<span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">1</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'x</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">1399</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'y</span><span class="mi">2</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">16</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="err">...</span>
|
||||
<span class="w"> </span><span class="p">],</span>
|
||||
<span class="w"> </span><span class="err">'imageI</span><span class="kc">nf</span><span class="err">o'</span><span class="p">:</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="err">'heigh</span><span class="kc">t</span><span class="err">'</span><span class="p">:</span><span class="w"> </span><span class="mi">693</span><span class="p">,</span>
|
||||
<span class="w"> </span><span class="err">'wid</span><span class="kc">t</span><span class="err">h'</span><span class="p">:</span><span class="w"> </span><span class="mi">1414</span>
|
||||
<span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="p">},</span>
|
||||
<span class="w"> </span><span class="err">...</span>
|
||||
<span class="w"> </span><span class="p">]</span>
|
||||
<span class="p">}</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="installation">
|
||||
<h2>Installation<a class="headerlink" href="#installation" title="Link to this heading">#</a></h2>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>git<span class="w"> </span>clone<span class="w"> </span>ssh://git@git.iqser.com:2222/rr/cv-analysis.git
|
||||
<span class="nb">cd</span><span class="w"> </span>cv-analysis
|
||||
|
||||
python<span class="w"> </span>-m<span class="w"> </span>venv<span class="w"> </span>env
|
||||
<span class="nb">source</span><span class="w"> </span>env/bin/activate
|
||||
|
||||
pip<span class="w"> </span>install<span class="w"> </span>-e<span class="w"> </span>.
|
||||
pip<span class="w"> </span>install<span class="w"> </span>-r<span class="w"> </span>requirements.txt
|
||||
|
||||
dvc<span class="w"> </span>pull
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="usage">
|
||||
<h2>Usage<a class="headerlink" href="#usage" title="Link to this heading">#</a></h2>
|
||||
<section id="as-an-api">
|
||||
<h3>As an API<a class="headerlink" href="#as-an-api" title="Link to this heading">#</a></h3>
|
||||
<p>The module provided functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task.</p>
|
||||
<section id="redaction-detection-api">
|
||||
<h4>Redaction Detection (API)<a class="headerlink" href="#redaction-detection-api" title="Link to this heading">#</a></h4>
|
||||
<p>The below snippet shows hot to find the outlines of previous redactions.</p>
|
||||
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">cv_analysis.redaction_detection</span> <span class="kn">import</span> <span class="n">find_redactions</span>
|
||||
<span class="kn">import</span> <span class="nn">pdf2image</span>
|
||||
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
|
||||
|
||||
<span class="n">pdf_path</span> <span class="o">=</span> <span class="o">...</span>
|
||||
<span class="n">page_index</span> <span class="o">=</span> <span class="o">...</span>
|
||||
|
||||
<span class="n">page</span> <span class="o">=</span> <span class="n">pdf2image</span><span class="o">.</span><span class="n">convert_from_path</span><span class="p">(</span><span class="n">pdf_path</span><span class="p">,</span> <span class="n">first_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">,</span> <span class="n">last_page</span><span class="o">=</span><span class="n">page_index</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
|
||||
<span class="n">page</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
|
||||
|
||||
<span class="n">redaction_contours</span> <span class="o">=</span> <span class="n">find_redactions</span><span class="p">(</span><span class="n">page</span><span class="p">)</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
</section>
|
||||
</section>
|
||||
<section id="as-a-cli-tool">
|
||||
<h2>As a CLI Tool<a class="headerlink" href="#as-a-cli-tool" title="Link to this heading">#</a></h2>
|
||||
<p>Core API functionalities can be used through a CLI.</p>
|
||||
<section id="table-parsing">
|
||||
<h3>Table Parsing<a class="headerlink" href="#table-parsing" title="Link to this heading">#</a></h3>
|
||||
<p>The tables parsing utility detects and segments tables into individual cells.</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>table
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The below image shows a parsed table, where each table cell has been detected individually.</p>
|
||||
<p><img alt="Table Parsing Demonstration" src="_images/table_parsing.png" /></p>
|
||||
</section>
|
||||
<section id="redaction-detection-cli">
|
||||
<h3>Redaction Detection (CLI)<a class="headerlink" href="#redaction-detection-cli" title="Link to this heading">#</a></h3>
|
||||
<p>The redaction detection utility detects previous redactions in PDFs (filled black rectangles).</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">2</span><span class="w"> </span>--type<span class="w"> </span>redaction
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The below image shows the detected redactions with green outlines.</p>
|
||||
<p><img alt="Redaction Detection Demonstration" src="_images/redaction_detection.png" /></p>
|
||||
</section>
|
||||
<section id="layout-parsing">
|
||||
<h3>Layout Parsing<a class="headerlink" href="#layout-parsing" title="Link to this heading">#</a></h3>
|
||||
<p>The layout parsing utility detects elements such as paragraphs, tables and figures.</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">7</span><span class="w"> </span>--type<span class="w"> </span>layout
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The below image shows the detected layout elements on a page.</p>
|
||||
<p><img alt="Layout Parsing Demonstration" src="_images/layout_parsing.png" /></p>
|
||||
</section>
|
||||
<section id="figure-detection">
|
||||
<h3>Figure Detection<a class="headerlink" href="#figure-detection" title="Link to this heading">#</a></h3>
|
||||
<p>The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/annotate.py<span class="w"> </span>data/test_pdf.pdf<span class="w"> </span><span class="m">3</span><span class="w"> </span>--type<span class="w"> </span>figure
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>The below image shows the detected figure on a page.</p>
|
||||
<p><img alt="Figure Detection Demonstration" src="_images/figure_detection.png" /></p>
|
||||
</section>
|
||||
</section>
|
||||
<section id="running-as-a-service">
|
||||
<h2>Running as a service<a class="headerlink" href="#running-as-a-service" title="Link to this heading">#</a></h2>
|
||||
<section id="building">
|
||||
<h3>Building<a class="headerlink" href="#building" title="Link to this heading">#</a></h3>
|
||||
<p>Build base image</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>bash<span class="w"> </span>setup/docker.sh
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>Build head image</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>build<span class="w"> </span>-f<span class="w"> </span>Dockerfile<span class="w"> </span>-t<span class="w"> </span>cv-analysis<span class="w"> </span>.<span class="w"> </span>--build-arg<span class="w"> </span><span class="nv">BASE_ROOT</span><span class="o">=</span><span class="s2">""</span>
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="usage-service">
|
||||
<h3>Usage (service)<a class="headerlink" href="#usage-service" title="Link to this heading">#</a></h3>
|
||||
<p>Shell 1</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>docker<span class="w"> </span>run<span class="w"> </span>--rm<span class="w"> </span>--net<span class="o">=</span>host<span class="w"> </span>--rm<span class="w"> </span>cv-analysis
|
||||
</pre></div>
|
||||
</div>
|
||||
<p>Shell 2</p>
|
||||
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>scripts/client_mock.py<span class="w"> </span>--pdf_path<span class="w"> </span>/path/to/a/pdf
|
||||
</pre></div>
|
||||
</div>
|
||||
</section>
|
||||
</section>
|
||||
</section>
|
||||
|
||||
|
||||
</article>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<footer class="prev-next-footer">
|
||||
|
||||
<div class="prev-next-area">
|
||||
<a class="left-prev"
|
||||
href="index.html"
|
||||
title="previous page">
|
||||
<i class="fa-solid fa-angle-left"></i>
|
||||
<div class="prev-next-info">
|
||||
<p class="prev-next-subtitle">previous</p>
|
||||
<p class="prev-next-title">Welcome to CV Analysis Service documentation!</p>
|
||||
</div>
|
||||
</a>
|
||||
<a class="right-next"
|
||||
href="modules/cv_analysis.html"
|
||||
title="next page">
|
||||
<div class="prev-next-info">
|
||||
<p class="prev-next-subtitle">next</p>
|
||||
<p class="prev-next-title">cv_analysis package</p>
|
||||
</div>
|
||||
<i class="fa-solid fa-angle-right"></i>
|
||||
</a>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
|
||||
|
||||
|
||||
<div class="sidebar-secondary-item">
|
||||
<div
|
||||
id="pst-page-navigation-heading-2"
|
||||
class="page-toc tocsection onthispage">
|
||||
<i class="fa-solid fa-list"></i> On this page
|
||||
</div>
|
||||
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
|
||||
<ul class="visible nav section-nav flex-column">
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#api">API</a></li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#installation">Installation</a></li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#usage">Usage</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#as-an-api">As an API</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-api">Redaction Detection (API)</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#as-a-cli-tool">As a CLI Tool</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#table-parsing">Table Parsing</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#redaction-detection-cli">Redaction Detection (CLI)</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#layout-parsing">Layout Parsing</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#figure-detection">Figure Detection</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#running-as-a-service">Running as a service</a><ul class="nav section-nav flex-column">
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#building">Building</a></li>
|
||||
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#usage-service">Usage (service)</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</nav></div>
|
||||
|
||||
<div class="sidebar-secondary-item">
|
||||
|
||||
<div class="tocsection sourcelink">
|
||||
<a href="_sources/README.md.txt">
|
||||
<i class="fa-solid fa-file-lines"></i> Show Source
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div></div>
|
||||
|
||||
|
||||
</div>
|
||||
<footer class="bd-footer-content">
|
||||
|
||||
</footer>
|
||||
|
||||
</main>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Scripts loaded after <body> so the DOM is not blocked -->
|
||||
<script src="_static/scripts/bootstrap.js?digest=8d27b9dea8ad943066ae"></script>
|
||||
<script src="_static/scripts/pydata-sphinx-theme.js?digest=8d27b9dea8ad943066ae"></script>
|
||||
|
||||
<footer class="bd-footer">
|
||||
<div class="bd-footer__inner bd-page-width">
|
||||
|
||||
<div class="footer-items__start">
|
||||
|
||||
<div class="footer-item">
|
||||
|
||||
<p class="copyright">
|
||||
|
||||
© Copyright All rights reserved.
|
||||
<br/>
|
||||
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
|
||||
<p class="sphinx-version">
|
||||
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 7.3.7.
|
||||
<br/>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="footer-items__end">
|
||||
|
||||
<div class="footer-item">
|
||||
<p class="theme-version">
|
||||
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.15.2.
|
||||
</p></div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
||||
BIN
docs/build/html/_images/figure_detection.png
vendored
Normal file
BIN
docs/build/html/_images/figure_detection.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 707 KiB |
BIN
docs/build/html/_images/layout_parsing.png
vendored
Normal file
BIN
docs/build/html/_images/layout_parsing.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 568 KiB |
BIN
docs/build/html/_images/redaction_detection.png
vendored
Normal file
BIN
docs/build/html/_images/redaction_detection.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 3.2 MiB |
BIN
docs/build/html/_images/table_parsing.png
vendored
Normal file
BIN
docs/build/html/_images/table_parsing.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 566 KiB |
178
docs/build/html/_sources/README.md.txt
vendored
Normal file
178
docs/build/html/_sources/README.md.txt
vendored
Normal file
@ -0,0 +1,178 @@
|
||||
# cv-analysis - Visual (CV-Based) Document Parsing
|
||||
|
||||
parse_pdf()
|
||||
This repository implements computer vision based approaches for detecting and parsing visual features such as tables or
|
||||
previous redactions in documents.
|
||||
|
||||
## API
|
||||
|
||||
Input message:
|
||||
|
||||
```json
|
||||
{
|
||||
"targetFilePath": {
|
||||
"pdf": "absolute file path",
|
||||
"vlp_output": "absolute file path"
|
||||
},
|
||||
"responseFilePath": "absolute file path",
|
||||
"operation": "table_image_inference"
|
||||
}
|
||||
```
|
||||
|
||||
Response is uploaded to the storage as specified in the `responseFilePath` field. The structure is as follows:
|
||||
|
||||
```json
|
||||
{
|
||||
...,
|
||||
"data": [
|
||||
{
|
||||
'pageNum': 0,
|
||||
'bbox': {
|
||||
'x1': 55.3407,
|
||||
'y1': 247.0246,
|
||||
'x2': 558.5602,
|
||||
'y2': 598.0585
|
||||
},
|
||||
'uuid': '2b10c1a2-393c-4fca-b9e3-0ad5b774ac84',
|
||||
'label': 'table',
|
||||
'tableLines': [
|
||||
{
|
||||
'x1': 0,
|
||||
'y1': 16,
|
||||
'x2': 1399,
|
||||
'y2': 16
|
||||
},
|
||||
...
|
||||
],
|
||||
'imageInfo': {
|
||||
'height': 693,
|
||||
'width': 1414
|
||||
}
|
||||
},
|
||||
...
|
||||
]
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
|
||||
cd cv-analysis
|
||||
|
||||
python -m venv env
|
||||
source env/bin/activate
|
||||
|
||||
pip install -e .
|
||||
pip install -r requirements.txt
|
||||
|
||||
dvc pull
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### As an API
|
||||
|
||||
The module provides functions for the individual tasks that all return some kind of collection of points, depending on
|
||||
the specific task.
|
||||
|
||||
#### Redaction Detection (API)
|
||||
|
||||
The below snippet shows how to find the outlines of previous redactions.
|
||||
|
||||
```python
|
||||
from cv_analysis.redaction_detection import find_redactions
|
||||
import pdf2image
|
||||
import numpy as np
|
||||
|
||||
pdf_path = ...
|
||||
page_index = ...
|
||||
|
||||
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
|
||||
page = np.array(page)
|
||||
|
||||
redaction_contours = find_redactions(page)
|
||||
```
|
||||
|
||||
## As a CLI Tool
|
||||
|
||||
Core API functionalities can be used through a CLI.
|
||||
|
||||
### Table Parsing
|
||||
|
||||
The table parsing utility detects and segments tables into individual cells.
|
||||
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 7 --type table
|
||||
```
|
||||
|
||||
The below image shows a parsed table, where each table cell has been detected individually.
|
||||
|
||||

|
||||
|
||||
### Redaction Detection (CLI)
|
||||
|
||||
The redaction detection utility detects previous redactions in PDFs (filled black rectangles).
|
||||
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
|
||||
```
|
||||
|
||||
The below image shows the detected redactions with green outlines.
|
||||
|
||||

|
||||
|
||||
### Layout Parsing
|
||||
|
||||
The layout parsing utility detects elements such as paragraphs, tables and figures.
|
||||
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
|
||||
```
|
||||
|
||||
The below image shows the detected layout elements on a page.
|
||||
|
||||

|
||||
|
||||
### Figure Detection
|
||||
|
||||
The figure detection utility detects figures specifically, which can be missed by the generic layout parsing utility.
|
||||
|
||||
```bash
|
||||
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
|
||||
```
|
||||
|
||||
The below image shows the detected figure on a page.
|
||||
|
||||

|
||||
|
||||
## Running as a service
|
||||
|
||||
### Building
|
||||
|
||||
Build base image
|
||||
|
||||
```bash
|
||||
bash setup/docker.sh
|
||||
```
|
||||
|
||||
Build head image
|
||||
|
||||
```bash
|
||||
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
|
||||
```
|
||||
|
||||
### Usage (service)
|
||||
|
||||
Shell 1
|
||||
|
||||
```bash
|
||||
docker run --rm --net=host cv-analysis
|
||||
```
|
||||
|
||||
Shell 2
|
||||
|
||||
```bash
|
||||
python scripts/client_mock.py --pdf_path /path/to/a/pdf
|
||||
```
|
||||
37
docs/build/html/_sources/index.rst.txt
vendored
Normal file
37
docs/build/html/_sources/index.rst.txt
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
.. Keyword Extraction Service documentation master file, created by
|
||||
sphinx-quickstart on Mon Sep 12 12:04:24 2022.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
=============================================
|
||||
Welcome to CV Analysis Service documentation!
|
||||
=============================================
|
||||
|
||||
.. note::
|
||||
|
||||
If you'd like to change the looks of things 👉 https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html
|
||||
|
||||
|
||||
Table of Contents
|
||||
-----------------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
:caption: README
|
||||
|
||||
README.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
:caption: Modules
|
||||
|
||||
modules/cv_analysis
|
||||
modules/serve
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
||||
7
docs/build/html/_sources/modules/cv_analysis.config.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.config.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.config module
|
||||
==========================
|
||||
|
||||
.. automodule:: cv_analysis.config
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.figure_detection.figure_detection.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.figure_detection.figure_detection.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.figure\_detection.figure\_detection module
|
||||
=======================================================
|
||||
|
||||
.. automodule:: cv_analysis.figure_detection.figure_detection
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.figure_detection.figures.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.figure_detection.figures.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.figure\_detection.figures module
|
||||
=============================================
|
||||
|
||||
.. automodule:: cv_analysis.figure_detection.figures
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
17
docs/build/html/_sources/modules/cv_analysis.figure_detection.rst.txt
vendored
Normal file
17
docs/build/html/_sources/modules/cv_analysis.figure_detection.rst.txt
vendored
Normal file
@ -0,0 +1,17 @@
|
||||
cv\_analysis.figure\_detection package
|
||||
======================================
|
||||
|
||||
.. automodule:: cv_analysis.figure_detection
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
cv_analysis.figure_detection.figure_detection
|
||||
cv_analysis.figure_detection.figures
|
||||
cv_analysis.figure_detection.text
|
||||
7
docs/build/html/_sources/modules/cv_analysis.figure_detection.text.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.figure_detection.text.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.figure\_detection.text module
|
||||
==========================================
|
||||
|
||||
.. automodule:: cv_analysis.figure_detection.text
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.layout_parsing.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.layout_parsing.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.layout\_parsing module
|
||||
===================================
|
||||
|
||||
.. automodule:: cv_analysis.layout_parsing
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.locations.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.locations.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.locations module
|
||||
=============================
|
||||
|
||||
.. automodule:: cv_analysis.locations
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.redaction_detection.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.redaction_detection.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.redaction\_detection module
|
||||
========================================
|
||||
|
||||
.. automodule:: cv_analysis.redaction_detection
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
30
docs/build/html/_sources/modules/cv_analysis.rst.txt
vendored
Normal file
30
docs/build/html/_sources/modules/cv_analysis.rst.txt
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
cv\_analysis package
|
||||
====================
|
||||
|
||||
.. automodule:: cv_analysis
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Subpackages
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
cv_analysis.figure_detection
|
||||
cv_analysis.server
|
||||
cv_analysis.utils
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
cv_analysis.config
|
||||
cv_analysis.layout_parsing
|
||||
cv_analysis.locations
|
||||
cv_analysis.redaction_detection
|
||||
cv_analysis.table_inference
|
||||
cv_analysis.table_parsing
|
||||
7
docs/build/html/_sources/modules/cv_analysis.server.pipeline.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.server.pipeline.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.server.pipeline module
|
||||
===================================
|
||||
|
||||
.. automodule:: cv_analysis.server.pipeline
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
15
docs/build/html/_sources/modules/cv_analysis.server.rst.txt
vendored
Normal file
15
docs/build/html/_sources/modules/cv_analysis.server.rst.txt
vendored
Normal file
@ -0,0 +1,15 @@
|
||||
cv\_analysis.server package
|
||||
===========================
|
||||
|
||||
.. automodule:: cv_analysis.server
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
|
||||
Submodules
|
||||
----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
cv_analysis.server.pipeline
|
||||
7
docs/build/html/_sources/modules/cv_analysis.table_inference.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.table_inference.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.table\_inference module
|
||||
====================================
|
||||
|
||||
.. automodule:: cv_analysis.table_inference
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.table_parsing.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.table_parsing.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.table\_parsing module
|
||||
==================================
|
||||
|
||||
.. automodule:: cv_analysis.table_parsing
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.utils.annotate.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.utils.annotate.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.utils.annotate module
|
||||
==================================
|
||||
|
||||
.. automodule:: cv_analysis.utils.annotate
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.utils.banner.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.utils.banner.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.utils.banner module
|
||||
================================
|
||||
|
||||
.. automodule:: cv_analysis.utils.banner
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.utils.connect_rects.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.utils.connect_rects.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.utils.connect\_rects module
|
||||
========================================
|
||||
|
||||
.. automodule:: cv_analysis.utils.connect_rects
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.utils.display.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.utils.display.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.utils.display module
|
||||
=================================
|
||||
|
||||
.. automodule:: cv_analysis.utils.display
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.utils.draw.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.utils.draw.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.utils.draw module
|
||||
==============================
|
||||
|
||||
.. automodule:: cv_analysis.utils.draw
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
7
docs/build/html/_sources/modules/cv_analysis.utils.filters.rst.txt
vendored
Normal file
7
docs/build/html/_sources/modules/cv_analysis.utils.filters.rst.txt
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
cv\_analysis.utils.filters module
|
||||
=================================
|
||||
|
||||
.. automodule:: cv_analysis.utils.filters
|
||||
:members:
|
||||
:undoc-members:
|
||||
:show-inheritance:
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user